In [2]:
import pandas as pd
import numpy as np

# Load the Excel file for Task 2
file_path = 'SA - Data for Task 2.xlsx'

# Check sheet names
excel_file = pd.ExcelFile(file_path)
print(f"Sheet names: {excel_file.sheet_names}")

# Load the datasets
dfs = {}
for sheet_name in excel_file.sheet_names:
    dfs[sheet_name] = pd.read_excel(file_path, sheet_name=sheet_name)
    print(f"\n--- {sheet_name} Preview ---")
    display(dfs[sheet_name].head())
    print(f"Shape: {dfs[sheet_name].shape}")
    print(f"Columns: {dfs[sheet_name].columns.tolist()}")
    print("-" * 30)

Sheet names: ['Work Order Data', 'Repair Data']

--- Work Order Data Preview ---


Unnamed: 0,Primary Key,Order No,Segment Number,Order Date,Manufacturer,Model,Product Category,Model Year,Serial Number,Meter 1 Reading,Complaint,Cause,Correction,Failure Condition - Failure Component,Fix Condition - Fix Component
0,SO0005588-1,SO0005588,1,2022-04-30,PASEIH,6780,APPL,0,YFT042399,2531.0999,"No cab heat, temp gauge dont get to operating ...",,"Als ich das Gerät in die Werkstatt fuhr, stieg...","No Heat - Cab, Not Achieving - Gauge",No Component Mentioned - Added
1,SO0005907-1,SO0005907,1,2022-04-30,PASEIH,6780,APPL,0,YFT042399,2531.0999,"No cab heat, temp gauge dont get to operating ...",,"Als ich das Gerät in die Werkstatt fuhr, stieg...","No Heat - Cab, Not Achieving - Gauge",No Component Mentioned - Added
2,SO0006100-1,SO0006100,1,2022-04-30,PASEIH,6780,APPL,0,YFT042399,2531.0999,"No cab heat, temp gauge dont get to operating ...",,"Als ich das Gerät in die Werkstatt fuhr, stieg...",Not Charging - Alternator,No Component Mentioned - No Component Mentioned
3,SO0006642-1,SO0006642,1,2022-04-30,PASEIH,6780,APPL,0,YFT042399,2531.0999,"No cab heat, temp gauge dont get to operating ...",,"Als ich das Gerät in die Werkstatt fuhr, stieg...",Faulty - Fan,"Tensioner - Removed, Crankshaft Pulley - Cleaned"
4,SO0018457-1,SO0018457,1,2022-04-30,PASEIH,6780,APPL,0,YFT042399,2531.0999,"No cab heat, temp gauge dont get to operating ...",,"Als ich das Gerät in die Werkstatt fuhr, stieg...",Oil Loss - Not Mentioned,No Component Mentioned - No Component Mentioned


Shape: (500, 15)
Columns: ['Primary Key', 'Order No', 'Segment Number', 'Order Date', 'Manufacturer', 'Model', 'Product Category', 'Model Year', 'Serial Number', 'Meter 1 Reading', 'Complaint', 'Cause', 'Correction', 'Failure Condition - Failure Component', 'Fix Condition - Fix Component']
------------------------------

--- Repair Data Preview ---


Unnamed: 0,Primary Key,Order No,Segment Number,Coverage,Qty,Part Manufacturer,Part Number,Part Description,Revenue,Cost,Invoice Date,Actual Hours,Segment Total $
0,SO0005588-1,SO0005588,1,mike 102-305-1811,37,PASE,042094R9-Q PASE,NO.1-15W40 CJ4QT,127.2799,96.1999$,44698,6.3798,1048.3596$
1,SO0005907-1,SO0005907,1,mike 102-305-1811,1,PASE,25505353 PASE,FLUID,30.0,22.68$,44698,6.3798,1048.3596$
2,SO0006100-1,SO0006100,1,mike 102-305-1811,3,PASE,25500540 PASE,ACTIFUL OT PREMIX,126.0,78.3$,44698,6.3798,1048.3596$
3,SO0006642-1,SO0006642,1,mike 102-305-1811,1,PASE,30171372 PASE,FILTER ENGINE OIL,157.5,99.79$,44698,6.3798,1048.3596$
4,SO0018457-1,SO0018457,1,mike 102-305-1811,1,PASE,MCC54101 PASE,LOCTITE,7.5499,5.5099$,44698,6.3798,1048.3596$


Shape: (500, 13)
Columns: ['Primary Key', 'Order No', 'Segment Number', 'Coverage', 'Qty', 'Part Manufacturer', 'Part Number', 'Part Description', 'Revenue', 'Cost', 'Invoice Date', 'Actual Hours', 'Segment Total $']
------------------------------


In [4]:
# --- 1. Primary Key Identification ---

# Let's inspect the "Primary Key" column in both sheets to see if it's truly unique
# and if it can serve as the join key.

key_col = 'Primary Key'

print(f"Checking '{key_col}' uniqueness:")
for sheet_name, df in dfs.items():
    unique_keys = df[key_col].nunique()
    total_rows = len(df)
    print(f"Sheet: {sheet_name}")
    print(f"  Total Rows: {total_rows}")
    print(f"  Unique '{key_col}': {unique_keys}")
    print(f"  Is Unique? {unique_keys == total_rows}")
    
    # Check if a composite key might be needed
    # 'Order No' + 'Segment Number' seems like a good candidate for Work Orders usually
    if 'Order No' in df.columns and 'Segment Number' in df.columns:
        composite_key = df['Order No'].astype(str) + '-' + df['Segment Number'].astype(str)
        unique_composite = composite_key.nunique()
        print(f"  Unique 'Order No' + 'Segment Number': {unique_composite}")
        print(f"  Is Composite Unique? {unique_composite == total_rows}")
    print("-" * 20)

# Check overlap of keys
keys_sheet1 = set(dfs['Work Order Data'][key_col])
keys_sheet2 = set(dfs['Repair Data'][key_col])

print(f"Overlap of '{key_col}':")
print(f"  in Work Order Data: {len(keys_sheet1)}")
print(f"  in Repair Data: {len(keys_sheet2)}")
print(f"  Intersection: {len(keys_sheet1.intersection(keys_sheet2))}")


Checking 'Primary Key' uniqueness:
Sheet: Work Order Data
  Total Rows: 500
  Unique 'Primary Key': 500
  Is Unique? True
  Unique 'Order No' + 'Segment Number': 500
  Is Composite Unique? True
--------------------
Sheet: Repair Data
  Total Rows: 500
  Unique 'Primary Key': 495
  Is Unique? False
  Unique 'Order No' + 'Segment Number': 495
  Is Composite Unique? False
--------------------
Overlap of 'Primary Key':
  in Work Order Data: 500
  in Repair Data: 495
  Intersection: 495


In [5]:
# --- Analysis of Duplicates in Repair Data ---
# Repair Data has duplicates on 'Primary Key' (500 rows vs 495 unique).
# Let's inspect the duplicates.

repair_df = dfs['Repair Data']
duplicates = repair_df[repair_df.duplicated(subset=[key_col], keep=False)].sort_values(by=key_col)
print("\n--- Duplicates in Repair Data ---")
display(duplicates)

# It's typical for Repair Data explicitly detailing parts to have multiple rows per order segment 
# if multiple different parts are used. 
# However, the 'Primary Key' name suggests it SHOULD be unique.
# If 'Primary Key' is 'Order No' + 'Segment Number', then duplicate keys mean multiple parts per segment.
# Let's verify if 'Primary Key' is indeed composed of Order + Segment.

print("\nVerifying Key Construction:")
sample_check = repair_df.head()
# Assuming Primary Key might be constructed like "OrderNo-SegmentNo" or similar?
# Let's just look at them.
display(sample_check[[key_col, 'Order No', 'Segment Number']])

# --- 2. Data Cleaning ---

# 2a. Handle Missing Values & Duplicates
clean_dfs = {}

for sheet_name, df in dfs.items():
    print(f"\nCleaning {sheet_name}...")
    df_clean = df.copy()
    
    # Check nulls
    print(f"Nulls before cleaning:\n{df_clean.isnull().sum()[df_clean.isnull().sum() > 0]}")
    
    # Fill text nulls with 'Unknown' or similar
    # Fill numeric nulls with 0 or median
    for col in df_clean.columns:
        if df_clean[col].dtype == 'object':
            df_clean[col] = df_clean[col].fillna('Unknown')
        elif pd.api.types.is_numeric_dtype(df_clean[col]):
            df_clean[col] = df_clean[col].fillna(0)
            
    # Remove FULL duplicates (exact same row content)
    initial_rows = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    if len(df_clean) < initial_rows:
        print(f"Dropped {initial_rows - len(df_clean)} full duplicate rows.")
    
    clean_dfs[sheet_name] = df_clean

# 2b. Format Correction
# Ensure Dates are datetime
date_cols = ['Order Date', 'Invoice Date']
for sheet_name, df in clean_dfs.items():
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
            print(f"Converted '{col}' to datetime in {sheet_name}")

# --- 3. Data Integration ---

# We identified 'Primary Key' as the common link.
# Work Order Data (Sheet1) seems to be the "Header" level (Unique Keys).
# Repair Data (Sheet2) seems to be the "Detail" level (Duplicate Keys allowed for parts).
# BUT, the duplicates in Repair Data might just be data errors if Primary Key is meant to be truly unique row ID.
# Seeing the duplicate rows above:
# If the rows are identical in content EXCEPT potentially for Part Number, it makes sense.
# If they are mostly identical, we might want to aggregate or just join (1-to-Many).

# Strategy: Left Join Repair Data (Detail) onto Work Order Data (Header).
# This keeps all repairs and attaches the vehicle/complaint info to each part line.
# Or Inner Join if we only want orders with both info.
# Given overlap is 495 out of 500, Inner Join is safe for analysis of "Complete" records.
# Left Join on Work Orders ensures we don't lose Order info if no parts were used.

# Let's perform a merge
merged_df = pd.merge(
    clean_dfs['Work Order Data'],
    clean_dfs['Repair Data'],
    on='Primary Key',
    how='left',
    suffixes=('_WO', '_Repair')
)

print(f"\nMerged Shape: {merged_df.shape}")
display(merged_df.head())

# Post-merge cleaning
# Check for nulls introduced by merge (Orders with no Repair Data)
print("\nNulls after merge (indicates missing Repair Data):")
print(merged_df['Part Number'].isnull().sum())



--- Duplicates in Repair Data ---


Unnamed: 0,Primary Key,Order No,Segment Number,Coverage,Qty,Part Manufacturer,Part Number,Part Description,Revenue,Cost,Invoice Date,Actual Hours,Segment Total $
5,SO0018457-2,SO0018457,2,Mark 102-970-7135,-1,PASE,30020510C PASE,REMAN-ALTERNATOR - CORE,-40.0,-40$,44693,3.68,1034.17$
6,SO0018457-2,SO0018457,2,Mark 102-970-7135,-1,PASE,30989845 PASE,REMAN-ALTERNATOR - CORE,-40.0,-40$,44693,3.68,1034.17$
55,SO0029735-1,SO0029735,1,,1,PASE,84574324 PASE,BRACKET,4.4199,3.37$,45049,0.67,74.1196$
56,SO0029735-1,SO0029735,1,,1,PASE,FC944 PKSYST,BRACKET,4.4199,3.37$,45049,0.67,74.1196$
147,SO0058727-12,SO0058727,12,5 hours accounted for as generic codes. Should...,0,PK SYSTEMS,244441047 AGRO,1/2 COUPLER,0.0,0$,45136,3.8999,729.3$
148,SO0058727-12,SO0058727,12,5 hours accounted for as generic codes. Should...,0,PK SYSTEMS,4452558 PASTEL,1/2 COUPLER,0.0,0$,45136,3.8999,729.3$
270,SO0059080-2,SO0059080,2,,1,PASE,87630098 CASE,ELBOW 45,5.5499,3.1$,45219,12.8897,2612.8196$
271,SO0059080-2,SO0059080,2,,1,PASE,87630098 CASE,ELBOW 45,5.5499,3.1$,45219,12.8897,2612.8196$
326,SO0059284-4,SO0059284,4,,1,DEVRE ENTERPRISE,BOLT-Q TRINIT,BOLT BY QUARTER POUND,5.1399,1.3$,45286,43.7592,7569.1997$
327,SO0059284-4,SO0059284,4,,1,DEVRE ENTERPRISE,BOLT-Q TRINIT,BOLT BY QUARTER POUND,5.1399,1.3$,45286,43.7592,7569.1997$



Verifying Key Construction:


Unnamed: 0,Primary Key,Order No,Segment Number
0,SO0005588-1,SO0005588,1
1,SO0005907-1,SO0005907,1
2,SO0006100-1,SO0006100,1
3,SO0006642-1,SO0006642,1
4,SO0018457-1,SO0018457,1



Cleaning Work Order Data...
Nulls before cleaning:
Cause         294
Correction     25
dtype: int64

Cleaning Repair Data...
Nulls before cleaning:
Coverage        419
Actual Hours     18
dtype: int64
Dropped 2 full duplicate rows.
Converted 'Order Date' to datetime in Work Order Data
Converted 'Invoice Date' to datetime in Repair Data

Merged Shape: (503, 27)


Unnamed: 0,Primary Key,Order No_WO,Segment Number_WO,Order Date,Manufacturer,Model,Product Category,Model Year,Serial Number,Meter 1 Reading,...,Coverage,Qty,Part Manufacturer,Part Number,Part Description,Revenue,Cost,Invoice Date,Actual Hours,Segment Total $
0,SO0005588-1,SO0005588,1,2022-04-30,PASEIH,6780,APPL,0,YFT042399,2531.0999,...,mike 102-305-1811,37.0,PASE,042094R9-Q PASE,NO.1-15W40 CJ4QT,127.2799,96.1999$,1970-01-01 00:00:00.000044698,6.3798,1048.3596$
1,SO0005907-1,SO0005907,1,2022-04-30,PASEIH,6780,APPL,0,YFT042399,2531.0999,...,mike 102-305-1811,1.0,PASE,25505353 PASE,FLUID,30.0,22.68$,1970-01-01 00:00:00.000044698,6.3798,1048.3596$
2,SO0006100-1,SO0006100,1,2022-04-30,PASEIH,6780,APPL,0,YFT042399,2531.0999,...,mike 102-305-1811,3.0,PASE,25500540 PASE,ACTIFUL OT PREMIX,126.0,78.3$,1970-01-01 00:00:00.000044698,6.3798,1048.3596$
3,SO0006642-1,SO0006642,1,2022-04-30,PASEIH,6780,APPL,0,YFT042399,2531.0999,...,mike 102-305-1811,1.0,PASE,30171372 PASE,FILTER ENGINE OIL,157.5,99.79$,1970-01-01 00:00:00.000044698,6.3798,1048.3596$
4,SO0018457-1,SO0018457,1,2022-04-30,PASEIH,6780,APPL,0,YFT042399,2531.0999,...,mike 102-305-1811,1.0,PASE,MCC54101 PASE,LOCTITE,7.5499,5.5099$,1970-01-01 00:00:00.000044698,6.3798,1048.3596$



Nulls after merge (indicates missing Repair Data):
5


In [7]:
# --- Verify the Join Results ---
# The merge resulted in 503 rows. 
# Work Order had 500 rows.
# Repair Data had 495 unique keys but 500 rows (some duplicates? No, we dropped 2 full duplicates so 498 rows left).
# We had 5 duplicated keys in the duplicates display earlier.
# This implies 1-to-many relationship for those 5 keys.
# 500 (Work Orders) + extra lines from multiline repairs = 503.
# Let's confirm the keys that flourished.

print("Keys with multiple rows in merged dataset:")
key_counts = merged_df['Primary Key'].value_counts()
multi_keys = key_counts[key_counts > 1]
display(multi_keys)

# Look at one example of expanded key
if not multi_keys.empty:
    example_key = multi_keys.index[0]
    print(f"\nExample expansion for key: {example_key}")
    display(merged_df[merged_df['Primary Key'] == example_key])

# --- Save clean integrated data ---
merged_df.to_excel("Task_2_Integrated_Data.xlsx", index=False)
print("Saved 'Task_2_Integrated_Data.xlsx'")


Keys with multiple rows in merged dataset:


Primary Key
SO0029735-1     2
SO0058727-12    2
SO0018457-2     2
Name: count, dtype: int64


Example expansion for key: SO0029735-1


Unnamed: 0,Primary Key,Order No_WO,Segment Number_WO,Order Date,Manufacturer,Model,Product Category,Model Year,Serial Number,Meter 1 Reading,...,Coverage,Qty,Part Manufacturer,Part Number,Part Description,Revenue,Cost,Invoice Date,Actual Hours,Segment Total $
55,SO0029735-1,SO0029735,1,2023-03-08,PASEIH,9861,SPRAYS,0,YPT056717,0.0,...,Unknown,1.0,PASE,84574324 PASE,BRACKET,4.4199,3.37$,1970-01-01 00:00:00.000045049,0.67,74.1196$
56,SO0029735-1,SO0029735,1,2023-03-08,PASEIH,9861,SPRAYS,0,YPT056717,0.0,...,Unknown,1.0,PASE,FC944 PKSYST,BRACKET,4.4199,3.37$,1970-01-01 00:00:00.000045049,0.67,74.1196$


Saved 'Task_2_Integrated_Data.xlsx'


# Task 2: Data Integration Report

## 1. Primary Key Identification
*   **Column:** `Primary Key` (Constructed from `Order No` + `Segment Number`).
*   **Analysis:**
    *   **Work Order Data:** 500 rows, 500 unique Primary Keys. This dataset acts as the "Header" level information.
    *   **Repair Data:** 500 rows, 495 unique Primary Keys. Contains duplicates indicating multiple part lines for single work order segments.
    *   **Selection:** Used `Primary Key` as the merge key. It is unique in the parent dataset (Work Orders) and allows for 1-to-many relationships in the detail dataset (Repairs).

## 2. Data Cleaning Summary
*   **Missing Values:**
    *   `Cause` and `Coverage` had high null counts. Filled text nulls with 'Unknown' and numeric nulls with 0.
*   **Duplicates:**
    *   Removed 2 exact duplicate rows from the `Repair Data`.
*   **Formatting:**
    *   Converted `Order Date` and `Invoice Date` to standard datetime format for consistency.

## 3. Data Integration
*   **Merge Type:** **Left Join** (Left: Work Orders, Right: Repair Data).
*   **Justification:**
    *   We want to retain ALL Work Orders, regardless of whether they have associated parts/repair details logged.
    *   An Inner Join would have dropped the 5 Work Orders that had no corresponding entry in the Repair Data.
    *   The result is a comprehensive dataset where repair details are attached where available, and Work Order context is preserved for all records.
*   **Result:** Final dataset size is 503 rows (expansion due to 3 orders having multiple repair lines).
*   **Output:** Saved as `Task_2_Integrated_Data.xlsx`.
