In [1]:
import pandas as pd
import re
import os  

In [2]:
# Normalise an occupation label so the same title compares equal across files
def standardize_occupation_name(name: str) -> str:
    """Return ``name`` trimmed, lower-cased, with ``-`` and ``_`` replaced by spaces.

    Values that ``pd.isna`` reports as missing are handed back unchanged.
    """
    if not pd.isna(name):
        cleaned = name.strip().lower()
        for separator in ("-", "_"):
            cleaned = cleaned.replace(separator, " ")
        return cleaned
    return name

In [3]:
# Yearly merged income workbooks, grouped by the period they belong to.
# Each list holds one "<year>_merged.xlsx" path per year, in ascending order.
file_paths_2015_2019 = [
    f"2_updated_median_weekly_income/{year}_merged.xlsx" for year in range(2015, 2020)
]

file_paths_2020_2024 = [
    f"2_updated_median_weekly_income/{year}_merged.xlsx" for year in range(2020, 2025)
]

# Section 1: Process Income Data and Prepare to merge
1. Merge 2015-2019 and 2020-2024 separately and check whether occupation names are consistent across years -> consistent within 2015-2019 and within 2020-2024
2. Check whether the occupations in the automation-level data match the 2018 crosswalk or the 2019 crosswalk -> 2019 crosswalk
3. Summary of assigning codes to occupation names: 2015-2019 (2010 codes) & 2020-2024 (2018 codes)

### 1. Separate merging for 2015-2019 & 2020-2024 and check whether occupation names are consistent across years

In [4]:
# Occupation-name normaliser used by process_and_merge_files below
def standardize_occupation_name(name: str) -> str:
    """Return ``name`` trimmed, lower-cased, with ``-`` and ``_`` replaced by spaces.

    Values that ``pd.isna`` reports as missing are handed back unchanged.
    """
    if not pd.isna(name):
        cleaned = name.strip().lower()
        for separator in ("-", "_"):
            cleaned = cleaned.replace(separator, " ")
        return cleaned
    return name

# Function to process and merge files for a given time period
def process_and_merge_files(file_paths, output_filename):
    """Read yearly workbooks, stack them, save the result and report name consistency.

    Parameters
    ----------
    file_paths : iterable of str
        Excel files to read. The first four characters of each file's base
        name are stored in the ``Year`` column of its rows.
    output_filename : str
        File name, inside ``output/``, for the merged workbook.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated frame, or ``None`` when no file could be processed.

    Notes
    -----
    A file that raises while being processed is reported and skipped, so the
    consistency report only covers the files that were read successfully.
    """
    dfs = []  # Frames that were read successfully, in input order
    unique_occupations = {}  # year -> set of standardized occupation names

    for file in file_paths:
        year = os.path.basename(file)[:4]  # Extract year from filename

        try:
            df = pd.read_excel(file, dtype=str)  # Read file as string to preserve formatting

            # First column whose label mentions "occupation". str(col) keeps a
            # non-string header label from aborting the whole file.
            occupation_col = next(
                (col for col in df.columns if "occupation" in str(col).lower()), None
            )

            if occupation_col is not None:
                df[occupation_col] = df[occupation_col].apply(standardize_occupation_name)
                unique_occupations[year] = set(df[occupation_col].dropna().astype(str))
            else:
                print(f"⚠ Warning: No 'Occupation' column found in {file}")
                unique_occupations[year] = set()

            df['Year'] = year  # Assign year column
            dfs.append(df)

        except Exception as e:
            # Best effort: report the failure and continue with the remaining files
            print(f"❌ Error processing {file}: {e}")

    if not dfs:
        print(f"⚠ No valid data found for {output_filename}.")
        return None

    # Merge all data for this time period
    merged_df = pd.concat(dfs, ignore_index=True)

    # Save merged data
    os.makedirs("output", exist_ok=True)  # Ensure output directory exists
    output_path = f"output/{output_filename}"
    merged_df.to_excel(output_path, index=False)

    print(f"Merged file saved to: {output_path}")

    # Compare unique occupations across years
    common_occupations = set.intersection(*unique_occupations.values()) if unique_occupations else set()
    unique_counts = {year: len(occupations) for year, occupations in unique_occupations.items()}

    # Display results
    print(f"\n🔹 Unique occupation counts per year in {output_filename}:")
    print(unique_counts)
    print(f"🔹 Total common occupations across all years: {len(common_occupations)}")

    # Consistent means every year's set equals the first year's set
    occupation_sets = list(unique_occupations.values())
    all_consistent = all(occupation_sets[0] == occ_set for occ_set in occupation_sets) if occupation_sets else False

    if all_consistent:
        print(f"✅ All occupation values are **consistent** across {output_filename}.")
    else:
        print(f"⚠ Occupation values **vary** across different years in {output_filename}.")

    return merged_df

# Build one merged workbook per period; each call also prints its consistency report
merged_2015_2019 = process_and_merge_files(
    file_paths=file_paths_2015_2019,
    output_filename="merged_weekly_earning_2015_2019.xlsx",
)
merged_2020_2024 = process_and_merge_files(
    file_paths=file_paths_2020_2024,
    output_filename="merged_weekly_earning_2020_2024.xlsx",
)

Merged file saved to: output/merged_weekly_earning_2015_2019.xlsx

üîπ Unique occupation counts per year in merged_weekly_earning_2015_2019.xlsx:
{'2015': 567, '2016': 567, '2017': 567, '2018': 567, '2019': 567}
üîπ Total common occupations across all years: 567
‚úÖ All occupation values are **consistent** across merged_weekly_earning_2015_2019.xlsx.
Merged file saved to: output/merged_weekly_earning_2020_2024.xlsx

üîπ Unique occupation counts per year in merged_weekly_earning_2020_2024.xlsx:
{'2020': 597, '2021': 597, '2022': 597, '2023': 597, '2024': 597}
üîπ Total common occupations across all years: 597
‚úÖ All occupation values are **consistent** across merged_weekly_earning_2020_2024.xlsx.


occupations disappeared

In [5]:
# # Function to extract unique occupations from a given file
# def get_unique_occupations(file_path):
#     if not os.path.exists(file_path):
#         print(f"Error: File not found - {file_path}")
#         return set()
    
#     try:
#         df = pd.read_excel(file_path)
#         # Adjust column name if necessary (check for variations)
#         occupation_column = None
#         for col in df.columns:
#             if "occupation" in col.lower().strip():  # Flexible check
#                 occupation_column = col
#                 break

#         if occupation_column:
#             # Standardize occupations before storing them in a set
#             return set(df[occupation_column].dropna().apply(standardize_occupation_name).unique())
#         else:
#             print(f"Warning: No suitable occupation column found in {file_path}")
#             return set()
    
#     except Exception as e:
#         print(f"Error processing {file_path}: {e}")
#         return set()

# # Get standardized unique occupations for 2015 (earliest) and 2024 (latest)
# occupations_2015 = get_unique_occupations(file_paths_2015_2019[0])  # First file (2015)
# occupations_2024 = get_unique_occupations(file_paths_2020_2024[-1])  # Last file (2024)

# # Find occupations that disappeared (present in 2015 but not in 2024)
# disappeared_occupations = occupations_2015 - occupations_2024

# # Convert to DataFrame for easier export/display
# disappeared_df = pd.DataFrame(sorted(disappeared_occupations), columns=["Disappeared Occupations"])

# # Save output to an Excel file
# output_file = "disappeared_occupations.xlsx"
# disappeared_df.to_excel(output_file, index=False)

# # Display results
# print(f"Occupations that disappeared from 2015 to 2024 saved to: {output_file}")
# print(disappeared_df)

### 2. check whether occupation names of automation level matches with 2018 crosswalk or 2019 crosswalk -> 2019 crosswalk


In [6]:
# Reference tables; every column is read as text (dtype=str)
crosswalk_2019ONET_to_SOC = pd.read_excel("SOC/2019_to_SOC_Crosswalk.xlsx", dtype=str)
ONET_Degree_of_Automation = pd.read_excel("ONET_Degree_of_Automation.xlsx", dtype=str)
crosswalk_2010_SOC = pd.read_excel("SOC/2010-census-occupation-classification-titles-and-code-list.xlsx", dtype=str)
crosswalk_2018_SOC = pd.read_excel("SOC/2018-census-occupation-classification-titles-and-code-list.xlsx", dtype=str)
soc_2010_to_2018_crosswalk = pd.read_excel("SOC/soc_2010_to_2018_crosswalk.xlsx", dtype=str)

# Merged income workbooks written by process_and_merge_files; each period loads its own file
income_2015_2019 = pd.read_excel("output/merged_weekly_earning_2015_2019.xlsx", dtype=str)
income_2020_2024 = pd.read_excel("output/merged_weekly_earning_2020_2024.xlsx", dtype=str)

In [7]:
# Display the crosswalk's column labels; the cells below select columns by these names
crosswalk_2019ONET_to_SOC.columns

Index(['O*NET-SOC 2019 Code', 'O*NET-SOC 2019 Title', '2018 SOC Code',
       '2018 SOC Title'],
      dtype='object')

In [8]:
# Codes used by the automation dataset, plus the two candidate key columns of the crosswalk
degree_automation_codes = set(ONET_Degree_of_Automation["Code"].astype(str))
onet_soc_codes_2019 = set(crosswalk_2019ONET_to_SOC["O*NET-SOC 2019 Code"].astype(str))
soc_2018_codes = set(crosswalk_2019ONET_to_SOC["2018 SOC Code"].astype(str))

# Codes each candidate column shares with the automation dataset
onet_soc_matches = onet_soc_codes_2019 & degree_automation_codes
soc_2018_matches = soc_2018_codes & degree_automation_codes

# Size of each overlap
onet_soc_match_count, soc_2018_match_count = len(onet_soc_matches), len(soc_2018_matches)

# Cell result: (O*NET-SOC 2019 overlap, 2018 SOC overlap)
(onet_soc_match_count, soc_2018_match_count)

(879, 0)

The O*NET-SOC 2019 codes match 879 codes in the degree-of-automation dataset, whereas the 2018 SOC codes match none. This indicates that the degree-of-automation dataset uses the O*NET-SOC 2019 code format, so the O*NET-SOC 2019 code is the column to join on.

### 3. summary of crosswalk of 2015-2019 and 2020-2024

#### Comparison Results (2015-2019 Median Income Data vs. 2010 Census Classification):
1. Unique occupations from median income files (2015-2019): 565
2. Unique occupations from the 2010 census classification list: 564
3. Matching occupations between both datasets: 551
4. Non-matching occupations (found in 2015-2019 but not in the 2010 census list): 14

This indicates that the vast majority of occupations (551 out of 565, or ~97.5%) align between the two sources. However, 14 occupations are present in the 2015-2019 median income files but not in the 2010 census classification list.

#### Comparison Results (2020-2024 Median Income Data vs. 2018 Census Classification):
1. Unique occupations from median income files (2020-2024): 596
2. Unique occupations from the 2018 census classification list: 595
3. Matching occupations between both datasets: 594
4. Non-matching occupations (found in 2020-2024 but not in the 2018 census list): 2

This suggests that while the vast majority (594 out of 596) of occupations align between the two sources, 2 occupations are present in 2020-2024 but not in the 2018 census classification.

# Section 2 Merge

# **Workflow for Matching Income Data, SOC Codes, and ONET Data**

This workflow ensures that **income data from 2015 to 2024** is correctly mapped to the **2010 SOC, 2018 SOC classification**, and then further crosswalked to **2019 ONET-SOC** to integrate with the **Level of Automation dataset**.

---

## **Step 1: Merge 2015-2019 Income Data with 2010 SOC**
- **Reads** the merged income dataset for **2015-2019**.
- **Loads** the **2010 SOC classification file**.
- **Performs an outer join** on the standardized **Occupation title** column to retain all data.
- **Saves the output file as**:
  - 📄 `"output/step1_income_2015_2019_SOC2010.xlsx"`

### **Outcome:**
✅ **Merged income data (2015-2019) now contains 2010 SOC codes.**

---

## **Step 2: Crosswalk 2015-2019 Data from 2010 SOC to 2018 SOC**
- **Uses the output file from Step 1** (`step1_income_2015_2019_SOC2010.xlsx`).
- **Loads the SOC 2010 to 2018 crosswalk file**.
- **Performs a left join** of `2010 SOC CODE(S)` against the crosswalk's `2010 SOC Code`, keeping every row from Step 1; a 2010 code that maps to several 2018 codes yields several rows.
- **Saves the output file as**:
  - 📄 `"output/step2_income_2015_2019_SOC2018.xlsx"`

### **Outcome:**
✅ **Income data (2015-2019) now mapped to 2018 SOC codes.**

---

## **Step 3: Merge 2020-2024 Income Data with 2018 SOC**
- **Reads** the merged income dataset for **2020-2024**.
- **Loads** the **2018 SOC classification file**.
- **Performs an outer join** on the standardized **Occupation title** column to retain all data.
- **Saves the output file as**:
  - 📄 `"output/step3_income_2020_2024_SOC2018.xlsx"`

### **Outcome:**
✅ **Merged income data (2020-2024) now contains 2018 SOC codes.**

---

## **Step 4: Crosswalk 2018 SOC to 2019 ONET-SOC for Both Time Periods**
- **Uses the output files from Step 2 and Step 3**.
- **Loads the 2019 ONET-SOC crosswalk file**.
- **Performs an outer join** on **2018 SOC Code**.
- **Saves two separate output files**:
  - 📄 `"output/step4_income_2015_2019_SOC2018_ONET2019.xlsx"`
  - 📄 `"output/step4_income_2020_2024_SOC2018_ONET2019.xlsx"`

### **Outcome:**
✅ **Both income datasets (2015-2019 and 2020-2024) are now mapped to 2019 ONET-SOC codes, ready to be linked to the automation data in Step 5.**

---

## **Step 5: Merge 2019 ONET-SOC Data with Degree of Automation**
- **Uses the output files from Step 4**:
  - 📄 `"output/step4_income_2015_2019_SOC2018_ONET2019.xlsx"`
  - 📄 `"output/step4_income_2020_2024_SOC2018_ONET2019.xlsx"`
- **Loads the Degree of Automation dataset**.
- **Performs an outer join** on **O*NET-SOC 2019 Code** and **O*NET-SOC 2019 Title**, matched against the automation dataset's `Code` and `Occupation` columns.
- **Saves two separate output files**:
  - 📄 `"output/step5_income_2015_2019_SOC2018_automation.xlsx"`
  - 📄 `"output/step5_income_2020_2024_SOC2018_automation.xlsx"`

### **Outcome:**
✅ **Income data (2015-2019 and 2020-2024) now merged with the Degree of Automation dataset.**

---

## **Step 6: Concatenate Final Income Data**
- **Reads the final automation-linked income data files**:
  - 📄 `"output/step5_income_2015_2019_SOC2018_automation.xlsx"`
  - 📄 `"output/step5_income_2020_2024_SOC2018_automation.xlsx"`
- **Concatenates both datasets into a single file**.
- **Saves the final dataset as**:
  - 📄 `"output/final_merged_income_automation_2015_2024.xlsx"`

### **Outcome:**
✅ **A single merged dataset containing all income data (2015-2024) mapped to SOC, ONET-SOC, and automation levels.**

---

## **Final Deliverables**
1. **`output/step1_income_2015_2019_SOC2010.xlsx`** → Mapped **2015-2019** income data to **2010 SOC**.
2. **`output/step2_income_2015_2019_SOC2018.xlsx`** → Crosswalked **2015-2019** to **2018 SOC**.
3. **`output/step3_income_2020_2024_SOC2018.xlsx`** → Mapped **2020-2024** income data to **2018 SOC**.
4. **`output/step4_income_2015_2019_SOC2018_ONET2019.xlsx`** → Crosswalked **2015-2019** income data from **2018 SOC** to **2019 ONET-SOC**.
5. **`output/step4_income_2020_2024_SOC2018_ONET2019.xlsx`** → Crosswalked **2020-2024** income data from **2018 SOC** to **2019 ONET-SOC**.
6. **`output/step5_income_2015_2019_SOC2018_automation.xlsx`** → Final **2015-2019** income data merged with the Degree of Automation dataset.
7. **`output/step5_income_2020_2024_SOC2018_automation.xlsx`** → Final **2020-2024** income data merged with the Degree of Automation dataset.
8. **`output/final_merged_income_automation_2015_2024.xlsx`** → **Final fully merged dataset.**

---

In [9]:
# Folder that receives every intermediate and final workbook
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Source tables, every column read as text
crosswalk_2019ONET_to_SOC = pd.read_excel("SOC/2019_to_SOC_Crosswalk.xlsx", dtype=str)
ONET_Degree_of_Automation = pd.read_excel("ONET_Degree_of_Automation.xlsx", dtype=str)
crosswalk_2010_SOC = pd.read_excel("SOC/2010-census-occupation-classification-titles-and-code-list.xlsx", dtype=str)
crosswalk_2018_SOC = pd.read_excel("SOC/2018-census-occupation-classification-titles-and-code-list.xlsx", dtype=str)
soc_2010_to_2018_crosswalk = pd.read_excel("SOC/soc_2010_to_2018_crosswalk.xlsx", dtype=str)
income_2015_2019 = pd.read_excel("output/merged_weekly_earning_2015_2019.xlsx", dtype=str)
income_2020_2024 = pd.read_excel("output/merged_weekly_earning_2020_2024.xlsx", dtype=str)

# ------------------------------
# Give the occupation-title column the same label ("Occupation title") in every table
# ------------------------------
income_2015_2019 = income_2015_2019.rename(columns={"occupation_title": "Occupation title"})
income_2020_2024 = income_2020_2024.rename(columns={"occupation_title": "Occupation title"})
crosswalk_2010_SOC = crosswalk_2010_SOC.rename(columns={"OCCUPATION TITLE": "Occupation title"})
# Identity mapping: the label is left as it is
crosswalk_2018_SOC = crosswalk_2018_SOC.rename(columns={"Occupation title": "Occupation title"})

# Column labels that may hold occupation names; only those present in a table are touched
occupation_columns = ["Occupation title", "occupation", "Occupation", "occupation name"]

# Standardize occupation names in place, table by table
for frame in (income_2015_2019, income_2020_2024, crosswalk_2010_SOC, crosswalk_2018_SOC):
    for col in occupation_columns:
        if col in frame.columns:
            frame[col] = frame[col].apply(standardize_occupation_name)


In [10]:
# ------------------------------
# Step 1: Merge 2015-2019 Income Data with 2010 SOC -> nearly all rows matched
# ------------------------------
# Outer join on the standardized title: rows present on only one side are kept,
# with missing values in the other side's columns.
income_2015_2019_SOC2010 = income_2015_2019.merge(
    crosswalk_2010_SOC, how="outer", on="Occupation title", suffixes=("", "_2010SOC")
)
step1_output = f"{output_dir}/step1_income_2015_2019_SOC2010.xlsx"
income_2015_2019_SOC2010.to_excel(step1_output, index=False)


# ------------------------------
# PRINT OUTPUT PATHS FOR REFERENCE
# ------------------------------
print(f"✅ Step 1 Output: {step1_output}")

‚úÖ Step 1 Output: output/step1_income_2015_2019_SOC2010.xlsx


In [11]:
# ------------------------------
# Step 2: Crosswalk 2015-2019 Data from 2010 SOC to 2018 SOC (One-to-Many Expansion)
# ------------------------------

# Strip whitespace from the SOC code columns. astype(str) on its own would turn a
# missing code into the literal text "nan"; .where(notna) restores those entries
# to missing so they are not mistaken for real codes.
for frame, code_column in (
    (income_2015_2019_SOC2010, "2010 SOC CODE(S)"),
    (soc_2010_to_2018_crosswalk, "2010 SOC Code"),
    (soc_2010_to_2018_crosswalk, "2018 SOC Code"),
):
    codes = frame[code_column]
    frame[code_column] = codes.astype(str).str.strip().where(codes.notna())

# Perform an expanded merge where one 2010 SOC code maps to multiple 2018 SOC codes
income_2015_2019_SOC2018 = income_2015_2019_SOC2010.merge(
    soc_2010_to_2018_crosswalk,
    how="left",  # Left join to keep all income data
    left_on="2010 SOC CODE(S)",
    right_on="2010 SOC Code",
    suffixes=("", "_2018SOC")
)

# Save the updated dataset
step2_output = f"{output_dir}/step2_income_2015_2019_SOC2018.xlsx"
income_2015_2019_SOC2018.to_excel(step2_output, index=False)

# ------------------------------
# Step 3: Merge 2020-2024 Income Data with 2018 SOC -> almost all is matched
# ------------------------------
# Outer join on the standardized title keeps rows that exist on only one side
income_2020_2024_SOC2018 = income_2020_2024.merge(
    crosswalk_2018_SOC, how="outer", on="Occupation title", suffixes=("", "_2018SOC")
)
step3_output = f"{output_dir}/step3_income_2020_2024_SOC2018.xlsx"
income_2020_2024_SOC2018.to_excel(step3_output, index=False)

# ------------------------------
# PRINT OUTPUT PATHS FOR REFERENCE
# ------------------------------
print(f"✅ Step 2 Output: {step2_output}")
print(f"✅ Step 3 Output: {step3_output}")

‚úÖ Step 2 Output: output/step2_income_2015_2019_SOC2018.xlsx
‚úÖ Step 3 Output: output/step3_income_2020_2024_SOC2018.xlsx


In [12]:
# ------------------------------
# Step 4: Crosswalk 2018 SOC to 2019 ONET-SOC for Both Time Periods
# ------------------------------

# Standardize column name in 2020-2024 dataset
income_2020_2024_SOC2018 = income_2020_2024_SOC2018.rename(columns={"2018 SOC code": "2018 SOC Code"})

# Strip whitespace from the join key in all three tables. astype(str) on its own
# would turn a missing code into the literal text "nan"; .where(notna) restores
# those entries to missing.
for frame in (income_2015_2019_SOC2018, income_2020_2024_SOC2018, crosswalk_2019ONET_to_SOC):
    codes = frame["2018 SOC Code"]
    frame["2018 SOC Code"] = codes.astype(str).str.strip().where(codes.notna())

# Merge 2015-2019 Data with ONET (outer join keeps rows found on only one side)
income_2015_2019_SOC2018_ONET2019 = income_2015_2019_SOC2018.merge(
    crosswalk_2019ONET_to_SOC,
    how="outer",
    on="2018 SOC Code",
    suffixes=("", "_ONET2019")
)
step4_output_2015_2019 = f"{output_dir}/step4_income_2015_2019_SOC2018_ONET2019.xlsx"
income_2015_2019_SOC2018_ONET2019.to_excel(step4_output_2015_2019, index=False)

# Merge 2020-2024 Data with ONET (same join as above)
income_2020_2024_SOC2018_ONET2019 = income_2020_2024_SOC2018.merge(
    crosswalk_2019ONET_to_SOC,
    how="outer",
    on="2018 SOC Code",
    suffixes=("", "_ONET2019")
)
step4_output_2020_2024 = f"{output_dir}/step4_income_2020_2024_SOC2018_ONET2019.xlsx"
income_2020_2024_SOC2018_ONET2019.to_excel(step4_output_2020_2024, index=False)

# ------------------------------
# PRINT OUTPUT PATHS FOR REFERENCE
# ------------------------------
print(f"✅ Step 4 Output (2015-2019): {step4_output_2015_2019}")
print(f"✅ Step 4 Output (2020-2024): {step4_output_2020_2024}")

‚úÖ Step 4 Output (2015-2019): output/step4_income_2015_2019_SOC2018_ONET2019.xlsx
‚úÖ Step 4 Output (2020-2024): output/step4_income_2020_2024_SOC2018_ONET2019.xlsx


In [13]:
# ------------------------------
# Step 5: Merge Step 4 Results with Degree of Automation Data
# ------------------------------
# Re-read the Step 4 workbooks from disk, every column as text
step4_income_2015_2019_SOC2018_ONET2019 = pd.read_excel(step4_output_2015_2019, dtype=str)
step4_income_2020_2024_SOC2018_ONET2019 = pd.read_excel(step4_output_2020_2024, dtype=str)

# Both periods are joined to the automation table with the same settings.
# NOTE(review): a row only matches when code AND title both agree, so a title
# spelled differently in the two sources prevents a match even if the code is
# identical - confirm this is intended.
automation_join = dict(
    how="outer",
    left_on=["O*NET-SOC 2019 Code", "O*NET-SOC 2019 Title"],
    right_on=["Code", "Occupation"],
    suffixes=("", "_automation"),
)

# Merge 2015-2019 Data with Degree of Automation
income_2015_2019_SOC2018_automation = step4_income_2015_2019_SOC2018_ONET2019.merge(
    ONET_Degree_of_Automation, **automation_join
)

# Save the final merged file for 2015-2019
step5_output_2015_2019 = f"{output_dir}/step5_income_2015_2019_SOC2018_automation.xlsx"
income_2015_2019_SOC2018_automation.to_excel(step5_output_2015_2019, index=False)

# Merge 2020-2024 Data with Degree of Automation
income_2020_2024_SOC2018_automation = step4_income_2020_2024_SOC2018_ONET2019.merge(
    ONET_Degree_of_Automation, **automation_join
)

# Save the final merged file for 2020-2024
step5_output_2020_2024 = f"{output_dir}/step5_income_2020_2024_SOC2018_automation.xlsx"
income_2020_2024_SOC2018_automation.to_excel(step5_output_2020_2024, index=False)
print(f"✅ Step 5 Output (2015-2019): {step5_output_2015_2019}")
print(f"✅ Step 5 Output (2020-2024): {step5_output_2020_2024}")


‚úÖ Step 5 Output (2015-2019): output/step5_income_2015_2019_SOC2018_automation.xlsx
‚úÖ Step 5 Output (2020-2024): output/step5_income_2020_2024_SOC2018_automation.xlsx


In [14]:
import pandas as pd

# ------------------------------
# Match-rate summary for Steps 1-5
# ------------------------------
def _match_stats(frame, column):
    """Return (total rows, rows where ``column`` is not null, that share in percent).

    Totals are row counts of ``frame``, not counts of distinct occupations.
    """
    total = len(frame)
    matched = frame[column].notna().sum()
    return total, matched, (matched / total) * 100


def _describe_match(percentage, matched, total):
    """Format one summary entry, e.g. '98.94% matched (2808 out of 2838)'."""
    return f"{percentage:.2f}% matched ({matched} out of {total})"


# ------------------------------
# Load datasets
# ------------------------------
try:
    step1_income_2015_2019_SOC2010 = pd.read_excel("output/step1_income_2015_2019_SOC2010.xlsx", dtype=str)
    step2_income_2015_2019_SOC2018 = pd.read_excel("output/step2_income_2015_2019_SOC2018.xlsx", dtype=str)
    step3_income_2020_2024_SOC2018 = pd.read_excel("output/step3_income_2020_2024_SOC2018.xlsx", dtype=str)
except FileNotFoundError as e:
    print(f"❌ Error: {e}")
    # Re-raise so the cell stops with a traceback; exit() in a notebook asks the
    # kernel itself to shut down, which would discard all notebook state.
    raise

# Fix column name inconsistency
step3_income_2020_2024_SOC2018 = step3_income_2020_2024_SOC2018.rename(columns={"2018 SOC code": "2018 SOC Code"})

# ------------------------------
# Step 1: Check match percentage for 2015-2019 to 2010 SOC
# ------------------------------
step1_total_occupations, step1_matched_occupations, step1_match_percentage = _match_stats(
    step1_income_2015_2019_SOC2010, "2010 SOC CODE(S)"
)

# ------------------------------
# Step 2: Check match percentage for 2010 SOC to 2018 SOC
# ------------------------------
step2_total_occupations, step2_matched_occupations, step2_match_percentage = _match_stats(
    step2_income_2015_2019_SOC2018, "2018 SOC Code"
)

# ------------------------------
# Step 3: Check match percentage for 2020-2024 to 2018 SOC
# ------------------------------
step3_total_occupations, step3_matched_occupations, step3_match_percentage = _match_stats(
    step3_income_2020_2024_SOC2018, "2018 SOC Code"
)

# ------------------------------
# Store results in a dictionary
# ------------------------------
match_summary = {
    "Step 1 (2015-2019 to 2010 SOC)": _describe_match(
        step1_match_percentage, step1_matched_occupations, step1_total_occupations
    ),
    "Step 2 (2010 SOC to 2018 SOC)": _describe_match(
        step2_match_percentage, step2_matched_occupations, step2_total_occupations
    ),
    "Step 3 (2020-2024 to 2018 SOC)": _describe_match(
        step3_match_percentage, step3_matched_occupations, step3_total_occupations
    ),
}

# Print summary
for step, summary in match_summary.items():
    print(f"{step}: {summary}")

# ------------------------------
# Load Step 4 and Step 5 datasets
# ------------------------------
# The step4_* frames hold the Step 4 (ONET-SOC) workbooks despite the
# "_automation" suffix in their variable names.
try:
    step4_income_2015_2019_SOC2018_automation = pd.read_excel("output/step4_income_2015_2019_SOC2018_ONET2019.xlsx", dtype=str)
    step4_income_2020_2024_SOC2018_automation = pd.read_excel("output/step4_income_2020_2024_SOC2018_ONET2019.xlsx", dtype=str)
    step5_income_2015_2019_SOC2018_automation_final = pd.read_excel("output/step5_income_2015_2019_SOC2018_automation.xlsx", dtype=str)
    step5_income_2020_2024_SOC2018_automation_final = pd.read_excel("output/step5_income_2020_2024_SOC2018_automation.xlsx", dtype=str)
except FileNotFoundError as e:
    print(f"❌ Error: {e}")
    raise  # same reasoning as above: stop the cell, keep the kernel

# ------------------------------
# Step 4: Check match percentage for 2018 SOC to 2019 ONET-SOC
# ------------------------------
# Step 4 (2015-2019)
step4_total_2015_2019, step4_matched_2015_2019, step4_match_percentage_2015_2019 = _match_stats(
    step4_income_2015_2019_SOC2018_automation, "O*NET-SOC 2019 Code"
)

# Step 4 (2020-2024)
step4_total_2020_2024, step4_matched_2020_2024, step4_match_percentage_2020_2024 = _match_stats(
    step4_income_2020_2024_SOC2018_automation, "O*NET-SOC 2019 Code"
)

# ------------------------------
# Step 5: Check match percentage for 2019 ONET-SOC to Degree of Automation
# ------------------------------
# Step 5 (2015-2019)
step5_total_2015_2019, step5_matched_2015_2019, step5_match_percentage_2015_2019 = _match_stats(
    step5_income_2015_2019_SOC2018_automation_final, "Context"
)

# Step 5 (2020-2024)
step5_total_2020_2024, step5_matched_2020_2024, step5_match_percentage_2020_2024 = _match_stats(
    step5_income_2020_2024_SOC2018_automation_final, "Context"
)

# ------------------------------
# Store results in a dictionary and print
# ------------------------------
match_summary_step4_step5 = {
    "Step 4 (2015-2019 to 2019 ONET-SOC)": _describe_match(
        step4_match_percentage_2015_2019, step4_matched_2015_2019, step4_total_2015_2019
    ),
    "Step 4 (2020-2024 to 2019 ONET-SOC)": _describe_match(
        step4_match_percentage_2020_2024, step4_matched_2020_2024, step4_total_2020_2024
    ),
    "Step 5 (2015-2019 to Degree of Automation)": _describe_match(
        step5_match_percentage_2015_2019, step5_matched_2015_2019, step5_total_2015_2019
    ),
    "Step 5 (2020-2024 to Degree of Automation)": _describe_match(
        step5_match_percentage_2020_2024, step5_matched_2020_2024, step5_total_2020_2024
    ),
}

# Print summary
for step, summary in match_summary_step4_step5.items():
    print(f"{step}: {summary}")

Step 1 (2015-2019 to 2010 SOC): 98.94% matched (2808 out of 2838)
Step 2 (2010 SOC to 2018 SOC): 71.95% matched (2178 out of 3027)
Step 3 (2020-2024 to 2018 SOC): 99.16% matched (2963 out of 2988)
Step 4 (2015-2019 to 2019 ONET-SOC): 79.06% matched (3205 out of 4054)
Step 4 (2020-2024 to 2019 ONET-SOC): 77.99% matched (3104 out of 3980)
Step 5 (2015-2019 to Degree of Automation): 69.24% matched (2787 out of 4025)
Step 5 (2020-2024 to Degree of Automation): 70.25% matched (2775 out of 3950)


In [15]:
# ------------------------------
# Step 6: Concatenate Final Income Data
# ------------------------------
# Read both Step 5 workbooks back from disk, every column as text
income_2015_2019_final = pd.read_excel(step5_output_2015_2019, dtype=str)
income_2020_2024_final = pd.read_excel(step5_output_2020_2024, dtype=str)

# Stack the two periods; ignore_index gives the result a fresh 0..n-1 index
final_merged_income_automation = pd.concat([income_2015_2019_final, income_2020_2024_final], ignore_index=True)

# Save the final merged dataset
final_output_path = f"{output_dir}/final_merged_income_automation_2015_2024.xlsx"
final_merged_income_automation.to_excel(final_output_path, index=False)

# ------------------------------
# Step 6: Check match percentage for Context in the final dataset
# ------------------------------
# Counts are rows of the concatenated table with a non-null 'Context' value
total_occupations = len(final_merged_income_automation)
matched_occupations = final_merged_income_automation["Context"].notna().sum()
match_percentage = (matched_occupations / total_occupations) * 100

# Print summary
print(f"✅ Final Output: {final_output_path}")
print(f"📊 Step 6: {match_percentage:.2f}% occupations have a match in the 'Context' column "
      f"({matched_occupations} out of {total_occupations}).")

‚úÖ Final Output: output/final_merged_income_automation_2015_2024.xlsx
üìä Step 6: 69.74% occupations have a match in the 'Context' column (5562 out of 7975).


make sure all occupations from the income data are retained 

In [16]:
# Re-read the Step 5 workbooks from disk
final_2015_2019 = pd.read_excel(step5_output_2015_2019)
final_2020_2024 = pd.read_excel(step5_output_2020_2024)

# Distinct occupation titles in the in-memory income tables ...
orig_occupations_2015_2019 = set(income_2015_2019["Occupation title"].unique())
orig_occupations_2020_2024 = set(income_2020_2024["Occupation title"].unique())

# ... and in the final workbooks
final_occupations_2015_2019 = set(final_2015_2019["Occupation title"].unique())
final_occupations_2020_2024 = set(final_2020_2024["Occupation title"].unique())

# Titles present in the income data but absent from the final data
missing_2015_2019 = orig_occupations_2015_2019.difference(final_occupations_2015_2019)
missing_2020_2024 = orig_occupations_2020_2024.difference(final_occupations_2020_2024)

missing_by_period = (
    ("2015-2019", missing_2015_2019),
    ("2020-2024", missing_2020_2024),
)

# Report the counts first, then list the titles for any period that lost some
for period, missing in missing_by_period:
    print(f"Missing occupations in {period}: {len(missing)}")

for period, missing in missing_by_period:
    if missing:
        print(f"Some occupations from {period} are missing in the final dataset:", missing)

Missing occupations in 2015-2019: 0
Missing occupations in 2020-2024: 0
