In [22]:
import pandas as pd

# Directory holding the cleaned per-source CSVs; the merged file is written here too.
CLEANED_DIR = "../data/cleaned"


def clean_major_codes(majors):
    """Return a copy of the majors lookup table with an integer ``Major_code``.

    The ``FOD1P`` column (if present) is renamed to ``Major_code``. Values are
    stringified and stripped of surrounding whitespace; rows whose code is not
    made up solely of decimal digits are dropped, and the remaining codes are
    cast to ``int``.

    Parameters
    ----------
    majors : pandas.DataFrame
        Lookup table containing a ``FOD1P`` or ``Major_code`` column.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) keeping the original index
        labels of the surviving rows.
    """
    majors = majors.rename(columns={"FOD1P": "Major_code"})
    codes = majors["Major_code"].astype(str).str.strip()
    # Digits-only match guarantees the int cast below cannot fail;
    # na=False treats missing codes as "not a code" so the mask is purely boolean.
    is_code = codes.str.fullmatch(r"\d+", na=False)
    # Explicit copy: assigning into a filtered slice triggers SettingWithCopyWarning.
    cleaned = majors.loc[is_code].copy()
    cleaned["Major_code"] = codes.loc[is_code].astype(int)
    return cleaned


def merge_major_tables(all_ages, recent_grads, grad_students, women_stem, majors_list):
    """Left-join the per-source tables onto ``all_ages`` by ``Major_code``.

    Every join uses ``validate="many_to_one"`` so a duplicated ``Major_code``
    in a right-hand table raises ``pandas.errors.MergeError`` instead of
    silently multiplying rows.

    Parameters
    ----------
    all_ages : pandas.DataFrame
        Base table; its rows and row order are preserved in the result.
    recent_grads, grad_students, women_stem, majors_list : pandas.DataFrame
        Tables contributing the columns selected below. All must share an
        integer ``Major_code`` key.

    Returns
    -------
    pandas.DataFrame
        ``all_ages`` plus the selected columns from each other table. Where a
        ``women_stem`` column name collides with an existing one, the existing
        column keeps its name and the ``women_stem`` column gets a ``_ws``
        suffix.
    """
    merged = all_ages.copy()

    merged = merged.merge(
        recent_grads[["Major_code", "ShareWomen", "College_jobs", "Non_college_jobs", "Low_wage_jobs"]],
        on="Major_code", how="left", validate="many_to_one",
    )

    merged = merged.merge(
        grad_students[["Major_code", "Grad_median", "Nongrad_median", "salary_premium"]],
        on="Major_code", how="left", validate="many_to_one",
    )

    # suffixes are (left, right): the empty left suffix keeps the columns already
    # in `merged` unchanged, and only the incoming women_stem columns get "_ws".
    merged = merged.merge(
        women_stem[["Major_code", "Total", "Men", "Women"]],
        on="Major_code", how="left", suffixes=("", "_ws"), validate="many_to_one",
    )

    merged = merged.merge(
        majors_list[["Major_code", "Major_Category"]],
        on="Major_code", how="left", validate="many_to_one",
    )

    # Drop duplicate column labels if any.
    return merged.loc[:, ~merged.columns.duplicated()]


# In a notebook kernel __name__ is "__main__", so this runs exactly like plain
# cell code (all names below stay global); the guard only keeps the helpers
# importable without touching the disk.
if __name__ == "__main__":
    # Load CSVs
    all_ages = pd.read_csv(f"{CLEANED_DIR}/all_ages_cleaned.csv")
    recent_grads = pd.read_csv(f"{CLEANED_DIR}/recent_grads_cleaned.csv")
    grad_students = pd.read_csv(f"{CLEANED_DIR}/grad_students_cleaned.csv")
    women_stem = pd.read_csv(f"{CLEANED_DIR}/women_stem_cleaned.csv")
    majors_list = pd.read_csv(f"{CLEANED_DIR}/majors_list_cleaned.csv")

    # Rename FOD1P -> Major_code, drop non-numeric codes, cast to int
    majors_list = clean_major_codes(majors_list)
    print("Cleaned majors_list shape:", majors_list.shape)

    # Ensure every join key is int so the merges compare like with like
    for frame in (all_ages, recent_grads, grad_students, women_stem):
        frame["Major_code"] = frame["Major_code"].astype(int)

    merged = merge_major_tables(all_ages, recent_grads, grad_students, women_stem, majors_list)

    # Save
    merged.to_csv(f"{CLEANED_DIR}/college_majors_merged.csv", index=False)

    print("Merge successful! Shape:", merged.shape)


Cleaned majors_list shape: (173, 3)
Merge successful! Shape: (173, 25)
