In [6]:
import pandas as pd
import re
from pathlib import Path


# Locate the College Scorecard field-of-study extract relative to this notebook
input_path = Path("..", "data", "Raw_data", "Most-Recent-Cohorts-Field-of-Study.csv")

# CIPDESC is the only column used below, so skip parsing the rest of the file
df = pd.read_csv(input_path, usecols=["CIPDESC"])

# Report the raw row count before any filtering
print(f" Loaded {df.shape[0]:,} rows from College Scorecard file.")

# First normalization pass: discard rows with no CIPDESC value, keep one row
# per distinct description, and expose the column as "major_name".
df = (
    df.dropna(subset=["CIPDESC"])
    .drop_duplicates(subset=["CIPDESC"])
    .rename(columns={"CIPDESC": "major_name"})
)

# Clean formatting
def clean_major(name: str) -> str:
    """Return a normalized display name for a raw major description.

    The cleanup collapses runs of whitespace, trims the ends, removes a single
    trailing full stop, and removes a trailing ", General" or ", Other"
    qualifier (case-insensitive).

    The qualifier is only removed when it is the last thing in the name, so a
    title that merely continues after the word (for example
    "..., General Studies and Humanities") is kept intact.

    Args:
        name: Raw description text; must be a string (callers drop NaN first).

    Returns:
        The cleaned name with no leading or trailing whitespace.
    """
    # Normalize whitespace first so the end-anchored patterns below always see
    # the true end of the text.
    name = re.sub(r"\s+", " ", name).strip()
    # Remove one trailing full stop, plus any space left in front of it.
    name = re.sub(r"\s*\.$", "", name)
    # Remove the qualifier only as a whole trailing word; anchoring with "$"
    # prevents truncating titles that continue after "General"/"Other".
    name = re.sub(r"\s*,\s*(?:General|Other)$", "", name, flags=re.IGNORECASE)
    return name.strip()

# Normalize every name, collapse names that became identical after cleaning,
# then order alphabetically with a fresh 0..n-1 index.
df = (
    df.assign(major_name=df["major_name"].apply(clean_major))
    .drop_duplicates(subset=["major_name"])
    .sort_values("major_name")
    .reset_index(drop=True)
)


# Write the cleaned list to data/clean/us_majors.csv, creating the folder
# (and any missing parents) on first run.
output_dir = Path("..", "data", "clean")
output_path = output_dir.joinpath("us_majors.csv")
output_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False, encoding="utf-8")

print(" Cleaned majors and saved")
print(f"Total unique majors: {df.shape[0]}")

# Last expression of the cell: show the first rows as a quick sanity check
df.head(15)


 Loaded 229,188 rows from College Scorecard file.
 Cleaned majors and saved
Total unique majors: 425


Unnamed: 0,major_name
0,Accounting and Computer Science
1,Accounting and Related Services
2,Advanced/Graduate Dentistry and Oral Sciences
3,"Aerospace, Aeronautical and Astronautical Engi..."
4,"African Languages, Literatures, and Linguistics"
5,Agricultural Business and Management
6,Agricultural Engineering
7,Agricultural Mechanization
8,Agricultural Production Operations
9,Agricultural Public Services
