# College Scorecard ROI Analysis - Data Preprocessing


### Part 1: Setup & Data Loading

In [2]:
# Import necessary libraries
import pandas as pd
import gdown
from pathlib import Path



In [3]:
# 1. Setup file path and download link
data_dir = Path('../data/raw')      # Directory to store raw data
file_path = data_dir / 'Most-Recent-Cohorts-Field-of-Study.csv'   # Define the final data file name and path
drive_link = "https://drive.google.com/file/d/1ER-vyYO-dxN-qLAwDFsovOU_-JSw30SP/view?usp=sharing"     # Google Drive link to download the data

# 2. Check if file exists, download if it doesn't
data_dir.mkdir(parents=True, exist_ok=True)     # Make sure the data directory exists, create it if not

if not file_path.exists():
    print(f"File not found at '{file_path}', starting download from Google Drive...")
    gdown.download(drive_link, str(file_path), fuzzy=True)
    print("Download complete!")
else:
    print(f"File already exists at '{file_path}', skipping download.")

# 3. Load the CSV data into a DataFrame
df_raw = pd.read_csv(file_path, low_memory=False)     
print("Data loaded successfully!")


File already exists at '..\data\raw\Most-Recent-Cohorts-Field-of-Study.csv', skipping download.
Data loaded successfully!


### Part 2: Initial Data Exploration

In [5]:
# Display the shape of the dataset.
print(f"Dataset shape (rows, columns): {df_raw.shape}")

# Display the first 5 rows of the data.
print("\nFirst 5 rows of the raw data:")
display(df_raw.head())
print("\n")

# Display a summary of the data types.
print("\nData types and non-null values:")
df_raw.info()

Dataset shape (rows, columns): (229188, 174)

First 5 rows of the raw data:


Unnamed: 0,UNITID,OPEID6,INSTNM,CONTROL,MAIN,CIPCODE,CIPDESC,CREDLEV,CREDDESC,IPEDSCOUNT1,...,EARN_COUNT_PELL_WNE_5YR,EARN_PELL_WNE_MDN_5YR,EARN_COUNT_NOPELL_WNE_5YR,EARN_NOPELL_WNE_MDN_5YR,EARN_COUNT_MALE_WNE_5YR,EARN_MALE_WNE_MDN_5YR,EARN_COUNT_NOMALE_WNE_5YR,EARN_NOMALE_WNE_MDN_5YR,EARN_COUNT_HIGH_CRED_5YR,EARN_IN_STATE_5YR
0,100654.0,1002,Alabama A & M University,Public,1,100,"Agriculture, General.",3,Bachelor's Degree,,...,PS,PS,PS,PS,PS,PS,PS,PS,PS,PS
1,100654.0,1002,Alabama A & M University,Public,1,101,Agricultural Business and Management.,3,Bachelor's Degree,,...,PS,PS,PS,PS,PS,PS,PS,PS,PS,PS
2,100654.0,1002,Alabama A & M University,Public,1,109,Animal Sciences.,3,Bachelor's Degree,3.0,...,PS,PS,PS,PS,PS,PS,PS,PS,PS,PS
3,100654.0,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,3,Bachelor's Degree,7.0,...,PS,PS,PS,PS,PS,PS,PS,PS,PS,PS
4,100654.0,1002,Alabama A & M University,Public,1,110,Food Science and Technology.,5,Master's Degree,4.0,...,PS,PS,PS,PS,PS,PS,PS,PS,PS,PS





Data types and non-null values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229188 entries, 0 to 229187
Columns: 174 entries, UNITID to EARN_IN_STATE_5YR
dtypes: float64(3), int64(5), object(166)
memory usage: 304.3+ MB


### Part 3: Column Selection for ROI Analysis

In [9]:
# Define a list of columns needed for our analysis.
columns_to_keep = [
    # Earnings Data (our primary outcome)
    'EARN_MDN_5YR',         # Median earnings 5 years after graduation
    
    # Debt Data (our primary cost)
    'DEBT_ALL_STGP_ANY_MDN',  # Median debt for all students, regardless of completion
    'DEBT_ALL_STGP_ANY_MDN10YRPAY', # 10-year median monthly payment
        
    # Program Identifiers
    'INSTNM',               # Institution name
    'CIPCODE',              # Classification of Instructional Programs code
    'CIPDESC',              # Description of the CIP code (the major)
    'CREDLEV',              # Credential level (e.g., 3 for Bachelor's)
    'CREDDESC',             # Description of the credential level
    
    # Institution / Program Info
    'CONTROL',              # Control of institution (Public, Private nonprofit, Private for-profit)
    'IPEDSCOUNT2',          # Number of graduates in the program cohort
]

# Select these columns from the raw DataFrame to create a new one.
df_selected = df_raw[columns_to_keep].copy()

# Verify the shape and content of the new DataFrame.
print("DataFrame after selecting columns:")
print(f"New shape (rows, columns): {df_selected.shape}")

print("\nFirst 5 rows of the selected data:")
display(df_selected.head())

print("\nInfo of the new DataFrame:")
df_selected.info()

DataFrame after selecting columns:
New shape (rows, columns): (229188, 10)

First 5 rows of the selected data:


Unnamed: 0,EARN_MDN_5YR,DEBT_ALL_STGP_ANY_MDN,DEBT_ALL_STGP_ANY_MDN10YRPAY,INSTNM,CIPCODE,CIPDESC,CREDLEV,CREDDESC,CONTROL,IPEDSCOUNT2
0,PS,PS,PS,Alabama A & M University,100,"Agriculture, General.",3,Bachelor's Degree,Public,
1,PS,PS,PS,Alabama A & M University,101,Agricultural Business and Management.,3,Bachelor's Degree,Public,
2,PS,PS,PS,Alabama A & M University,109,Animal Sciences.,3,Bachelor's Degree,Public,9.0
3,PS,PS,PS,Alabama A & M University,110,Food Science and Technology.,3,Bachelor's Degree,Public,10.0
4,PS,PS,PS,Alabama A & M University,110,Food Science and Technology.,5,Master's Degree,Public,6.0



Info of the new DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229188 entries, 0 to 229187
Data columns (total 10 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   EARN_MDN_5YR                  229188 non-null  object 
 1   DEBT_ALL_STGP_ANY_MDN         220333 non-null  object 
 2   DEBT_ALL_STGP_ANY_MDN10YRPAY  220333 non-null  object 
 3   INSTNM                        229188 non-null  object 
 4   CIPCODE                       229188 non-null  int64  
 5   CIPDESC                       229188 non-null  object 
 6   CREDLEV                       229188 non-null  int64  
 7   CREDDESC                      229188 non-null  object 
 8   CONTROL                       229188 non-null  object 
 9   IPEDSCOUNT2                   194383 non-null  float64
dtypes: float64(1), int64(2), object(7)
memory usage: 17.5+ MB


### Part 4: Data Cleaning & Type Conversion
Convert to Numeric and Analyze Missing Data

In [10]:
df_cleaned = df_selected.copy()

# Define the columns that need to be converted to a numeric type.
numeric_cols = ['EARN_MDN_5YR', 'DEBT_ALL_STGP_ANY_MDN', 'DEBT_ALL_STGP_ANY_MDN10YRPAY']

# Loop through these columns and convert their data type to numeric.
for col in numeric_cols:
    # errors='coerce' will automatically replace any values that can't be converted (like 'PrivacySuppressed') with NaN (missing value).
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')


# Check the data types again to confirm the conversion was successful.
print("\nData types after conversion:")
df_cleaned.info()


# Next, inspect the missing values in our key columns.
print("\nMissing value analysis:")

# Calculate the total number of missing values for each column.
missing_counts = df_cleaned.isnull().sum()

# Calculate the percentage of missing values relative to the total number of rows.
missing_percentage = (missing_counts / len(df_cleaned) * 100).round(2)

# Create a summary table to display the missing value information clearly.
missing_summary = pd.DataFrame({
    'missing_count': missing_counts, 
    'missing_percentage': missing_percentage
})

# Filter for columns that have missing values and sort them by percentage in descending order.
print(missing_summary[missing_summary['missing_count'] > 0].sort_values(by='missing_percentage', ascending=False))


Data types after conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229188 entries, 0 to 229187
Data columns (total 10 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   EARN_MDN_5YR                  51551 non-null   float64
 1   DEBT_ALL_STGP_ANY_MDN         39385 non-null   float64
 2   DEBT_ALL_STGP_ANY_MDN10YRPAY  39385 non-null   float64
 3   INSTNM                        229188 non-null  object 
 4   CIPCODE                       229188 non-null  int64  
 5   CIPDESC                       229188 non-null  object 
 6   CREDLEV                       229188 non-null  int64  
 7   CREDDESC                      229188 non-null  object 
 8   CONTROL                       229188 non-null  object 
 9   IPEDSCOUNT2                   194383 non-null  float64
dtypes: float64(4), int64(2), object(4)
memory usage: 17.5+ MB

Missing value analysis:
                              missing_count  missin

### Part 5: Feature Engineering - Calculate Core ROI Metrics

In [11]:
df_roi = df_cleaned.copy()

# 1. Calculate the Earnings-to-Debt Ratio. This is our core ROI metric. Higher is better.
df_roi['ROI_EARNINGS_TO_DEBT'] = df_roi['EARN_MDN_5YR'] / df_roi['DEBT_ALL_STGP_ANY_MDN']

# 2. Calculate the Payback Years, which is the Debt-to-Income ratio. Lower is better.
df_roi['PAYBACK_YEARS'] = df_roi['DEBT_ALL_STGP_ANY_MDN'] / df_roi['EARN_MDN_5YR']

# 3. Calculate the monthly payment as a percentage of monthly earnings. This measures affordability. Lower is better.
monthly_earnings = df_roi['EARN_MDN_5YR'] / 12
df_roi['MONTHLY_PAYMENT_PCT'] = (df_roi['DEBT_ALL_STGP_ANY_MDN10YRPAY'] / monthly_earnings) * 100


# Display a preview of the data with the new columns.
print("\nData preview with new ROI metrics:")
display(df_roi.head())

# Display descriptive statistics for the newly calculated metrics to understand their distribution.
print("\nStatistics for new ROI metrics:")
display(df_roi[['ROI_EARNINGS_TO_DEBT', 'PAYBACK_YEARS', 'MONTHLY_PAYMENT_PCT']].describe())


Data preview with new ROI metrics:


Unnamed: 0,EARN_MDN_5YR,DEBT_ALL_STGP_ANY_MDN,DEBT_ALL_STGP_ANY_MDN10YRPAY,INSTNM,CIPCODE,CIPDESC,CREDLEV,CREDDESC,CONTROL,IPEDSCOUNT2,ROI_EARNINGS_TO_DEBT,PAYBACK_YEARS,MONTHLY_PAYMENT_PCT
0,,,,Alabama A & M University,100,"Agriculture, General.",3,Bachelor's Degree,Public,,,,
1,,,,Alabama A & M University,101,Agricultural Business and Management.,3,Bachelor's Degree,Public,,,,
2,,,,Alabama A & M University,109,Animal Sciences.,3,Bachelor's Degree,Public,9.0,,,
3,,,,Alabama A & M University,110,Food Science and Technology.,3,Bachelor's Degree,Public,10.0,,,
4,,,,Alabama A & M University,110,Food Science and Technology.,5,Master's Degree,Public,6.0,,,



Statistics for new ROI metrics:


Unnamed: 0,ROI_EARNINGS_TO_DEBT,PAYBACK_YEARS,MONTHLY_PAYMENT_PCT
count,30792.0,30792.0,30792.0
mean,2.992728,0.414483,5.273066
std,1.625808,0.194908,2.479519
min,0.417793,0.026863,0.339933
25%,1.95616,0.279397,3.555956
50%,2.643751,0.37825,4.813187
75%,3.579137,0.511206,6.505312
max,37.226545,2.393532,30.445681


### Part 6: Data Filtering for Quality and Reliability

In [12]:
df_filtered = df_roi.copy()

# Record the initial number of rows to observe the effect of our filters.
initial_rows = len(df_filtered)
print(f"Starting with {initial_rows} rows before filtering.")

# Filter 1: Remove rows where core metrics are NaN.
# We keep only the rows where ROI was successfully calculated.
rows_before_filter1 = len(df_filtered)
df_filtered = df_filtered.dropna(subset=['ROI_EARNINGS_TO_DEBT'])
rows_after_filter1 = len(df_filtered)
print(f"Step 1: Removed {rows_before_filter1 - rows_after_filter1} rows with missing ROI values. {rows_after_filter1} rows remaining.")

# Filter 2: Remove programs with too few graduates (small sample size, unreliable results).
# We require a program to have at least 10 graduates.
rows_before_filter2 = len(df_filtered)
df_filtered = df_filtered[df_filtered['IPEDSCOUNT2'] >= 10]
rows_after_filter2 = len(df_filtered)
print(f"Step 2: Removed {rows_before_filter2 - rows_after_filter2} rows for small program size (< 10 graduates). {rows_after_filter2} rows remaining.")

# Filter 3: Remove extreme values for earnings and debt (unreasonable data).
# Here, we assume annual earnings below $10k or above $500k are outliers.
rows_before_filter3 = len(df_filtered)
df_filtered = df_filtered[df_filtered['EARN_MDN_5YR'].between(10000, 500000)]
rows_after_filter3 = len(df_filtered)
print(f"Step 3: Removed {rows_before_filter3 - rows_after_filter3} rows for unreasonable earnings. {rows_after_filter3} rows remaining.")

# Print the final number of rows remaining and the proportion.
final_rows = len(df_filtered)
print(f"\nFiltering complete. We have {final_rows} rows remaining for analysis.")
print(f"This is { (final_rows / initial_rows * 100):.2f}% of the original dataset.")

# Finally, let's review the statistics of our cleaned data.
print("\nStatistics for new ROI metrics AFTER filtering:")
display(df_filtered[['ROI_EARNINGS_TO_DEBT', 'PAYBACK_YEARS', 'MONTHLY_PAYMENT_PCT']].describe())

Starting with 229188 rows before filtering.
Step 1: Removed 198396 rows with missing ROI values. 30792 rows remaining.
Step 2: Removed 3885 rows for small program size (< 10 graduates). 26907 rows remaining.
Step 3: Removed 2 rows for unreasonable earnings. 26905 rows remaining.

Filtering complete. We have 26905 rows remaining for analysis.
This is 11.74% of the original dataset.

Statistics for new ROI metrics AFTER filtering:


Unnamed: 0,ROI_EARNINGS_TO_DEBT,PAYBACK_YEARS,MONTHLY_PAYMENT_PCT
count,26905.0,26905.0,26905.0
mean,3.062924,0.400138,5.090495
std,1.62324,0.17859,2.271892
min,0.57302,0.031024,0.392595
25%,2.012235,0.274123,3.487982
50%,2.702121,0.37008,4.709687
75%,3.648,0.49696,6.320388
max,32.233091,1.745141,22.173556


### Part 7: Advanced Feature Engineering - Create Categorical Features

In [19]:
df_final = df_filtered.copy()

# 1. Create a more readable credential level name.
credential_map = {
    1: 'Undergraduate Certificate',
    2: 'Associate Degree',
    3: 'Bachelor Degree',
    4: 'Post-baccalaureate Certificate',
    5: 'Master Degree',
    6: 'Doctoral Degree',
    7: 'First Professional Degree',
    8: 'Graduate Certificate'
}
df_final['CREDENTIAL_LEVEL_NAME'] = df_final['CREDLEV'].map(credential_map)

# 2. Create broad major fields from CIPCODE.
# We extract the first two digits of the CIPCODE to identify the general field of study.
df_final['CIP_2DIGIT'] = df_final['CIPCODE'].astype(str).str[:2]

major_map = {
    '11': 'Computer Science',
    '14': 'Engineering',
    '15': 'Engineering Technology',
    '26': 'Biological Sciences',
    '27': 'Mathematics',
    '40': 'Physical Sciences',
    '52': 'Business',
    '51': 'Health Professions',
    '42': 'Psychology',
    '45': 'Social Sciences',
    '23': 'English Language',
    '24': 'Liberal Arts & Humanities',
    '50': 'Visual & Performing Arts',
    '13': 'Education',
}
df_final['MAJOR_FIELD'] = df_final['CIP_2DIGIT'].map(major_map).fillna('Other')

# 3. Bin the continuous ROI values into categorical labels.
# pd.cut helps segment numbers into bins.
roi_bins = [0, 1, 1.5, 2.5, 4, float('inf')]
roi_labels = ['Poor (<1)', 'Low (1-1.5)', 'Average (1.5-2.5)', 'Good (2.5-4)', 'Excellent (>4)']
df_final['ROI_CATEGORY'] = pd.cut(df_final['ROI_EARNINGS_TO_DEBT'], bins=roi_bins, labels=roi_labels, right=False)

# 4. Bin the monthly payment percentage into categorical labels.
afford_bins = [0, 8, 12, 20, float('inf')]
afford_labels = ['Very Affordable (<8%)', 'Affordable (8-12%)', 'Moderate (12-20%)', 'Expensive (>20%)']
df_final['AFFORDABILITY'] = pd.cut(df_final['MONTHLY_PAYMENT_PCT'], bins=afford_bins, labels=afford_labels, right=False)


# Preview the newly created columns.
print("\nData preview with new categorical features:")
display(df_final.head())
display(df_final[['INSTNM', 'MAJOR_FIELD', 'CIPDESC', 'CREDENTIAL_LEVEL_NAME', 'ROI_EARNINGS_TO_DEBT', 'ROI_CATEGORY', 'AFFORDABILITY']].head())

# Check the distribution of the new categories.
print("\nDistribution of ROI Categories:")
print(df_final['ROI_CATEGORY'].value_counts(normalize=True).sort_index())


Data preview with new categorical features:


Unnamed: 0,EARN_MDN_5YR,DEBT_ALL_STGP_ANY_MDN,DEBT_ALL_STGP_ANY_MDN10YRPAY,INSTNM,CIPCODE,CIPDESC,CREDLEV,CREDDESC,CONTROL,IPEDSCOUNT2,ROI_EARNINGS_TO_DEBT,PAYBACK_YEARS,MONTHLY_PAYMENT_PCT,CREDENTIAL_LEVEL_NAME,CIP_2DIGIT,MAJOR_FIELD,ROI_CATEGORY,AFFORDABILITY
16,85218.0,31000.0,329.0,Alabama A & M University,1101,"Computer and Information Sciences, General.",3,Bachelor's Degree,Public,37.0,2.748968,0.363773,4.632824,Bachelor Degree,11,Computer Science,Good (2.5-4),Very Affordable (<8%)
29,90409.0,35000.0,371.0,Alabama A & M University,1410,"Electrical, Electronics and Communications Eng...",3,Bachelor's Degree,Public,20.0,2.583114,0.38713,4.924289,Bachelor Degree,14,Engineering,Good (2.5-4),Very Affordable (<8%)
30,82929.0,30500.0,323.0,Alabama A & M University,1419,Mechanical Engineering.,3,Bachelor's Degree,Public,41.0,2.718984,0.367784,4.673878,Bachelor Degree,14,Engineering,Good (2.5-4),Very Affordable (<8%)
39,46627.0,33000.0,350.0,Alabama A & M University,2401,"Liberal Arts and Sciences, General Studies and...",3,Bachelor's Degree,Public,13.0,1.412939,0.707744,9.007657,Bachelor Degree,24,Liberal Arts & Humanities,Low (1-1.5),Affordable (8-12%)
40,40721.0,28271.0,300.0,Alabama A & M University,2601,"Biology, General.",3,Bachelor's Degree,Public,95.0,1.440381,0.694261,8.840647,Bachelor Degree,26,Biological Sciences,Low (1-1.5),Affordable (8-12%)


Unnamed: 0,INSTNM,MAJOR_FIELD,CIPDESC,CREDENTIAL_LEVEL_NAME,ROI_EARNINGS_TO_DEBT,ROI_CATEGORY,AFFORDABILITY
16,Alabama A & M University,Computer Science,"Computer and Information Sciences, General.",Bachelor Degree,2.748968,Good (2.5-4),Very Affordable (<8%)
29,Alabama A & M University,Engineering,"Electrical, Electronics and Communications Eng...",Bachelor Degree,2.583114,Good (2.5-4),Very Affordable (<8%)
30,Alabama A & M University,Engineering,Mechanical Engineering.,Bachelor Degree,2.718984,Good (2.5-4),Very Affordable (<8%)
39,Alabama A & M University,Liberal Arts & Humanities,"Liberal Arts and Sciences, General Studies and...",Bachelor Degree,1.412939,Low (1-1.5),Affordable (8-12%)
40,Alabama A & M University,Biological Sciences,"Biology, General.",Bachelor Degree,1.440381,Low (1-1.5),Affordable (8-12%)



Distribution of ROI Categories:
ROI_CATEGORY
Poor (<1)            0.009664
Low (1-1.5)          0.062851
Average (1.5-2.5)    0.359710
Good (2.5-4)         0.378331
Excellent (>4)       0.189444
Name: proportion, dtype: float64


In [20]:
# Verify Major Field Distribution

# Use .value_counts() to count the occurrences of each value in the 'MAJOR_FIELD' column.
print("Count of programs in each MAJOR_FIELD category:")
major_counts = df_final['MAJOR_FIELD'].value_counts()
display(major_counts)

# Use the normalize=True parameter to see the percentage of each category.
print("\nPercentage of programs in each MAJOR_FIELD category:")
major_percentages = df_final['MAJOR_FIELD'].value_counts(normalize=True) * 100
display(major_percentages)

Count of programs in each MAJOR_FIELD category:


MAJOR_FIELD
Other                        6724
Health Professions           5397
Business                     3788
Social Sciences              1562
Engineering                  1319
Visual & Performing Arts     1272
Biological Sciences          1142
Liberal Arts & Humanities    1118
Education                    1087
Psychology                    983
Computer Science              962
English Language              536
Engineering Technology        395
Physical Sciences             371
Mathematics                   249
Name: count, dtype: int64


Percentage of programs in each MAJOR_FIELD category:


MAJOR_FIELD
Other                        24.991637
Health Professions           20.059469
Business                     14.079167
Social Sciences               5.805612
Engineering                   4.902434
Visual & Performing Arts      4.727746
Biological Sciences           4.244564
Liberal Arts & Humanities     4.155361
Education                     4.040141
Psychology                    3.653596
Computer Science              3.575544
English Language              1.992195
Engineering Technology        1.468129
Physical Sciences             1.378926
Mathematics                   0.925479
Name: proportion, dtype: float64

### Part 8: Final Validation & Export

In [21]:
# 1. Perform a final check.
print("Final DataFrame shape:", df_final.shape)
print("\nFinal DataFrame columns:")
print(df_final.columns.tolist())
print("\nFinal DataFrame info:")
df_final.info()

Final DataFrame shape: (26905, 18)

Final DataFrame columns:
['EARN_MDN_5YR', 'DEBT_ALL_STGP_ANY_MDN', 'DEBT_ALL_STGP_ANY_MDN10YRPAY', 'INSTNM', 'CIPCODE', 'CIPDESC', 'CREDLEV', 'CREDDESC', 'CONTROL', 'IPEDSCOUNT2', 'ROI_EARNINGS_TO_DEBT', 'PAYBACK_YEARS', 'MONTHLY_PAYMENT_PCT', 'CREDENTIAL_LEVEL_NAME', 'CIP_2DIGIT', 'MAJOR_FIELD', 'ROI_CATEGORY', 'AFFORDABILITY']

Final DataFrame info:
<class 'pandas.core.frame.DataFrame'>
Index: 26905 entries, 16 to 219073
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   EARN_MDN_5YR                  26905 non-null  float64 
 1   DEBT_ALL_STGP_ANY_MDN         26905 non-null  float64 
 2   DEBT_ALL_STGP_ANY_MDN10YRPAY  26905 non-null  float64 
 3   INSTNM                        26905 non-null  object  
 4   CIPCODE                       26905 non-null  int64   
 5   CIPDESC                       26905 non-null  object  
 6   CREDLEV            

In [22]:
# 2. Define the output path.
# We will save the processed data in the data/processed directory.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True) # Ensure the directory exists.
output_path = processed_dir / 'field_of_study_processed.csv'

# 3. Export the final DataFrame to a CSV file.
df_final.to_csv(output_path, index=False)

print(f"\nSuccessfully exported the processed data to: {output_path}")
print("This file is now ready for analysis and visualization.")


Successfully exported the processed data to: ..\data\processed\field_of_study_processed.csv
This file is now ready for analysis and visualization.
