In [19]:
# Imports necessary for the project
import pandas as pd
import numpy as np 
import os 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

Data Collection and Cleaning

In [20]:
# Loading the downloaded data
df = pd.read_csv("/Users/shivasaivummaji/Desktop/CS:DS/Semesters/Spring 2025/Labor Economics/Project/cps_00006.csv.gz")
df.head()

Unnamed: 0,YEAR,SERIAL,MONTH,HWTFINL,CPSID,ASECFLAG,ASECWTH,PERNUM,WTFINL,CPSIDP,...,SEX,RACE,EMPSTAT,UHRSWORKT,EDUC,EARNWT,INCWAGE,HOURWAGE,PAIDHOUR,EARNWEEK
0,2020,1,3,,20190302844900,1.0,1560.3756,1,,20190302844901,...,2,100,10,40,73,0.0,52500.0,999.99,0,9999.99
1,2020,1,3,,20190302844900,1.0,1560.3756,2,,20190302844902,...,1,100,36,999,73,0.0,56000.0,999.99,0,9999.99
2,2020,2,3,,20181202843500,1.0,986.5948,1,,20181202843501,...,1,100,10,48,73,4871.85,34000.0,999.99,1,654.0
3,2020,2,3,,20181202843500,1.0,986.5948,2,,20181202843502,...,2,100,36,999,50,3843.59,0.0,999.99,0,9999.99
4,2020,3,3,,20190202831700,1.0,1519.0704,1,,20190202831701,...,2,100,10,44,73,0.0,40000.0,999.99,0,9999.99


In [21]:
# Keep workers who are in the working-age between 18 to 65.
workers = df[df["AGE"].between(18, 65)]

# Keep only employed individuals
workers = workers.query("EMPSTAT == 10")

# Replace default values with np.nan
workers.loc[:, 'HOURWAGE'] = workers['HOURWAGE'].replace(999.99, np.nan)
workers.loc[:, 'UHRSWORKT'] = workers['UHRSWORKT'].replace([997, 998, 999], np.nan)
workers.loc[:, 'INCWAGE'] = workers['INCWAGE'].replace(9999999, np.nan)

# Construct hourly wages for the workers
workers["hourly_wage"] = np.where(
    (workers["PAIDHOUR"] == 1) & (~workers["HOURWAGE"].isna()),
    workers["HOURWAGE"],
    workers["INCWAGE"] / (workers["UHRSWORKT"] * 52)
)

# Drop invalid/NaN hourly wages
workers = workers.dropna(subset=["hourly_wage"])

# Keep only the columns mentioned in my initial document:
keep_cols = [
    'SEX', 'RACE', 'AGE', 'EDUC', 'UHRSWORKT', 'hourly_wage', 'INCWAGE'
]
workers = workers[keep_cols]

workers


  workers.loc[:, 'UHRSWORKT'] = workers['UHRSWORKT'].replace([997, 998, 999], np.nan)


Unnamed: 0,SEX,RACE,AGE,EDUC,UHRSWORKT,hourly_wage,INCWAGE
0,2,100,63,73,40.0,25.240385,52500.0
2,1,100,64,73,48.0,13.621795,34000.0
4,2,100,54,73,44.0,17.482517,40000.0
6,1,100,37,73,21.0,7.714286,8424.0
14,2,100,35,91,40.0,20.192308,42000.0
...,...,...,...,...,...,...,...
4756385,2,200,36,111,40.0,12.019231,25000.0
4756386,1,200,27,73,40.0,14.423077,30000.0
4756388,2,651,35,81,8.0,60.096154,25000.0
4756392,1,100,46,92,40.0,38.461538,80000.0


In [22]:
# Converting categorical variables (SEX, RACE, EDUC) to labels

map_sex = {1: 'Male', 2: 'Female'}
workers['SEX'] = workers['SEX'].map(map_sex)

map_race = {
    100: 'White',
    200: 'Black',
    300: 'American Indian',
    651: 'Asian',
    652: 'Hawaiian/Pacific Islander',
    801: 'Other',
    802: 'Two or more races'
}
workers['RACE'] = workers['RACE'].map(map_race)

map_edu = {
    31: 'Less than High School',
    32: 'High School Graduate',
    40: 'Some College, No Degree',
    41: 'Associate Degree',
    50: 'Bachelor’s Degree',
    60: 'Master’s Degree',
    70: 'Professional Degree',
    71: 'Doctorate Degree',
    73: 'Bachelor’s Degree',
    81: 'Master’s Degree',
    91: 'Some College, No Degree',
    92: 'Associate Degree',
    111: 'Less than High School'
}
workers['EDUC'] = workers['EDUC'].map(map_edu)

workers

Unnamed: 0,SEX,RACE,AGE,EDUC,UHRSWORKT,hourly_wage,INCWAGE
0,Female,White,63,Bachelor’s Degree,40.0,25.240385,52500.0
2,Male,White,64,Bachelor’s Degree,48.0,13.621795,34000.0
4,Female,White,54,Bachelor’s Degree,44.0,17.482517,40000.0
6,Male,White,37,Bachelor’s Degree,21.0,7.714286,8424.0
14,Female,White,35,"Some College, No Degree",40.0,20.192308,42000.0
...,...,...,...,...,...,...,...
4756385,Female,Black,36,Less than High School,40.0,12.019231,25000.0
4756386,Male,Black,27,Bachelor’s Degree,40.0,14.423077,30000.0
4756388,Female,Asian,35,Master’s Degree,8.0,60.096154,25000.0
4756392,Male,White,46,Associate Degree,40.0,38.461538,80000.0


In [23]:
workers.to_csv("cleaned_data.csv", index=False)

Exploratory Analysis