In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [23]:
# Load the labelled training table (df) and the unlabelled feature table that
# predictions are generated for (test_df).
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("earnings_train.csv", "earnings_test_features.csv")
)

In [25]:
# --- Exploratory data analysis --------------------------------------------------
# Jupyter only auto-renders the value of a cell's *last* expression, so each
# intermediate table is passed to display() explicitly (display is available as a
# built-in inside IPython/Jupyter kernels). df.info() prints its own report.
display(df.head())
df.info()
display(df.describe())

display(test_df.head())

# Split the training columns by dtype: numeric vs. everything else.
num_cols = df.select_dtypes(include="number").columns
cat_cols = df.select_dtypes(exclude="number").columns

print("Numeric columns:", list(num_cols))
print("Categorical columns:", list(cat_cols))

# For every non-numeric column, show its 10 most frequent values and how many
# distinct values it holds.
for col in cat_cols:
    print(f"\nColumn: {col}")
    print(df[col].value_counts().head(10))
    print("Unique count:", df[col].nunique())

# Summary statistics of the numeric columns, one row per column.
display(df[num_cols].describe().T)

os.makedirs("plots", exist_ok=True)

# Save one histogram per numeric column to plots/hist_<column>.png.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    df[col].hist(bins=30, ax=ax)
    ax.set_title(f"Histogram of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    fig.savefig(f"plots/hist_{col}.png")
    # Close explicitly so figures do not accumulate in memory across iterations.
    plt.close(fig)

# Range (min / max) of each yearly wage column.
wage_cols = ["WAGE_YEAR1", "WAGE_YEAR2", "WAGE_YEAR3", "WAGE_YEAR4"]
wage_data = df[wage_cols]

min_vals = wage_data.min()
print("Minimum values per wage year column")
print(min_vals)

max_vals = wage_data.max()
print("Maximum values per wage year column")
print(max_vals)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20705 entries, 0 to 20704
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   DISTRICT_TYPE       20705 non-null  object 
 1   DISTRICT_NAME       20705 non-null  object 
 2   DISTRICT_CODE       17960 non-null  float64
 3   ACADEMIC_YEAR       20705 non-null  object 
 4   DEMO_CATEGORY       20705 non-null  object 
 5   STUDENT_POPULATION  20705 non-null  object 
 6   AWARD_CATEGORY      20705 non-null  object 
 7   WAGE_YEAR1          20705 non-null  float64
 8   WAGE_YEAR2          20705 non-null  float64
 9   WAGE_YEAR3          20705 non-null  float64
 10  WAGE_YEAR4          20705 non-null  float64
dtypes: float64(5), object(6)
memory usage: 1.7+ MB
Numeric columns: ['DISTRICT_CODE', 'WAGE_YEAR1', 'WAGE_YEAR2', 'WAGE_YEAR3', 'WAGE_YEAR4']
Categorical columns: ['DISTRICT_TYPE', 'DISTRICT_NAME', 'ACADEMIC_YEAR', 'DEMO_CATEGORY', 'STUDENT_POPULATION',

In [71]:
# --- Feature / target definition ------------------------------------------------
# Categorical inputs that must be turned into integer codes before scaling.
categorical_cols = ["DISTRICT_TYPE", "DEMO_CATEGORY", "STUDENT_POPULATION", "AWARD_CATEGORY"]

# Model inputs: the categorical columns followed by the three earlier wage years.
features = categorical_cols + ["WAGE_YEAR1", "WAGE_YEAR2", "WAGE_YEAR3"]
# Kept as a one-element list, so y below is a single-column DataFrame.
target = ["WAGE_YEAR4"]

# Work on copies so the encoding below never touches df itself.
X = df[features].copy()
y = df[target].copy()

# --- Encode categoricals ----------------------------------------------------------
# One fitted LabelEncoder per column, stored in le_dict so the same
# label -> code mapping can be re-applied to other data later.
le_dict = {}
for cat_col in categorical_cols:
    encoder = LabelEncoder()
    # Cast everything to str and give empty strings an explicit 'UNKNOWN' label.
    as_text = X[cat_col].astype(str).replace('', 'UNKNOWN')
    X[cat_col] = encoder.fit_transform(as_text)
    le_dict[cat_col] = encoder

# --- Hold-out split ---------------------------------------------------------------
# 10% of the rows are held out for evaluation; the fixed random_state keeps the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

# --- Scale ------------------------------------------------------------------------
# The scaler is fitted on the training rows only and then applied to the
# held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Fit and evaluate -------------------------------------------------------------
# k-nearest-neighbours regressor using the 5 closest training rows.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Score the held-out rows: RMSE is the square root of the mean squared error.
y_pred = knn.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)

print(f"Root Mean Squared Error: {rmse:.2f}")


Root Mean Squared Error: 2331.98


In [70]:
# Build the feature matrix for the unlabelled rows, using the same columns (and
# column order) that the scaler and model were fitted on.
X_test_df = test_df[features].copy()

# Re-apply the encoders fitted on the training data. Values are cast to str and
# empty strings become 'UNKNOWN', mirroring the training-time cleaning. Any label
# the encoder never saw during fitting is mapped to the sentinel code -1.
# The encoding is applied to X_test_df (the rows being predicted), not to the
# X_test hold-out split, which is already encoded.
for col in categorical_cols:
    le = le_dict[col]
    # LabelEncoder.transform returns each label's index in classes_; building that
    # lookup once per column avoids a transform call and a membership scan per row.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    cleaned = X_test_df[col].astype(str).replace('', 'UNKNOWN')
    # Series.map leaves unmapped labels as NaN; those become -1.
    X_test_df[col] = cleaned.map(code_by_label).fillna(-1).astype(int)

# Scale with the already-fitted scaler, predict, and write the predictions to
# preds.csv as a single WAGE_YEAR4 column.
X_test_df_scaled = scaler.transform(X_test_df)
test_df_pred = knn.predict(X_test_df_scaled)
# flatten() yields the 1-D sequence needed for a single DataFrame column.
preds_df = pd.DataFrame({"WAGE_YEAR4": test_df_pred.flatten()})
preds_df.to_csv("preds.csv", index=False)

print("Generated preds.csv")


Generated preds.csv


In [None]:
#Reflection
#1. I found that the biggest predictors of WAGE_YEAR4 were WAGE_YEAR1 through WAGE_YEAR3, as they are very strongly
#correlated with it. Generally, wages show a positive growth trend, meaning that the model can infer the 4th year based on
#the growth in the prior 3 years.

#2. The model suggests that people who hold degrees and come from better socioeconomic backgrounds earn higher wages across
#the board, pointing to how academic and developmental inequity can have long-term effects. The data shows that one's
#academic experience and subsequent job experience are directly linked.

#3. I think a feature that could make the model more accurate would be average household income or school funding in the
#district the data is from. This could further expand the features regarding the link between socioeconomic status and
#performance in the job market. Also, the industry a graduate works in could improve wage prediction and point to how
#differing statuses lead to different job markets.