In [918]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [919]:
df = pd.read_csv('salaries.csv')

In [920]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32028 entries, 0 to 32027
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               32028 non-null  object 
 1   Job Titles         32028 non-null  object 
 2   Department         32028 non-null  object 
 3   Full or Part-Time  32028 non-null  object 
 4   Salary or Hourly   32028 non-null  object 
 5   Typical Hours      7153 non-null   float64
 6   Annual Salary      24875 non-null  float64
 7   Hourly Rate        7153 non-null   float64
dtypes: float64(3), object(5)
memory usage: 2.0+ MB


Adding a salary feature which takes the Annual salary for salaried employees and calculated an annual salary for hourly employees.

In [921]:
df['Salary'] = df.apply(lambda row: row['Annual Salary'] if row['Salary or Hourly'] == 'SALARY' else row['Typical Hours'] * row['Hourly Rate'] * 52, axis=1)
df.drop(['Annual Salary', 'Typical Hours', 'Hourly Rate'], axis=1, inplace=True)

Categorising job titles as high skilled (2), medium skilled (1) or low skilled (0) by taking the mean salary for each title and comparing. 

In [922]:
titles = pd.DataFrame(df.groupby('Job Titles')['Salary'].mean()).reset_index()
def skill(row):
    if row['Salary'] <= 83250:
        return 0
    if row['Salary'] > 83250 and row['Salary'] < 127107:
        return 1
    else:
        return 2
titles['Skill'] = titles.apply(skill, axis=1)
titles.drop('Salary', axis=1, inplace=True)


In [923]:
df = pd.merge(df, titles, on='Job Titles', how='left')

Removing unnecessary columns

In [926]:
df.drop(['Name', 'Job Titles'], axis=1, inplace=True)

Encoding Full-time or Part-time employees

In [927]:
df['Full-time'] = df.apply(lambda row: 1 if row['Full or Part-Time'] == 'F' else 0, axis=1)
df.drop('Full or Part-Time', axis=1, inplace=True)

Encoding Salaried or Hourly employees

In [928]:
df['Salaried'] = df.apply(lambda row: 1 if row['Salary or Hourly'] == 'SALARY' else 0, axis=1)
df.drop('Salary or Hourly', axis=1, inplace=True)

One-hot encoding the Department feature

In [929]:
encoder = OneHotEncoder(sparse_output=False)

one_hot = pd.DataFrame(encoder.fit_transform(df[['Department']]), columns=encoder.get_feature_names_out(['Department']))
df = pd.concat([df, one_hot], axis=1)
df.drop(['Department', 'Department_BOARD OF ELECTION COMMISSIONERS'], axis=1, inplace=True)

Correlation of all features with the target variable

In [943]:
df.corr()['Salary'].sort_values()

Department_CHICAGO PUBLIC LIBRARY                                      -0.238119
Department_DEPARTMENT OF FAMILY AND SUPPORT SERVICES                   -0.206442
Department_OFFICE OF EMERGENCY MANAGEMENT AND COMMUNICATIONS           -0.183349
Department_CITY COUNCIL                                                -0.138442
Department_DEPARTMENT OF STREETS AND SANITATION                        -0.117482
Department_CHICAGO DEPARTMENT OF AVIATION                              -0.072709
Department_DEPARTMENT OF FINANCE                                       -0.059745
Department_CHICAGO ANIMAL CARE AND CONTROL                             -0.042310
Department_OFFICE OF CITY CLERK                                        -0.035490
Department_DEPARTMENT OF HOUSING                                       -0.021570
Department_MAYORS OFFICE FOR PEOPLE WITH DISABILITIES                  -0.020068
Department_DEPARTMENT OF ADMINISTRATIVE HEARING                        -0.014283
Department_DEPARTMENT OF BUS

In [931]:
newdf = df.copy()
X = newdf.iloc[:, 1:]
y = newdf.iloc[:, 0:1]

Train-test splitting

In [932]:
train_X, test_X, train_y, test_y = train_test_split(X,y, random_state=1)

Random-Forest model

In [933]:
randomforests = RandomForestRegressor(n_estimators=200, random_state=1)
randomforests.fit(train_X, train_y)
pred_y = randomforests.predict(test_X)
randomforest_accuracy = mean_absolute_percentage_error(test_y, pred_y)

  return fit_method(estimator, *args, **kwargs)


Gradient boosting model

In [934]:
gradboosted = GradientBoostingRegressor(n_estimators=200, random_state=1)
gradboosted.fit(train_X, train_y)
pred_y = gradboosted.predict(test_X)
gradboosted_accuracy = mean_absolute_percentage_error(test_y, pred_y)

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


SGD Regressor

In [935]:
sgd = SGDRegressor(random_state=1, learning_rate='invscaling', early_stopping=True)
sgd.fit(train_X, train_y)
pred_y = sgd.predict(test_X)
sgd_accuracy = mean_absolute_percentage_error(test_y, pred_y)

  y = column_or_1d(y, warn=True)


Artificial Neural Network

In [944]:
mlp = MLPRegressor(random_state=1, hidden_layer_sizes=(256,128,64,32,16,8), early_stopping=True, learning_rate='adaptive')
mlp.fit(train_X, train_y)
pred_y = mlp.predict(test_X)
mlp_accuracy = mean_absolute_percentage_error(test_y, pred_y)

  y = column_or_1d(y, warn=True)


Accuracies for different models 

In [937]:
print(randomforest_accuracy)

0.12144760295151728


In [938]:
print(gradboosted_accuracy)

0.12337476781142429


In [939]:
print(sgd_accuracy)

0.141441426315549


In [945]:
print(mlp_accuracy)

0.1297995063900087
