In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = pd.read_csv("salaries_final.csv")

In [3]:
data.head()

Unnamed: 0,Year,Name,Primary Job Title,Base Pay,Department,College
0,2010,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
1,2011,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
2,2012,"Abaied, Jamie L.",Assistant Professor,65229.0,Department of Psychological Science,CAS
3,2013,"Abaied, Jamie L.",Assistant Professor,66969.0,Department of Psychological Science,CAS
4,2014,"Abaied, Jamie L.",Assistant Professor,68658.0,Department of Psychological Science,CAS


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14470 entries, 0 to 14469
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               14470 non-null  int64  
 1   Name               14470 non-null  object 
 2   Primary Job Title  14470 non-null  object 
 3   Base Pay           14470 non-null  float64
 4   Department         14470 non-null  object 
 5   College            14470 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 678.4+ KB


### Preprocessing

In [5]:
def preprocess_input(df):
    df = df.copy()
    
    # drop name column
    df = df.drop('Name', axis=1)
    
    # shuffle the data
    df = df.sample(frac=1.0).reset_index(drop=True)
    
    # Split df into x and y
    y = df['Base Pay']
    X = df.drop('Base Pay', axis=1)
    
    return X, y

In [6]:
X, y = preprocess_input(data)

In [7]:
X.head()

Unnamed: 0,Year,Primary Job Title,Department,College
0,2012,Assistant Professor,Department of Orthopaedics & Rehabilitation,COM
1,2014,Senior Lecturer,Department of Romance Languages,CAS
2,2013,Associate Professor,Department of Sociology,CAS
3,2017,Clinical Professor,Department of Communication Sci & Disorders,CNHS
4,2018,Assistant Professor,Department of Med-Gen Internal Med,COM


In [8]:
y

0         30000.0
1         52515.0
2         79515.0
3         87680.0
4         52300.0
           ...   
14465     24000.0
14466     30000.0
14467     12159.0
14468    125942.0
14469     89283.0
Name: Base Pay, Length: 14470, dtype: float64

### Building Pipeline

In [9]:
pd.get_dummies(X['College'])

Unnamed: 0,Business,CALS,CAS,CEMS,CESS,CNHS,COM,Department of Ext,LCOMEO,Learning and Info Tech,Library,RSENR
0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14465,0,0,0,0,0,0,1,0,0,0,0,0
14466,0,0,0,0,0,0,1,0,0,0,0,0
14467,0,0,0,0,1,0,0,0,0,0,0,0
14468,0,0,1,0,0,0,0,0,0,0,0,0


In [10]:
def build_pipeline(regressor):
    
    nominal_transformer = Pipeline(steps=[
        ("onehot", OneHotEncoder(sparse=False, handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(transformers=[
                                    ('nominal', nominal_transformer, ['Primary Job Title', 'Department', 'College'])
                                     ], remainder='passthrough')
   
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('regressor', regressor)
    ])
    
    return model

In [11]:
#model = build_pipeline(Ridge(alpha=10.0))
#model.fit(X, y)

In [12]:
#model.score(X, y)

In [21]:
model = {
    "Linear Regression (Ridge)": build_pipeline(Ridge()),
    "            Decision Tree": build_pipeline(DecisionTreeRegressor()),
    "           Neural Network": build_pipeline(MLPRegressor()),
    "            Random Forest": build_pipeline(RandomForestRegressor()),
    "        Gradient Boosting": build_pipeline(GradientBoostingRegressor())
}

### Model Selection (K-Fold CV)

In [22]:
def evalution_model(model, X, y):
    
    kf = KFold(n_splits=5)
    rmses = []
    r2s = []
    
    for train_idx, test_idx in kf.split(X):
        # fit model
        model.fit(X.iloc[train_idx, :], y.iloc[train_idx])
        
        # Make prediction
        pred = model.predict(X.iloc[test_idx, :])
        
        # calculate RMSE
        rmse = np.sqrt(np.mean((y.iloc[test_idx] - pred)**2))
        rmses.append(rmse)
        
        # calculate r*2
        r2 = 1 - (np.sum(y.iloc[test_idx] - pred)**2) / np.sum((y.iloc[test_idx] - y.iloc[test_idx].mean()**2))
        r2s.append(r2)
        
    # return avg RMSE and R2
    return np.mean(rmses), np.mean(r2s)

In [15]:
for name, model in model.items():
    print(name + " R^2: {:.5f}".format(evalution_model(model, X, y)[1]))

Linear Regression (Ridge) R^2: 1.18505
            Decision Tree R^2: 1.20289
           Neural Network R^2: 6.25315
            Random Forest R^2: 1.23351
        Gradient Boosting R^2: 1.19349


In [23]:
for name, model in model.items():
    print(name + " RMSE: {:.2f}".format(evalution_model(model, X, y)[0]))

Linear Regression (Ridge) RMSE: 28484.53
            Decision Tree RMSE: 30011.44
           Neural Network RMSE: 30917.86
            Random Forest RMSE: 28809.47
        Gradient Boosting RMSE: 31589.53
