<a href="https://colab.research.google.com/github/PyBeginner1/UniversitySalaryPrediction/blob/main/UniversitySalaryprediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# Load the salary table from CSV.
# NOTE(review): the path is absolute and hard-coded (presumably the Colab
# working directory) — the file must be present there before this cell runs.
data = pd.read_csv('/content/salaries_final.csv')

In [4]:
# Display the loaded frame; pandas elides the middle rows of a long table.
data

Unnamed: 0,Year,Name,Primary Job Title,Base Pay,Department,College
0,2010,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
1,2011,"Abaied, Jamie L.",Assistant Professor,64000.0,Department of Psychological Science,CAS
2,2012,"Abaied, Jamie L.",Assistant Professor,65229.0,Department of Psychological Science,CAS
3,2013,"Abaied, Jamie L.",Assistant Professor,66969.0,Department of Psychological Science,CAS
4,2014,"Abaied, Jamie L.",Assistant Professor,68658.0,Department of Psychological Science,CAS
...,...,...,...,...,...,...
14465,2016,"van der Vliet, Albert",Professor,163635.0,Department of Pathology&Laboratory Medicine,COM
14466,2017,"van der Vliet, Albert",Professor,175294.0,Department of Pathology&Laboratory Medicine,COM
14467,2018,"van der Vliet, Albert",Professor,191000.0,Department of Pathology&Laboratory Medicine,COM
14468,2019,"van der Vliet, Albert",Professor,196000.0,Department of Pathology&Laboratory Medicine,COM


In [5]:
# Count the missing values in each column.
data.isna().sum()

Year                 0
Name                 0
Primary Job Title    0
Base Pay             0
Department           0
College              0
dtype: int64

In [6]:
# Print column dtypes, non-null counts and the frame's memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14470 entries, 0 to 14469
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               14470 non-null  int64  
 1   Name               14470 non-null  object 
 2   Primary Job Title  14470 non-null  object 
 3   Base Pay           14470 non-null  float64
 4   Department         14470 non-null  object 
 5   College            14470 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 678.4+ KB


In [7]:
# Summary statistics of the numeric columns.
data.describe()

Unnamed: 0,Year,Base Pay
count,14470.0,14470.0
mean,2015.382308,70986.426381
std,3.37369,47287.46437
min,2009.0,1707.0
25%,2013.0,35000.0
50%,2016.0,63550.0
75%,2018.0,90529.25
max,2020.0,446429.0


In [37]:
def preprocess_inputs(df, random_state=42):
    """Shuffle the salary table and split it into features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the 'Name' and 'Base Pay' columns.
        It is not modified.
    random_state : int, numpy.random.RandomState or None, default 42
        Seed forwarded to ``DataFrame.sample`` so that the row order (and
        therefore any unshuffled K-fold split built on top of it) is the
        same on every run. Pass ``None`` to get a different shuffle on
        each call.

    Returns
    -------
    X : pandas.DataFrame
        Every column of ``df`` except 'Name' and 'Base Pay', rows shuffled,
        index reset to 0..n-1.
    Y : pandas.Series
        The 'Base Pay' column, row-aligned with ``X``.
    """
    # `drop` returns a new frame, so the caller's DataFrame is left untouched.
    features = df.drop(columns='Name')

    # Shuffle: the raw file lists consecutive years of the same person next
    # to each other, so the rows must be mixed before contiguous splitting.
    features = (
        features
        .sample(frac=1.0, random_state=random_state)
        .reset_index(drop=True)
    )

    # Split X & Y
    X = features.drop(columns='Base Pay')
    Y = features['Base Pay']

    return X, Y

In [38]:
# Shuffle the rows and separate the feature frame X from the 'Base Pay' target Y.
X, Y = preprocess_inputs(data)

In [39]:
# Display the shuffled feature frame.
X

Unnamed: 0,Year,Primary Job Title,Department,College
0,2020,Lecturer I,Department of Nutrition & Food Sciences,CALS
1,2010,Assistant Professor,Department of Biomedical and Health Sci,CNHS
2,2019,Associate Professor,Department of Anesthesiology,COM
3,2018,Assistant Professor,Department of Pediatrics,COM
4,2009,Professor,Department of Pharmacology,COM
...,...,...,...,...
14465,2011,Assistant Professor,Department of Anesthesiology,COM
14466,2012,Assistant Professor,Department of English,CAS
14467,2020,Assistant Professor,Department of Surg-Emergency Med,COM
14468,2020,Assistant Professor,Department of Surg-Emergency Med,COM


In [40]:
# Display the target series (row-aligned with X).
Y

0          6243.0
1         66641.0
2         24000.0
3         31624.0
4        113648.0
           ...   
14465     30000.0
14466     55259.0
14467     35000.0
14468     35000.0
14469     11836.0
Name: Base Pay, Length: 14470, dtype: float64

In [62]:
def _dense_one_hot_encoder():
    """Return a dense-output OneHotEncoder on old and new scikit-learn alike."""
    try:
        # scikit-learn >= 1.2 names the flag `sparse_output`; the old `sparse`
        # keyword was deprecated there and later removed.
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older releases only accept the original `sparse` keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


def build_pipeline(regressor):
    """Wrap ``regressor`` in the notebook's shared preprocessing pipeline.

    Steps, in order:

    1. 'preprocessor' - one-hot encodes 'Primary Job Title', 'Department'
       and 'College'; every other column is passed through unchanged.
       Categories not seen during ``fit`` are ignored (encoded as all
       zeros) instead of raising at predict time.
    2. 'scaler' - standardises every resulting column.
    3. 'regressor' - the estimator supplied by the caller.

    Parameters
    ----------
    regressor : estimator
        Unfitted scikit-learn regressor used as the final step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    # Dense output is required: the StandardScaler below centres the data,
    # which scikit-learn does not support on sparse matrices.
    nominal_transformer = Pipeline(steps=[
        ('onehot', _dense_one_hot_encoder())
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('nominal', nominal_transformer, ['Primary Job Title', 'Department', 'College'])
    ], remainder='passthrough')

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('regressor', regressor)
    ])

    return model

In [63]:
# Build one example pipeline and show its repr; the result is not stored.
build_pipeline(Ridge(alpha=10.0))

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('nominal',
                                                  Pipeline(memory=None,
                                                           steps=[('onehot',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=False))],
                                    

In [64]:
# Seed shared by every estimator with internal randomness, so that repeated
# runs (and the separate RMSE / R^2 evaluations) score the same fitted models.
RANDOM_STATE = 42

# Candidate regressors, each wrapped in the shared preprocessing pipeline.
# Ridge takes no seed here: its default solver is deterministic.
models = {
    'Linear Model' : build_pipeline(Ridge()),
    'Decision Tree' : build_pipeline(DecisionTreeRegressor(random_state=RANDOM_STATE)),
    'Neural Network' : build_pipeline(MLPRegressor(random_state=RANDOM_STATE)),
    'Random forest' : build_pipeline(RandomForestRegressor(random_state=RANDOM_STATE)),
    'Gradient Boosting' : build_pipeline(GradientBoostingRegressor(random_state=RANDOM_STATE))
}

In [65]:
def evaluate_model(model, X, Y, n_splits=5):
    """Cross-validate ``model`` and return its mean RMSE and mean R^2.

    The rows are split into ``n_splits`` contiguous folds (``KFold`` is used
    without shuffling, so ``X`` / ``Y`` should already be in random order).
    For each fold the model is refitted on the remaining rows and scored on
    the held-out rows.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place and is left fitted on the last fold's training rows.
    X : pandas.DataFrame
        Feature rows (indexed positionally via ``.iloc``).
    Y : pandas.Series
        Target values, row-aligned with ``X``.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple of float
        ``(mean RMSE, mean R^2)`` averaged over the folds.
    """
    fold_rmses = []
    fold_r2s = []

    for train_idx, test_idx in KFold(n_splits=n_splits).split(X):
        X_train, Y_train = X.iloc[train_idx, :], Y.iloc[train_idx]
        X_test, Y_test = X.iloc[test_idx, :], Y.iloc[test_idx]

        # Fit on the training folds, predict the held-out fold.
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)

        # Squared residuals feed both metrics, so compute them once.
        squared_errors = (Y_test - pred) ** 2

        # RMSE
        fold_rmses.append(np.sqrt(np.mean(squared_errors)))

        # R^2 = 1 - SS_res / SS_tot, with SS_tot taken around the fold mean.
        total_ss = np.sum((Y_test - Y_test.mean()) ** 2)
        fold_r2s.append(1 - (np.sum(squared_errors) / total_ss))

    return np.mean(fold_rmses), np.mean(fold_r2s)

In [73]:
# Report the cross-validated RMSE of every candidate model.
for label, pipeline in models.items():
    mean_rmse = evaluate_model(pipeline, X, Y)[0]
    print(label + " RMSE: {:.2f}".format(mean_rmse))

Linear Model RMSE: 28376.47
Decision Tree RMSE: 30322.42
Neural Network RMSE: 30859.09
Random forest RMSE: 28968.34
Gradient Boosting RMSE: 31498.30


In [75]:
# Report the cross-validated R^2 of every candidate model.
for label, pipeline in models.items():
    mean_r2 = evaluate_model(pipeline, X, Y)[1]
    print(label + " R^2: {:.5f}".format(mean_r2))

Linear Model R^2: 0.63972
Decision Tree R^2: 0.58746
Neural Network R^2: 0.57401
Random forest R^2: 0.62470
Gradient Boosting R^2: 0.55605
