<a href="https://colab.research.google.com/github/ShivaniGawande/SalaryEstimator/blob/main/SalaryPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [256]:
#install optuna - framework for hyperparameter optimization
% pip install optuna



In [257]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from math import sqrt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from xgboost import XGBRegressor
import optuna

import warnings
warnings.filterwarnings('ignore')

In [258]:
# Load the salary dataset directly from the project's GitHub repository (network access required).
df = pd.read_csv('https://raw.githubusercontent.com/ShivaniGawande/SalaryEstimator/main/data.csv')
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,...,tensor,hadoop,tableau,bi,flink,mongo,google_an,job_title_sim,seniority_by_title,Degree
0,0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 - 1000,1973,...,0,0,1,1,0,0,0,data scientist,na,M
1,1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+,1984,...,0,0,0,0,0,0,0,data scientist,na,M
2,2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 - 1000,2010,...,0,0,0,0,0,0,0,data scientist,na,M
3,3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 - 5000,1965,...,0,0,0,0,0,0,0,data scientist,na,na
4,4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 - 200,1998,...,0,0,0,0,0,0,0,data scientist,na,na


In [259]:
def approach1_XGBRegressor():
    """Build an unfitted grid search around a default XGBoost regressor.

    Returns:
        GridSearchCV: 2-fold search over learning rate, tree depth and number
        of estimators (3 x 3 x 3 = 27 candidates), run with 5 parallel jobs.
    """
    search_space = {
        'learning_rate': [.01, 0.1, .03],
        'max_depth': [5, 7, 9],
        'n_estimators': [100, 200, 300],
    }
    return GridSearchCV(XGBRegressor(), search_space, cv=2, n_jobs=5, verbose=True)

In [260]:
def approach2_SVRRegressor():
    """Build an unfitted grid search around a support-vector regressor.

    Returns:
        GridSearchCV: 2-fold search over C and gamma with an RBF kernel
        (4 x 4 = 16 candidates), run with 5 parallel jobs.
    """
    search_space = {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf'],
    }
    return GridSearchCV(SVR(), search_space, cv=2, n_jobs=5, verbose=True)


In [261]:

def approach3_RandomForestRegressor():
    """Build an unfitted grid search around a random-forest regressor.

    Returns:
        GridSearchCV: 2-fold search over tree depth, leaf size, split size
        and number of trees (3 x 3 x 3 x 2 = 54 candidates), run with
        5 parallel jobs.
    """
    search_space = {
        'max_depth': [40, 50, 60],
        'min_samples_leaf': [1, 2, 3],
        'min_samples_split': [4, 5, 6],
        'n_estimators': [150, 200],
    }
    return GridSearchCV(RandomForestRegressor(), search_space, cv=2, n_jobs=5, verbose=True)

In [262]:
def feature_importance(model, x):
    """Tabulate a fitted model's feature importances, most important first.

    The original implementation built the table but never returned it, and
    its trailing ``reindex`` result was discarded, so the call had no
    observable effect. The table is now returned.

    Args:
        model: A fitted estimator exposing ``feature_importances_``, or a
            fitted search wrapper (e.g. ``GridSearchCV``) whose
            ``best_estimator_`` exposes it.
        x: DataFrame of the features the model was trained on; its column
            order must match the order of ``feature_importances_``.

    Returns:
        pandas.DataFrame: columns ``importance`` and ``features``, sorted by
        descending importance with a fresh 0..n-1 index.

    Raises:
        AttributeError: if neither the model nor its ``best_estimator_``
            provides ``feature_importances_``.
        ValueError: if the number of importances differs from the number of
            columns in ``x``.
    """
    # Search wrappers keep the refitted winner on best_estimator_; unwrap it if present.
    estimator = getattr(model, "best_estimator_", model)
    table = pd.DataFrame({
        'importance': estimator.feature_importances_,
        'features': list(x.columns),
    })
    # Stable sort keeps the original column order among ties.
    ranked = table.sort_values('importance', ascending=False, kind='stable')
    return ranked.reset_index(drop=True)

def model_accuracy(model, xtrain, xtest, ytrain, ytest):
    """Print and return a cross-validation score and the hold-out RMSE.

    Args:
        model: An already fitted estimator (used as-is for the hold-out
            prediction; ``cross_val_score`` works on clones of it).
        xtrain, ytrain: Training features/target used for 5-fold
            cross-validation.
        xtest, ytest: Hold-out features/target used for the RMSE.

    Returns:
        tuple[float, float]: ``(mean cross-validation score, hold-out RMSE)``.
        Earlier versions returned ``None``; callers that ignore the return
        value are unaffected.
    """
    # NOTE: when `model` is a GridSearchCV this re-runs the whole grid search
    # once per fold, which is why the search log appears five more times.
    scores = cross_val_score(model, xtrain, ytrain, cv=5)
    cv_mean = scores.mean()
    print("Mean cross-validation score: %.2f" % cv_mean)

    # Predict once; the original predicted on xtest a second time into an unused variable.
    rmse = sqrt(mean_squared_error(ytest, model.predict(xtest)))
    print("RMSE: %.2f" % rmse)
    return cv_mean, rmse

def train_model(x, y, model, test_size=0.2, random_state=None):
    """Split the data, fit the model on the training part, and return both.

    Args:
        x: Feature matrix.
        y: Target values aligned with ``x``.
        model: Any object with a scikit-learn style ``fit(X, y)`` method; it
            is fitted in place.
        test_size: Fraction of rows held out for testing. Defaults to 0.2,
            the value that was previously hard-coded.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the original unseeded behaviour; pass an integer
            to get the same split on every run, so different models can be
            compared on identical hold-out rows.

    Returns:
        tuple: ``(model, xtrain, xtest, ytrain, ytest)``.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    model.fit(xtrain, ytrain)
    return model, xtrain, xtest, ytrain, ytest

In [263]:
# Target: column 19 of the raw frame (presumably a salary figure - TODO confirm by name).
Y = df.iloc[:, 19]

# Features are picked by position; the categorical names below must be among them.
feature_parts = [
    df.iloc[:, 4],
    df.iloc[:, 23:39],
    df.iloc[:, 21],
    df.iloc[:, 8],
    df.iloc[:, 10],
    df.iloc[:, 11],
    df.iloc[:, 12],
    df.iloc[:, 40:42],
]
X = pd.concat(feature_parts, axis=1)

# Replace each categorical text column with its integer category code.
categorical_columns = (
    "Job Location",
    "Industry",
    "Sector",
    "seniority_by_title",
    "Degree",
    "Size",
    "Type of ownership",
)
for column in categorical_columns:
    X[column] = X[column].astype("category").cat.codes

# Train and evaluate each grid-searched model in turn on its own fresh split.
approaches = (
    ("Approach 1: Using XGB", approach1_XGBRegressor),
    ("Approach 2: Using SVR", approach2_SVRRegressor),
    ("Approach 3: Using Random Forest Regressor", approach3_RandomForestRegressor),
)
for banner, build_search in approaches:
    print(banner)
    model = build_search()
    model, xtrain, xtest, ytrain, ytest = train_model(X, Y, model)
    print("Model Accuracy on Test Dataset")
    model_accuracy(model, xtrain, xtest, ytrain, ytest)

Approach 1: Using XGB
Fitting 2 folds for each of 27 candidates, totalling 54 fits
Model Accuracy on Test Dataset
Fitting 2 folds for each of 27 candidates, totalling 54 fits
Fitting 2 folds for each of 27 candidates, totalling 54 fits
Fitting 2 folds for each of 27 candidates, totalling 54 fits
Fitting 2 folds for each of 27 candidates, totalling 54 fits
Fitting 2 folds for each of 27 candidates, totalling 54 fits
Mean cross-validation score: 0.53
RMSE: 25.29
Approach 2: Using SVR
Fitting 2 folds for each of 16 candidates, totalling 32 fits
Model Accuracy on Test Dataset
Fitting 2 folds for each of 16 candidates, totalling 32 fits
Fitting 2 folds for each of 16 candidates, totalling 32 fits
Fitting 2 folds for each of 16 candidates, totalling 32 fits
Fitting 2 folds for each of 16 candidates, totalling 32 fits
Fitting 2 folds for each of 16 candidates, totalling 32 fits
Mean cross-validation score: 0.38
RMSE: 28.32
Approach 3: Using Random Forest Regressor
Fitting 2 folds for each of 

In [264]:
# Fresh unseeded 80/20 split; objective_XGB reads these module-level names
# (xtrain/ytrain for fitting, xtest/ytest for early stopping and scoring).
xtrain, xtest, ytrain, ytest=train_test_split(X, Y, test_size=0.2)

In [265]:
def objective_XGB(trial):
    """Optuna objective: fit one XGBoost regressor and return its RMSE.

    Reads the module-level ``xtrain``, ``ytrain``, ``xtest`` and ``ytest``.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        float: RMSE of the fitted model on ``(xtest, ytest)``; lower is better.

    NOTE(review): the same ``(xtest, ytest)`` split drives early stopping and
    is the value being minimised, so the reported RMSE is optimistically
    biased. Consider carving a separate validation split out of ``xtrain``.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.25, log=True)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and matches how learning_rate is already sampled.
    reg_lambda = trial.suggest_float("reg_lambda", 1e-9, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-9, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.0001, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.0001, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 50)
    early_stopping_rounds = trial.suggest_int("early_stopping_rounds", 100, 700)
    # Lower bound raised from 0 to 1: an ensemble with zero trees cannot learn anything.
    n_estimators = trial.suggest_int("n_estimators", 1, 10000)

    model = XGBRegressor(
        random_state=50,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
    # constructor rather than fit(); kept here to match the version this
    # notebook was run with - confirm against the installed version.
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=early_stopping_rounds,
        eval_set=[(xtest, ytest)],
        verbose=1200,
    )
    preds_valid = model.predict(xtest)
    # sqrt(MSE) avoids the deprecated `squared=False` flag and matches model_accuracy.
    rmse = sqrt(mean_squared_error(ytest, preds_valid))

    return rmse



In [266]:
# Minimise objective_XGB's RMSE over 10 trials.
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_XGB, n_trials=10)

# Keep the best trial's hyperparameters (used to build the final model) and display them.
optuna_params_xgb = study_xgb.best_params
optuna_params_xgb

[32m[I 2022-05-17 23:33:35,123][0m A new study created in memory with name: no-name-b9f9baeb-16ba-41cd-991c-b0d73531895c[0m


[0]	validation_0-rmse:99.4107
Will train until validation_0-rmse hasn't improved in 570 rounds.
Stopping. Best iteration:
[520]	validation_0-rmse:20.1374



[32m[I 2022-05-17 23:33:38,470][0m Trial 0 finished with value: 20.137381508171906 and parameters: {'learning_rate': 0.04726755128546075, 'reg_lambda': 5.97004126542331e-05, 'reg_alpha': 0.000358721959048697, 'subsample': 0.12360453852921172, 'colsample_bytree': 0.8972085500187995, 'max_depth': 14, 'early_stopping_rounds': 570, 'n_estimators': 4724}. Best is trial 0 with value: 20.137381508171906.[0m


[0]	validation_0-rmse:97.5527
Will train until validation_0-rmse hasn't improved in 652 rounds.
Stopping. Best iteration:
[133]	validation_0-rmse:26.9855



[32m[I 2022-05-17 23:33:40,285][0m Trial 1 finished with value: 26.985543283685082 and parameters: {'learning_rate': 0.06761569834995938, 'reg_lambda': 1.4315731964123033e-09, 'reg_alpha': 2.9973007335183262e-06, 'subsample': 0.042451568134273296, 'colsample_bytree': 0.30657007504403594, 'max_depth': 17, 'early_stopping_rounds': 652, 'n_estimators': 6753}. Best is trial 0 with value: 20.137381508171906.[0m


[0]	validation_0-rmse:103.943
Will train until validation_0-rmse hasn't improved in 431 rounds.
[1200]	validation_0-rmse:72.1714
[2400]	validation_0-rmse:52.2818
[3053]	validation_0-rmse:44.922


[32m[I 2022-05-17 23:33:47,351][0m Trial 2 finished with value: 44.92199951846483 and parameters: {'learning_rate': 0.00033707519057787753, 'reg_lambda': 5.024935220834377e-05, 'reg_alpha': 27.421477541468196, 'subsample': 0.14996735632304672, 'colsample_bytree': 0.6156270452581394, 'max_depth': 27, 'early_stopping_rounds': 431, 'n_estimators': 3054}. Best is trial 0 with value: 20.137381508171906.[0m


[0]	validation_0-rmse:103.954
Will train until validation_0-rmse hasn't improved in 120 rounds.
[863]	validation_0-rmse:88.1936


[32m[I 2022-05-17 23:33:51,437][0m Trial 3 finished with value: 88.19361358433392 and parameters: {'learning_rate': 0.00019991789685528253, 'reg_lambda': 0.0030495477136555705, 'reg_alpha': 1.008108035578421e-06, 'subsample': 0.3534411716204735, 'colsample_bytree': 0.46508753984280243, 'max_depth': 9, 'early_stopping_rounds': 120, 'n_estimators': 864}. Best is trial 0 with value: 20.137381508171906.[0m


[0]	validation_0-rmse:103.764
Will train until validation_0-rmse hasn't improved in 468 rounds.
[1200]	validation_0-rmse:28.1348
[2400]	validation_0-rmse:25.7435
[3600]	validation_0-rmse:25.3214
Stopping. Best iteration:
[3333]	validation_0-rmse:25.2847



[32m[I 2022-05-17 23:34:00,427][0m Trial 4 finished with value: 25.284680132344512 and parameters: {'learning_rate': 0.002086210864471714, 'reg_lambda': 0.37763880389677756, 'reg_alpha': 5.906909246171006e-06, 'subsample': 0.042545641787094086, 'colsample_bytree': 0.8775329473528756, 'max_depth': 39, 'early_stopping_rounds': 468, 'n_estimators': 7734}. Best is trial 0 with value: 20.137381508171906.[0m


[0]	validation_0-rmse:103.389
Will train until validation_0-rmse hasn't improved in 624 rounds.
[1200]	validation_0-rmse:21.3382
[2400]	validation_0-rmse:19.3805
[3555]	validation_0-rmse:18.6051


[32m[I 2022-05-17 23:34:05,256][0m Trial 5 finished with value: 18.605115367520845 and parameters: {'learning_rate': 0.0058880846508205095, 'reg_lambda': 2.3179567508212708e-05, 'reg_alpha': 14.609890147496435, 'subsample': 0.9351439537274974, 'colsample_bytree': 0.17244443311936075, 'max_depth': 26, 'early_stopping_rounds': 624, 'n_estimators': 3556}. Best is trial 5 with value: 18.605115367520845.[0m


[0]	validation_0-rmse:103.923
Will train until validation_0-rmse hasn't improved in 593 rounds.
[1200]	validation_0-rmse:58.5609
[2400]	validation_0-rmse:35.5346
[3600]	validation_0-rmse:24.8738
[4183]	validation_0-rmse:22.2689


[32m[I 2022-05-17 23:34:17,781][0m Trial 6 finished with value: 22.268905397786714 and parameters: {'learning_rate': 0.0005109687909174174, 'reg_lambda': 1.7188517358453746e-07, 'reg_alpha': 0.0008006993316278407, 'subsample': 0.6121662846103842, 'colsample_bytree': 0.4223544535917725, 'max_depth': 20, 'early_stopping_rounds': 593, 'n_estimators': 4184}. Best is trial 5 with value: 18.605115367520845.[0m


[0]	validation_0-rmse:103.655
Will train until validation_0-rmse hasn't improved in 662 rounds.
[1200]	validation_0-rmse:23.22
[2400]	validation_0-rmse:21.2402
[3207]	validation_0-rmse:20.354


[32m[I 2022-05-17 23:34:23,196][0m Trial 7 finished with value: 20.34438230641845 and parameters: {'learning_rate': 0.003075970145486491, 'reg_lambda': 1.5574173305596763e-09, 'reg_alpha': 0.1816868115691729, 'subsample': 0.10573479446428721, 'colsample_bytree': 0.4433817407650103, 'max_depth': 43, 'early_stopping_rounds': 662, 'n_estimators': 3208}. Best is trial 5 with value: 18.605115367520845.[0m


[0]	validation_0-rmse:100.001
Will train until validation_0-rmse hasn't improved in 168 rounds.
Stopping. Best iteration:
[115]	validation_0-rmse:27.8066



[32m[I 2022-05-17 23:34:23,616][0m Trial 8 finished with value: 27.806608215099104 and parameters: {'learning_rate': 0.04503277938667517, 'reg_lambda': 4.4957111519981114e-07, 'reg_alpha': 4.083932360220303e-06, 'subsample': 0.02635609588419648, 'colsample_bytree': 0.6063753260052993, 'max_depth': 25, 'early_stopping_rounds': 168, 'n_estimators': 3213}. Best is trial 5 with value: 18.605115367520845.[0m


[0]	validation_0-rmse:103.538
Will train until validation_0-rmse hasn't improved in 436 rounds.
[1200]	validation_0-rmse:17.2295
Stopping. Best iteration:
[883]	validation_0-rmse:17.1209



[32m[I 2022-05-17 23:34:29,570][0m Trial 9 finished with value: 17.120895787476243 and parameters: {'learning_rate': 0.004151562084116753, 'reg_lambda': 1.5846583488222143e-08, 'reg_alpha': 4.082996757423279e-08, 'subsample': 0.986084344319199, 'colsample_bytree': 0.8740989338944797, 'max_depth': 45, 'early_stopping_rounds': 436, 'n_estimators': 4936}. Best is trial 9 with value: 17.120895787476243.[0m


{'colsample_bytree': 0.8740989338944797,
 'early_stopping_rounds': 436,
 'learning_rate': 0.004151562084116753,
 'max_depth': 45,
 'n_estimators': 4936,
 'reg_alpha': 4.082996757423279e-08,
 'reg_lambda': 1.5846583488222143e-08,
 'subsample': 0.986084344319199}

In [267]:
print("Approach 1: Using XGBoost ")

# Build the final regressor from the tuned hyperparameters.
# early_stopping_rounds is deliberately left out: train_model() and
# cross_val_score() call fit() without an eval_set, so early stopping cannot
# run here. Old xgboost versions silently ignore it as a constructor keyword;
# versions that accept it as an estimator parameter raise at fit time when no
# validation data is supplied.
tuned_param_names = (
    'n_estimators',
    'learning_rate',
    'reg_lambda',
    'reg_alpha',
    'subsample',
    'colsample_bytree',
    'max_depth',
)
model = XGBRegressor(**{name: optuna_params_xgb[name] for name in tuned_param_names})

model, xtrain, xtest, ytrain, ytest = train_model(X, Y, model)
print("Model Accuracy on Test Dataset")
model_accuracy(model, xtrain, xtest, ytrain, ytest)


Approach 1: Using XGBoost 
Model Accuracy on Test Dataset
Mean cross-validation score: 0.53
RMSE: 22.45
