# Setup

## Imports

In [52]:
from sklearn.cluster import KMeans as KM
from sklearn.ensemble import GradientBoostingRegressor as GBR
from helpers import displayAsDF
from joblib import load
from sklearn.linear_model import Lasso, LinearRegression as LR, Ridge
from sklearn.model_selection import GridSearchCV as GSCV, train_test_split as TTS
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import make_scorer, mean_squared_error as MSE
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor as DTR
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Variables

In [53]:
# Seed reused for the train/test split and for the seeded estimators below.
RANDOM_STATE = 1432
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load a "salaries.pkl" that comes from a trusted source.
# The loaded object is used below as a DataFrame with a "salary_in_usd" column.
salaries = load("salaries.pkl")

## Helpers

In [54]:
# Scorer for sklearn model selection; greater_is_better=False makes sklearn
# negate the MSE so that "higher score" still means "better model".
MSEScorer = make_scorer(MSE, greater_is_better=False)


def RMSE(*args, **kwargs):
  """Root mean squared error: the square root of `MSE(*args, **kwargs)`."""
  squared_error = MSE(*args, **kwargs)
  return np.sqrt(squared_error)

def sfsSupport(model, X, y, nfts):
  """Select `nfts` features by backward sequential selection.

  The selector is scored with `MSEScorer` and fitted on `X`, `y`.

  Returns:
    The fitted selector's `support_` mask (one boolean per column of `X`).
  """
  selector = SFS(
    model,
    n_features_to_select=nfts,
    direction="backward",
    scoring=MSEScorer,
  )
  return selector.fit(X, y).support_

def makeMask(mask):
  """Build a function that keeps only the columns of a DataFrame selected by `mask`."""
  def applyMask(X):
    return X.loc[:, mask]
  return applyMask


def joinCols(cols):
  """Join column names into a single comma-separated string."""
  separator = ", "
  return separator.join(cols)

def inspectSFS(model, X_train, X_test, y_train, y_test, tt = lambda y: y):
  """Compare test RMSE across backward-selected feature subsets of growing size.

  For every subset size from half the feature count up to (but excluding) the
  full count, features are chosen with `sfsSupport`, `model` is refit on the
  chosen training columns and scored on the same test columns. A final row
  scores the model fitted on all features.

  Args:
    model: estimator to select with and evaluate; it is refit in place, so
      after the call it holds the all-features fit.
    X_train, X_test: feature DataFrames for fitting and scoring.
    y_train, y_test: matching targets.
    tt: transform applied to both `y_test` and the predictions before the
      RMSE is computed (identity by default).

  Returns:
    DataFrame with one "RMSE" column, indexed by the comma-joined names of the
    features used for each row.
  """
  def testRMSE(select):
    # Fit on the selected training columns, score on the same test columns.
    model.fit(select(X_train), y_train)
    return RMSE(tt(y_test), tt(model.predict(select(X_test))))

  n_features = X_train.shape[1]
  rows = []
  for n_keep in range(n_features // 2, n_features):
    # Backward feature elimination down to `n_keep` features
    support = sfsSupport(model, X_train, y_train, n_keep)
    score = testRMSE(makeMask(support))
    rows.append((joinCols(X_train.columns[support]), score))

  # Baseline row: every feature, no selection step.
  rows.append((joinCols(X_train.columns), testRMSE(lambda X: X)))

  labels = [label for label, _ in rows]
  scores = [score for _, score in rows]
  return pd.DataFrame(scores, columns=["RMSE"], index=labels)

# Undoes the natural log so that scores on logged targets can be reported on the original scale
def e(y):
  """Return Euler's number raised to the power `y`."""
  return np.e ** y

# Modelling

In [55]:
# Target is the salary in USD; every other column is used as a feature exactly
# as loaded (no one-hot encoding is applied in this cell).
y = salaries["salary_in_usd"]
X = salaries.drop(columns="salary_in_usd")

# tts holds [X_train, X_test, y_train, y_test]; a third of the rows is held out.
tts = TTS(X, y, test_size=1/3, random_state=RANDOM_STATE)
Xs, ys = tts[:2], tts[2:]
# Natural-log versions of the train/test targets, in the same order as `ys`.
y_logs = [target.agg(np.log) for target in ys]

## Exploration

### Linear Regression (LR)

In [56]:
lr = LR()
# First table: raw salary target. Second table: log target, with the RMSE
# computed after mapping targets and predictions back through `e`.
display(inspectSFS(lr, *Xs, *ys))
display(inspectSFS(lr, *Xs, *y_logs, tt=e))

Unnamed: 0,RMSE
"experience_level, company_location, ricl",42423.728905
"experience_level, company_location, company_size, ricl",42093.422407
"experience_level, job_title, company_location, company_size, ricl",42416.774707
"work_year, experience_level, job_title, company_location, company_size, ricl",42224.046134
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",42096.064031


Unnamed: 0,RMSE
"experience_level, company_location, ricl",41665.682245
"experience_level, company_location, company_size, ricl",40906.428919
"work_year, experience_level, company_location, company_size, ricl",40873.297718
"work_year, experience_level, job_title, company_location, company_size, ricl",40926.885415
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",40608.496594


### Ridge

In [57]:
ridge = Ridge()
# Same comparison as for LR, with default Ridge regularisation:
# raw target first, then the log target scored through `e`.
display(inspectSFS(ridge, *Xs, *ys))
display(inspectSFS(ridge, *Xs, *y_logs, tt=e))

Unnamed: 0,RMSE
"experience_level, company_location, ricl",42500.607097
"experience_level, company_location, company_size, ricl",42172.021777
"experience_level, job_title, company_location, company_size, ricl",42488.701213
"work_year, experience_level, job_title, company_location, company_size, ricl",42263.156701
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",42129.790502


Unnamed: 0,RMSE
"experience_level, company_location, ricl",41819.587857
"experience_level, company_location, company_size, ricl",41070.367347
"work_year, experience_level, company_location, company_size, ricl",41058.161349
"work_year, experience_level, job_title, company_location, company_size, ricl",41105.989793
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",40788.878948


### Lasso

In [58]:
lasso = Lasso()
# Raw target first, then the log target scored through `e`.
# NOTE(review): the recorded log-target run reports the same RMSE for every
# feature subset; presumably the default alpha is too strong on the log scale
# and the model predicts a constant. Confirm before reading anything into it.
display(inspectSFS(lasso, *Xs, *ys))
display(inspectSFS(lasso, *Xs, *y_logs, tt=e))

Unnamed: 0,RMSE
"experience_level, company_location, ricl",42424.33731
"experience_level, company_location, company_size, ricl",42093.980762
"experience_level, job_title, company_location, company_size, ricl",42417.049597
"work_year, experience_level, job_title, company_location, company_size, ricl",42224.317972
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",42096.511161


Unnamed: 0,RMSE
"company_location, company_size, ricl",60907.716107
"remote_ratio, company_location, company_size, ricl",60907.716107
"job_title, remote_ratio, company_location, company_size, ricl",60907.716107
"experience_level, job_title, remote_ratio, company_location, company_size, ricl",60907.716107
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",60907.716107


### Decision Tree Regressor (DTR)

In [59]:
# Seed the tree: DecisionTreeRegressor permutes the candidate features at every
# split, so ties between equally good splits are broken differently from run to
# run unless random_state is fixed. Without it the tables below cannot be
# reproduced with Restart & Run All.
dtr = DTR(random_state=RANDOM_STATE)
# Raw target first, then the log target scored through `e`.
display(inspectSFS(dtr, *tts))
display(inspectSFS(dtr, *Xs, *y_logs, e))

Unnamed: 0,RMSE
"experience_level, company_location, ricl",43369.062365
"experience_level, remote_ratio, company_location, ricl",45303.186478
"experience_level, remote_ratio, company_location, company_size, ricl",47549.201865
"work_year, experience_level, remote_ratio, company_location, company_size, ricl",47654.902362
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",49147.746488


Unnamed: 0,RMSE
"work_year, job_title, company_location",49974.824208
"work_year, experience_level, company_location, ricl",43288.815683
"work_year, experience_level, remote_ratio, company_location, ricl",44698.002523
"work_year, experience_level, job_title, remote_ratio, company_location, ricl",49131.441788
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",45087.91141


### Gradient Boosting Regressor (GBR)

In [60]:
# Seeded so that the boosting runs are repeatable.
gbr = GBR(random_state=RANDOM_STATE)
# Raw target first, then the log target scored through `e`.
display(inspectSFS(gbr, *Xs, *ys))
display(inspectSFS(gbr, *Xs, *y_logs, tt=e))

Unnamed: 0,RMSE
"experience_level, job_title, company_location",45159.302209
"experience_level, job_title, company_location, ricl",42175.258574
"experience_level, job_title, company_location, company_size, ricl",40092.494243
"work_year, experience_level, job_title, company_location, company_size, ricl",40275.56449
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",40052.789192


Unnamed: 0,RMSE
"experience_level, job_title, company_location",45003.020954
"experience_level, job_title, company_location, ricl",42019.119062
"work_year, experience_level, job_title, company_location, ricl",42324.137049
"work_year, experience_level, job_title, company_location, company_size, ricl",41162.605303
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",40937.210887


### Support Vector Regression (SVR)

In [61]:
svr = SVR()
# Raw target first, then the log target scored through `e`.
display(inspectSFS(svr, *Xs, *ys))
display(inspectSFS(svr, *Xs, *y_logs, tt=e))

Unnamed: 0,RMSE
"experience_level, company_location, ricl",58362.998955
"experience_level, company_location, company_size, ricl",58372.116942
"experience_level, remote_ratio, company_location, company_size, ricl",58379.044224
"experience_level, job_title, remote_ratio, company_location, company_size, ricl",58384.652798
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",58402.624842


Unnamed: 0,RMSE
"experience_level, job_title, company_location",44906.015061
"experience_level, job_title, company_location, company_size",41951.907254
"experience_level, job_title, company_location, company_size, ricl",40467.630102
"experience_level, job_title, remote_ratio, company_location, company_size, ricl",40292.601548
"work_year, experience_level, job_title, remote_ratio, company_location, company_size, ricl",58367.291076


## Shortlisting

The models that perform best are:

| No. | Model     | Features       | y logged | RMSE  |
| --- | --------- | -------------- | -------- | ----- |
| 1   | **GBR**   | All            | No       | 40053 |
| 2   | **SVR**   | No `work_year` | Yes      | 40293 |
| 3   | **LR**    | All            | Yes      | 40608 |
| 4   | **Ridge** | All            | Yes      | 40789 |
| 5   | **GBR**   | All            | Yes      | 40937 |

**LR** has no hyperparameters left to tune, but **GBR**, **SVR** and **Ridge** can potentially reduce their error further through hyperparameter tuning.

## Hyperparameter Tuning

### GBR

In [62]:
# Exhaustive grid search for GBR on the raw (un-logged) training target,
# scored with the negated MSE.
gbr_pg = dict(
  learning_rate=[.1, .01, .001, .0001],
  n_estimators=np.arange(60, 120, 20),
  max_depth=np.arange(3, 8),
  tol=[.1, .01, .001, .0001],
)
gbr_gscv = GSCV(gbr, gbr_pg, scoring=MSEScorer).fit(Xs[0], ys[0])

gbr_gscv.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 60, 'tol': 0.1}

In [63]:
# Rebuild GBR with the winning grid values and refit on the whole training split.
# random_state is not part of the searched grid, so best_params_ does not carry
# it; pass it explicitly so the refit matches the seeded estimator that was
# searched and the reported test RMSE is reproducible.
gbr_tuned = GBR(random_state=RANDOM_STATE, **gbr_gscv.best_params_)
gbr_tuned.fit(Xs[0], tts[2])
# Test RMSE on the raw salary scale.
RMSE(tts[3], gbr_tuned.predict(Xs[1]))

40004.555140726574

### Ridge

In [64]:
# Grid search for Ridge on the log target.
# The grids use np.linspace with explicit, inclusive endpoints instead of
# np.arange with a float step: with float steps the number of points arange
# produces depends on rounding, and the values carry noise such as
# 1.1900000000000002. Rounding keeps the reported best_params_ clean.
ridge_pg = {
  "alpha": np.linspace(1, 1.19, 20).round(2),  # 1.00, 1.01, ..., 1.19
  "tol": np.linspace(0, .9, 10).round(1)       # 0.0, 0.1, ..., 0.9
}
ridge_gscv = GSCV(ridge, ridge_pg, scoring=MSEScorer)
ridge_gscv.fit(Xs[0], y_logs[0])

ridge_gscv.best_params_

{'alpha': 1.1900000000000002, 'tol': 0.0}

In [65]:
# Refit Ridge with the best grid values on the log target, then report the
# test RMSE on the original salary scale by exponentiating the predictions.
ridge_tuned = Ridge(**ridge_gscv.best_params_).fit(Xs[0], y_logs[0])
ridge_test_pred = e(ridge_tuned.predict(Xs[1]))
RMSE(ys[1], ridge_test_pred)

40824.70364430844

### SVR

In [66]:
# Grid search for SVR on the log target, without the `work_year` feature.
# The float grids use np.linspace with explicit, inclusive endpoints instead of
# np.arange with a float step: with float steps arange can include or skip the
# value next to the stop depending on rounding (np.arange(.6, .9, .05) can emit
# a point at ~0.9), and the values carry noise such as 0.14500000000000002.
svr_pg = {
  "gamma": ["scale", "auto"],
  "tol": np.linspace(.12, .145, 6).round(3),   # 0.120, 0.125, ..., 0.145
  "C": np.linspace(.6, .9, 7).round(2),        # 0.60, 0.65, ..., 0.90
  "epsilon": np.linspace(.1, .4, 4).round(1)   # 0.1, 0.2, 0.3, 0.4
}
# Train/test feature frames with `work_year` removed (same order as `Xs`).
svr_Xs = [X_split.drop(columns="work_year") for X_split in Xs]
svr_gscv = GSCV(svr, svr_pg, scoring=MSEScorer)
svr_gscv.fit(svr_Xs[0], y_logs[0])

svr_gscv.best_params_

{'C': 0.7500000000000001,
 'epsilon': 0.2,
 'gamma': 'scale',
 'tol': 0.14500000000000002}

In [67]:
# Refit SVR with the best grid values on the log target (no `work_year`), then
# report the test RMSE on the original salary scale.
svr_tuned = SVR(**svr_gscv.best_params_).fit(svr_Xs[0], y_logs[0])
svr_test_pred = e(svr_tuned.predict(svr_Xs[1]))
RMSE(ys[1], svr_test_pred)

39685.803079430574

## The best performing model is:

**SVR** w/
- Features: No `work_year`
- y logged
- Params:
{
 'C': 0.75,
 'epsilon': 0.2,
 'gamma': 'scale',
 'tol': 0.145
}
- *RMSE: 39686*

# Exports

In [68]:
from joblib import dump

In [69]:
# Persist the tuned SVR one directory up. The model was fitted on features
# without `work_year` and on log-transformed salaries, so anything that loads
# it must drop `work_year` and exponentiate the predictions to get USD.
dump(svr_tuned, "../model.pkl")

['../model.pkl']