In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, ShuffleSplit, GridSearchCV
from ngboost import NGBRegressor
from ngboost.distns import LogNormal, Normal, Laplace
import pickle
from pathlib import Path

In [2]:
df = pd.read_parquet("../data/processed.parquet")

In [3]:
X = df.drop(columns="Avg_Salary").values
y = df["Avg_Salary"].values

# Grid Search

In [None]:
grid_search_results = {"NLL":{}, "neg_mean_absolute_error":{}, "neg_root_mean_squared_error":{}}

scoring_list = [None, "neg_mean_absolute_error", "neg_root_mean_squared_error"]
Dist_list = [LogNormal, Normal, Laplace]
param_grid = {
    'n_estimators': [100, 500, 1000],
    'minibatch_frac': [1.0, 0.5],
    'learning_rate': [0.01, 0.001]
}

for scoring in scoring_list:
    for Dist in Dist_list:
        model = NGBRegressor(Dist=Dist)
        grid_search = GridSearchCV(model, param_grid=param_grid, cv=5, scoring=scoring)
        grid_search.fit(X, y)
        
        if scoring == None:
            scoring_name = "NLL"
        else:
            scoring_name = scoring
        
        
        grid_search_results[scoring_name][Dist.__name__] = grid_search

In [None]:
file_path = Path('../modelling/grid_search_results.p')

with file_path.open('wb') as f:
    pickle.dump(grid_search_results, f)

# Load Results

In [4]:
file_path = Path('../modelling/grid_search_results.p')

with file_path.open('rb') as f:
    grid_search_results = pickle.load(f)

In [5]:
scoring_list = [None, "neg_mean_absolute_error", "neg_root_mean_squared_error"]
Dist_list = [LogNormal, Normal, Laplace]

for scoring in scoring_list:
    if scoring == None:
            scoring_name = "NLL"
    else:
        scoring_name = scoring
    for Dist in Dist_list:
        print(Dist.__name__)
        print(grid_search_results[scoring_name][Dist.__name__].cv_results_["mean_test_score"].min())

LogNormal
10.658685461093802
Normal
10.881231681674663
Laplace
10.756277133263866
LogNormal
-13774.311079061177
Normal
-13831.688355733266
Laplace
-13343.608439297106
LogNormal
-18325.26758814557
Normal
-18353.07010645815
Laplace
-19052.057543243056


In [6]:
grid_search_results["NLL"]["LogNormal"].cv_results_

{'mean_fit_time': array([ 3.64415517, 14.43752427, 30.32611027,  3.38296824, 12.93274274,
        32.8774334 ,  2.90128026, 16.74485221, 41.19773874,  2.58764644,
        12.81727567, 30.21188016]),
 'std_fit_time': array([0.32482364, 0.20502714, 2.90501517, 0.80281969, 1.69234557,
        3.97338475, 0.09691201, 2.20128068, 7.02000768, 0.56852741,
        1.42673709, 1.91135071]),
 'mean_score_time': array([0.12084584, 0.26000309, 0.5943841 , 0.18752708, 0.27832623,
        0.94929776, 0.05956182, 0.28435802, 0.62656956, 0.06258545,
        0.28628683, 1.01541176]),
 'std_score_time': array([0.0708015 , 0.00385865, 0.13045965, 0.21624424, 0.01183601,
        0.39243267, 0.00352493, 0.02794688, 0.0701569 , 0.00798313,
        0.01115253, 0.55567136]),
 'param_learning_rate': masked_array(data=[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.001, 0.001,
                    0.001, 0.001, 0.001, 0.001],
              mask=[False, False, False, False, False, False, False, False,
                    

# Final Model

In [7]:
model = NGBRegressor(LogNormal, n_estimators=1000, learning_rate=0.01, minibatch_frac=0.5)
model.fit(X, y)

[iter 0] loss=11.0337 val_loss=0.0000 scale=1.0000 norm=0.6803
[iter 100] loss=10.7818 val_loss=0.0000 scale=2.0000 norm=1.1235
[iter 200] loss=10.6528 val_loss=0.0000 scale=1.0000 norm=0.5573
[iter 300] loss=10.6165 val_loss=0.0000 scale=1.0000 norm=0.5677
[iter 400] loss=10.5695 val_loss=0.0000 scale=1.0000 norm=0.5567
[iter 500] loss=10.5672 val_loss=0.0000 scale=1.0000 norm=0.5775
[iter 600] loss=10.5389 val_loss=0.0000 scale=1.0000 norm=0.5633
[iter 700] loss=10.5487 val_loss=0.0000 scale=1.0000 norm=0.5798
[iter 800] loss=10.5353 val_loss=0.0000 scale=1.0000 norm=0.5743
[iter 900] loss=10.4834 val_loss=0.0000 scale=1.0000 norm=0.5579


NGBRegressor(Dist=<class 'ngboost.distns.distn.Distn.uncensor.<locals>.DistWithUncensoredScore'>,
             minibatch_frac=0.5, n_estimators=1000,
             random_state=RandomState(MT19937) at 0x7FCB3D112D10)

In [8]:
file_path = Path("../modelling/model.p")

with file_path.open("wb") as f:
    pickle.dump(model, f)