In [124]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, ShuffleSplit, GridSearchCV
from ngboost import NGBRegressor
from ngboost.distns import Poisson, LogNormal, Normal, Laplace
import shap
import pickle
from pathlib import Path
import matplotlib.pyplot as plt
%matplotlib inline
# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams["figure.figsize"] = [20, 10]

In [125]:
# Load the dataset from data/processed.parquet (path is relative to the
# notebook's working directory).
df = pd.read_parquet("data/processed.parquet")

In [126]:
# Target vector: the Avg_Salary column as a plain array.
y = df.loc[:, "Avg_Salary"].values
# Feature matrix: every remaining column, also as a plain array
# (column labels are dropped here, so X carries no feature names).
X = df.drop(columns=["Avg_Salary"]).values

In [None]:
# Hyper-parameter search: run one 5-fold grid search per candidate output
# distribution so the fitted searches can be compared afterwards.
Dist_list = [Poisson, LogNormal, Normal, Laplace]
grid_search_list = []  # one fitted GridSearchCV per entry of Dist_list, same order

param_grid = {
    'n_estimators': [100, 500, 1000],
    'minibatch_frac': [1.0, 0.5],
    'learning_rate': [0.01, 0.001]
}

for Dist in Dist_list:
    # Build the estimator from the loop's distribution; hard-coding LogNormal
    # here would run the same search four times.
    candidate = NGBRegressor(Dist=Dist)
    grid_search = GridSearchCV(candidate, param_grid=param_grid, cv=5)
    grid_search.fit(X, y)
    grid_search_list.append(grid_search)

# The following cells fit and inspect `model` directly and read the LogNormal
# parameters "s" and "scale", so keep `model` bound to an unfitted LogNormal
# regressor exactly as before.
model = NGBRegressor(Dist=LogNormal)

In [None]:
# Fit `model` on the full feature matrix and target.
# NOTE(review): this fits `model` itself, not any of the GridSearchCV objects
# built in the search cell — confirm that is intended.
model.fit(X, y)

In [None]:
# Point predictions for the same rows the model was fitted on (in-sample).
model.predict(X)

In [None]:
# Predicted distribution for the rows of X; its parameters are inspected
# in a later cell via `y_pred.params`.
y_pred = model.pred_dist(X)

In [None]:
from scipy.stats import lognorm

In [None]:
# Mean of a log-normal distribution: exp(mu + s**2 / 2) with mu = ln(scale),
# i.e. scale * exp(s**2 / 2).
# Computed from `y_pred` (defined just above) so the cell runs top-to-bottom on
# a fresh kernel; `mu` and `s` are only assigned further down, and `mu` there is
# already a logarithm, so taking np.log of it again would be wrong.
fitted_params = y_pred.params
fitted_params["scale"] * np.exp(fitted_params["s"] ** 2 / 2)

In [None]:
# Show the parameters of the predicted distribution held in `y_pred`.
y_pred.params

In [None]:
# Predicted distribution for the first row only.
# X is a NumPy array (built with `.values`), so it is sliced directly;
# `.iloc` exists only on pandas objects. The 0:1 slice keeps the 2-D shape.
model.pred_dist(X[0:1])

In [None]:
# Show the "s" parameter of `y_dists`.
# NOTE(review): `y_dists` is first assigned in a later cell, so this cell
# fails on Restart & Run All — move it below that assignment or drop it.
y_dists.params["s"]

In [None]:
# Predicted distribution for a single row (index 4).
# X is a NumPy array, so slice it directly instead of using `.iloc`;
# the 4:5 slice keeps the input 2-D.
y_dists = model.pred_dist(X[4:5])

In [None]:
# Unpack the predicted distribution's parameters once:
#   s  – the "s" parameter as reported by the distribution
#   mu – natural log of the reported "scale" parameter
row_params = y_dists.params
s = row_params["s"]
mu = np.log(row_params["scale"])

In [None]:
# Evaluate the log-normal density by hand on a grid of 1000 points in [1, 80000].
x = np.linspace(1, 80000, 1000)
# pdf(x) = exp(-(ln x - mu)^2 / (2 * s^2)) / (x * s * sqrt(2 * pi)).
# The denominator 2 * s**2 must be parenthesised: written as `/2*s**2` the
# exponent is multiplied by s**2 instead of divided by it.
pdf = np.exp(-((np.log(x) - mu) ** 2) / (2 * s**2)) / (x * s * np.sqrt(2 * np.pi))

In [None]:
# CDF of the fitted log-normal at the first grid point, via a frozen
# scipy distribution with shape `s` and scale exp(mu).
lognorm(s, scale=np.exp(mu)).cdf(x[0])

In [None]:
# Reference density from scipy over the whole grid, for comparison with the
# hand-computed `pdf`.
lognorm(s, scale=np.exp(mu)).pdf(x)

In [None]:
# Display the hand-computed density values.
pdf

In [None]:
# Persist the current `model` object to disk as a pickle.
file_path = Path("data") / "model.p"
file_path.write_bytes(pickle.dumps(model))

In [None]:
# Restore `model` from the pickle at `file_path`.
# Unpickling executes arbitrary code, so only load files this notebook wrote.
model = pickle.loads(file_path.read_bytes())

# Model Analysis

In [None]:
# Build a SHAP tree explainer for `model` and compute SHAP values for every
# row of X.
# NOTE(review): model_output=0 presumably selects the first distribution
# parameter of the NGBoost model — confirm against the NGBoost/SHAP docs.
explainer = shap.TreeExplainer(model, model_output=0)
shap_values = explainer.shap_values(X)

In [None]:
# Global feature-importance summary.
# X is a bare NumPy array, so pass the column labels explicitly; otherwise the
# plot can only show generic positional feature labels.
shap.summary_plot(
    shap_values,
    X,
    feature_names=df.drop(columns="Avg_Salary").columns.tolist(),
)

In [None]:
# Labels of every column whose name contains "Language".
cols = df.columns[df.columns.str.contains("Language")]

In [None]:
# Per job role: sum each Language column, then add those sums across columns,
# giving one total per role as a plain array.
df.groupby("Job_Role_Original")[cols].sum().sum(axis=1).values

In [None]:
# Per job role: non-null row count of each Language column divided by that
# role's total across all Language columns, rounded to 3 decimals; the
# Language_Lua column is then scaled by 100.
by_role = df.groupby("Job_Role_Original")[cols]
role_language_totals = by_role.sum().sum(axis=1).values
rounded_ratio = by_role.count().divide(role_language_totals, axis=0).round(3)
100 * rounded_ratio["Language_Lua"]

In [None]:
# Per job role: sum of the Language_Perl column.
df.groupby("Job_Role_Original").Language_Perl.sum()

In [None]:
# Inspect the raw Language_SQL column.
df["Language_SQL"]

In [None]:
# Dependence plot for the Job_Role_Original feature.
# X is a bare NumPy array, so SHAP cannot resolve a feature by name unless the
# column labels are supplied; they are rebuilt the same way X was.
feature_names = df.drop(columns="Avg_Salary").columns.tolist()
shap.dependence_plot("Job_Role_Original", shap_values, X, feature_names=feature_names)

In [None]:
# Frequency of each distinct value in Language_SQL.
df.Language_SQL.value_counts()

In [None]:
# Mean Working_Experience over rows where Language_R equals 1.
df.loc[df.Language_R == 1, "Working_Experience"].mean()

In [None]:
# Mean Working_Experience over rows where Language_R equals 0.
df.loc[df.Language_R == 0, "Working_Experience"].mean()

In [None]:
# Mean Avg_Salary over rows with Language_R == 1 and
# Work_Company_Country == "Portugal".
r_in_portugal = (df.Language_R == 1) & (df.Work_Company_Country == "Portugal")
df.loc[r_in_portugal, "Avg_Salary"].mean()

In [None]:
# Mean Avg_Salary over rows with Language_R == 0 and
# Work_Company_Country == "Portugal".
non_r_in_portugal = (df.Language_R == 0) & (df.Work_Company_Country == "Portugal")
df.loc[non_r_in_portugal, "Avg_Salary"].mean()

In [None]:
# Stand-alone SHAP example on SHAP's bundled Boston dataset.
# NOTE(review): this cell rebinds X, y, model and shap_values, replacing the
# salary data and NGBoost model used by every cell above — rename these or move
# the example to its own notebook.
import shap

# Train a regressor on the example data.
# NOTE(review): LGBMRegressor is never imported in this notebook (presumably it
# is lightgbm's), so this line raises NameError on a fresh kernel.
X, y = shap.datasets.boston()
model = LGBMRegressor().fit(X, y)

# Explain the model's predictions using SHAP.
explainer = shap.Explainer(model)
shap_values = explainer(X)

# Visualize the explanation of the first prediction.
shap.plots.waterfall(shap_values[0])

In [None]:
# Dependence plot of "Gold earned per min." coloured by "Deaths per min.".
# NOTE(review): neither feature name appears anywhere else in this notebook;
# this looks like a leftover from a different dataset — confirm or remove.
shap.dependence_plot("Gold earned per min.", shap_values, X, interaction_index="Deaths per min.")