In [95]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# OPTIONAL: load the "autoreload" extension so edits to imported project
# modules are picked up without restarting the kernel. Re-running this cell in
# a live kernel only prints an "already loaded" notice.
%load_ext autoreload
# Mode 2: reload all modules before each cell is executed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [96]:
import os
import sys
from pathlib import Path
import urllib
import zipfile
from typing import Dict
from feature_engine import encoding, imputation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import (
    base,
    compose,
    datasets,
    ensemble,
    metrics,
    model_selection,
    pipeline,
    preprocessing,
)
import scikitplot
import xgboost as xgb
import yellowbrick.model_selection
from yellowbrick import classifier
import dtreeviz
from hyperopt import fmin, tpe, hp, Trials

In [None]:
from src.data.Utils import Utils
from src.data.KagTransformer import TweakKagTransformer

In [None]:
# Apply matplotlib's "ggplot" style sheet to the figures created below.
plt.style.use("ggplot")

# Set Variables

In [None]:
# Dataset location constants:
#   member_name - file inside the archive that is handed to the zip helper
#   folder_name - archive file name (also used to build the local data path)
#   url         - remote location of that archive
member_name: str = "multipleChoiceResponses.csv"
folder_name: str = "kaggle-survey-2018.zip"
url: str = (
    "https://github.com/mattharrison/datasets/raw/master/data/kaggle-survey-2018.zip"
)

In [None]:
# Paths
# `Path.cwd().parents` is the sequence of ancestors of the working directory
# (not a single Path), so HOME[1] is the directory two levels above it.
# NOTE(review): this assumes the notebook is started two levels below the
# project root — confirm against the repository layout.
HOME = Path.cwd().parents
# Both values are built with the `/` operator and are therefore Path objects.
data_folder: Path = HOME[1] / f"data/raw/{folder_name}"
figures_folder: Path = HOME[1] / "reports/figures"

### Datasets

In [None]:
# Load the survey data through the project helper, passing the remote URL, the
# local destination path and the archive member to read.
# NOTE(review): presumably downloads/caches the zip and returns a DataFrame —
# confirm in src/data/Utils.
raw = Utils.extract_zip(src=url, dst=data_folder, member_name=member_name)

## Create raw X and raw y

In [None]:
# Derive the raw feature frame and the raw target from `raw`, with column "Q6"
# as the target.
# NOTE(review): any row filtering or relabelling done by the helper is not
# visible here — confirm in src/data/Utils.
kag_X, kag_y = Utils.get_rawx_y(df=raw, y_col="Q6")

## Split data

In [None]:
# Hold out 30% of the rows as a test split; stratifying on the target keeps the
# class proportions equal in both splits, and the fixed seed makes the split
# repeatable.
(kag_X_train, kag_X_test, kag_y_train, kag_y_test) = model_selection.train_test_split(
    kag_X,
    kag_y,
    stratify=kag_y,
    test_size=0.3,
    random_state=42,
)

## Transform X with pipeline

In [None]:
# Feature pipeline, applied in order:
#   1. "tweak"      - project-specific cleanup transformer
#   2. "cat"        - one-hot encoding of the categorical columns
#   3. "num_impute" - median imputation of the numeric columns
kag_pl = pipeline.Pipeline(
    steps=[
        ("tweak", TweakKagTransformer()),
        (
            "cat",
            encoding.OneHotEncoder(
                variables=["Q1", "Q3", "major"],
                top_categories=5,
                drop_last=True,
            ),
        ),
        (
            "num_impute",
            imputation.MeanMedianImputer(
                variables=["education", "years_exp"],
                imputation_method="median",
            ),
        ),
    ]
)

In [None]:
# Fit the pipeline on the training split only, then apply the already-fitted
# pipeline to the test split (the test rows never influence the fit).
X_train = kag_pl.fit_transform(kag_X_train)
X_test = kag_pl.transform(kag_X_test)

## Transform y with label encoder

In [None]:
# Encode the raw target labels as integers; the label set is learned from the
# training target only and then applied to both splits.
label_encoder = preprocessing.LabelEncoder().fit(kag_y_train)
y_train, y_test = (
    label_encoder.transform(labels) for labels in (kag_y_train, kag_y_test)
)

# Combined data for cross validation/etc

In [None]:
# Stack train and test back together (train rows first) so cross-validation
# can run on the full data; `y` reuses X's index so the two stay aligned.
X = pd.concat([X_train, X_test], axis=0)
y = pd.Series(list(y_train) + list(y_test), index=X.index)

## A boosted model

In [None]:
# Baseline: XGBoost classifier with the library's default hyperparameters.
xg_oob = xgb.XGBClassifier()

In [None]:
xg_oob.fit(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test split.
xg_oob.score(X_test, y_test)

*Let's try w/ depth of 2 and 2 trees*

In [None]:
# Deliberately tiny model (2 trees of depth 2) so individual trees can be
# inspected and plotted.
xg2 = xgb.XGBClassifier(max_depth=2, n_estimators=2)

In [None]:
xg2.fit(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test split.
xg2.score(X_test, y_test)

In [None]:
# Visualise the first tree (index 0) of the tiny model with dtreeviz.
# Note that the combined X / y are passed here, while xg2 itself was fitted on
# the training split only.
viz = dtreeviz.model(
    xg2,
    tree_index=0,
    X_train=X,
    y_train=y,
    feature_names=X_train.columns.tolist(),
    target_name="Job",
    class_names=["DS", "SE"],
)
viz.view(depth_range_to_display=[0, 2])

In [None]:
# Same first tree (index 0), rendered with XGBoost's built-in plotting helper.
xgb.plot_tree(xg2, num_trees=0)

## Early Stopping
Reduce overfitting.

In [None]:
# Defaults: reference model without early stopping. The name `xg` is rebound
# to the early-stopping model in a later cell.
xg = xgb.XGBClassifier()

In [None]:
xg.fit(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test split.
xg.score(X_test, y_test)

In [None]:
# Customized early stopping: stop once the monitored metric has not improved
# for 20 consecutive boosting rounds.
xg = xgb.XGBClassifier(early_stopping_rounds=20)

In [None]:
# Both splits are evaluated each round (reported as validation_0 and
# validation_1). Per the XGBoost docs the LAST eval_set entry — here the test
# split — is the one that drives early stopping.
xg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

In [None]:
# Number of trees kept by early stopping. `best_ntree_limit` was removed in
# XGBoost 2.0; `best_iteration` is the 0-based index of the best round, so
# adding 1 yields the same tree count.
xg.best_iteration + 1

### Plotting Tree performance
- `validation_0` holds the metrics for the training data
- `validation_1` holds the metrics for the testing data

In [None]:
# Per-round metric history for every entry of the eval_set passed to fit().
results = xg.evals_result()

In [None]:
# NOTE(review): this displays the full nested dict of per-round values.
results

In [None]:
# Plot training vs. testing logloss per boosting round and mark the round with
# the lowest testing loss. The marker is derived from `results` instead of
# being hard-coded, so it stays correct when the data or library changes.
eval_df = (
    pd.DataFrame(
        {
            "training": results["validation_0"]["logloss"],
            "testing": results["validation_1"]["logloss"],
        }
    )
    # 1-based tree count on the x axis (round 0 == 1 tree).
    .assign(ntrees=lambda adf: range(1, len(adf) + 1))
    .set_index("ntrees")
)
best_ntrees = int(eval_df["testing"].idxmin())
best_loss = float(eval_df["testing"].min())

# The figure size is set once here; passing `figsize` to DataFrame.plot as
# well as `ax` would silently resize this figure.
fig, ax = plt.subplots(figsize=(5, 4))
eval_df.plot(ax=ax, title="eval_results with early stopping")
ax.annotate(
    f"best number \nof trees ({best_ntrees})",
    xy=(best_ntrees, best_loss),
    # Place the label relative to the marked point (right and below) so it
    # does not depend on the absolute loss values.
    xytext=(30, -40),
    textcoords="offset points",
    arrowprops={"color": "k"},
)
ax.set_xlabel("ntrees")
plt.show()

In [None]:
# Using the tree count found by early stopping as n_estimators gives the same
# result.
# NOTE(review): 13 is hard-coded from an earlier run's early-stopping result —
# confirm it still matches the current best tree count.
xg13 = xgb.XGBClassifier(n_estimators=13)

In [None]:
# eval_set is passed here although this estimator has no
# early_stopping_rounds set.
xg13.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

In [None]:
# Mean accuracy on the held-out test split.
xg13.score(X_test, y_test)

In [None]:
# No early stopping, uses all estimators
xg_no_est = xgb.XGBClassifier()

In [None]:
xg_no_est.fit(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test split.
xg_no_est.score(X_test, y_test)

In [None]:
# Early stopping again, this time monitoring the "error" metric instead of the
# logloss tracked in the earlier run.
xg_error = xgb.XGBClassifier(early_stopping_rounds=20, eval_metric="error")

In [None]:
xg_error.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])

In [None]:
# Mean accuracy on the held-out test split.
xg_error.score(X_test, y_test)

In [None]:
# Number of trees kept by early stopping on the "error" metric.
# `best_ntree_limit` was removed in XGBoost 2.0; `best_iteration` is the
# 0-based index of the best round, so adding 1 yields the same tree count.
xg_error.best_iteration + 1

In [None]:
# Validation curve for `gamma`: a default classifier is scored on the
# training split for each value listed in param_range.
fig, ax = plt.subplots(figsize=(8, 4))
yellowbrick.model_selection.validation_curve(
    xgb.XGBClassifier(),
    X_train,
    y_train,
    ax=ax,
    n_jobs=1,
    param_name="gamma",
    param_range=[0, 0.5, 1, 5, 10, 20, 30],
)
plt.show()

### Learning Rate

In [None]:
# Shallow model with learning_rate=1, for comparison with the
# learning_rate=0.01 model built below.
kg_lr1 = xgb.XGBClassifier(learning_rate=1, max_depth=2)

In [None]:
kg_lr1.fit(X_train, y_train)

In [None]:
# Inspect the first tree (index 0).
xgb.plot_tree(kg_lr1, num_trees=0)

In [None]:
# Check the impact of the learning rate: same depth as above, but with
# learning_rate=0.01.
xg_lr_001 = xgb.XGBClassifier(learning_rate=0.01, max_depth=2)

In [None]:
xg_lr_001.fit(X_train, y_train)

In [None]:
# Inspect the first tree (index 0) for comparison with the learning_rate=1 tree.
xgb.plot_tree(xg_lr_001, num_trees=0)

## Grid Search

In [None]:
# Candidate values for the grid search; every key maps to the list of values
# to try (2 x 2 x 2 = 8 combinations, the remaining keys are fixed).
params: Dict = dict(
    reg_lambda=[0],  # No effect
    learning_rate=[0.1, 0.3],  # makes each boost more conservative
    subsample=[0.7, 1],
    max_depth=[2, 3],
    random_state=[42],
    n_jobs=[-1],
    n_estimators=[200],
)

In [None]:
# Base estimator for the grid search; each candidate stops after 5 rounds
# without improvement on the eval_set.
xgb2 = xgb.XGBClassifier(early_stopping_rounds=5)

In [None]:
# 3-fold grid search over `params`; eval_set and verbose are forwarded to
# every underlying XGBClassifier.fit call.
# NOTE(review): early stopping inside the search watches (X_test, y_test), so
# the test split influences model selection — consider a separate validation
# split.
cv = model_selection.GridSearchCV(xgb2, params, cv=3, n_jobs=-1).fit(
    X_train, y_train, eval_set=[(X_test, y_test)], verbose=50
)

In [None]:
# Best parameter combination found by the search.
cv.best_params_

In [None]:
# Refit a classifier with the best grid-search parameters, now with a more
# patient early-stopping window (50 rounds).
xgb_grid = xgb.XGBClassifier(**cv.best_params_, early_stopping_rounds=50)

In [None]:
# verbose=10 prints the evaluation metrics every 10 rounds.
xgb_grid.fit(
    X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=10
)

*vs default*

In [None]:
# Default hyperparameters with the same early-stopping window, as a
# like-for-like comparison with the tuned model.
xgb_def = xgb.XGBClassifier(early_stopping_rounds=50)

In [None]:
# verbose=10 prints the evaluation metrics every 10 rounds.
xgb_def.fit(
    X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=10
)

In [None]:
# Test accuracy as a (default, tuned) pair.
xgb_def.score(X_test, y_test), xgb_grid.score(X_test, y_test)

*cross validation*

In [89]:
# 4-fold cross-validated scores of a default classifier on the combined
# (train + test) data.
results_default = model_selection.cross_val_score(xgb.XGBClassifier(), X=X, y=y, cv=4)

In [90]:
# One score per fold.
results_default

array([0.71352785, 0.72413793, 0.69496021, 0.74501992])

In [91]:
# Average score across the 4 folds for the default model.
results_default.mean()

0.7194114787534214

In [92]:
# Same 4-fold cross-validation, using the best grid-search parameters
# (no early stopping is configured on this estimator).
results_grid = model_selection.cross_val_score(
    xgb.XGBClassifier(**cv.best_params_), X=X, y=y, cv=4
)

In [93]:
# One score per fold.
results_grid

array([0.74137931, 0.74137931, 0.74801061, 0.73572377])

In [94]:
# Average score across the 4 folds for the tuned model.
results_grid.mean()

0.7416232505873941

# Hyperopt

In [97]:
# Hyperparameters obtained from a 2-hour training/tuning run (Matt Harrison);
# they are hard-coded here rather than re-running the search.
longs_params: Dict = dict(
    colsample_bytree=0.6874845219014455,
    gamma=0.06936323554883501,
    learning_rate=0.21439214284976907,
    max_depth=6,
    min_child_weight=0.6678357091609912,
    reg_alpha=3.2979862933185546,
    reg_lambda=7.850943400390477,
    subsample=0.999767483950891,
)

In [98]:
# Model built from the pre-tuned parameters: up to 500 trees, stopping after
# 50 rounds without improvement on the eval_set.
xg_ex = xgb.XGBClassifier(**longs_params, early_stopping_rounds=50, n_estimators=500)

In [99]:
# verbose=100 prints the evaluation metrics every 100 rounds.
xg_ex.fit(
    X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100
)

[0]	validation_0-logloss:0.65346	validation_1-logloss:0.65468
[100]	validation_0-logloss:0.45552	validation_1-logloss:0.49702
[120]	validation_0-logloss:0.45454	validation_1-logloss:0.49729


In [101]:
# Mean accuracy of the pre-tuned model on the held-out test split.
xg_ex.score(X_test, y_test)

0.7580110497237569