 ### XGBoost - Training Data balanced by upsampling
 
 Note that for this series of experiments with XGBoost the data was not scaled first.

In [21]:
import pandas as pd
import numpy as np
from joblib import dump

import xgboost as xgb
import sklearn.metrics

from collections import Counter

In [2]:
# load unscaled SMOTE and upsampled datasets

X_SMOTE = np.load('../data/processed/X_SMOTE.npy')
y_SMOTE = np.load('../data/processed/y_SMOTE.npy')

X_data_ups = np.load('../data/processed/X_data_ups.npy')
y_data_ups = np.load('../data/processed/y_data_ups.npy')


X_test = np.load('../data/processed/X_test_SMOTE.npy')
y_test = np.load('../data/processed/y_test_SMOTE.npy')

In [3]:
print(X_SMOTE.shape)
print(y_SMOTE.shape)
print(X_data_ups.shape)
print(y_data_ups.shape)
print(X_test.shape)
print(y_test.shape)

print(Counter(y_SMOTE))
print(Counter(y_data_ups))
print(Counter(y_test))

(10652, 19)
(10652,)
(10652, 19)
(10652,)
(1600, 19)
(1600,)
Counter({1: 5326, 0: 5326})
Counter({1: 5326, 0: 5326})
Counter({1: 1343, 0: 257})


### Experiment 1 - XGBoost - default parameters

In [7]:
xgboost1 = xgb.XGBClassifier() # upsampled data
xgboost2 = xgb.XGBClassifier() # SMOTE

In [8]:
xgboost1.fit(X_data_ups, y_data_ups)
xgboost2.fit(X_SMOTE, y_SMOTE)

In [9]:
xgboost1.predict_proba(X_test)
xgboost2.predict_proba(X_test)

array([[0.24916434, 0.75083566],
       [0.6719456 , 0.32805443],
       [0.01499593, 0.98500407],
       ...,
       [0.07802254, 0.92197746],
       [0.04678053, 0.9532195 ],
       [0.01347542, 0.9865246 ]], dtype=float32)

### Experiment 1 - AUROC

In [10]:
from sklearn.metrics import roc_auc_score

In [11]:
target = np.array(y_test)
Counter(target)

Counter({0: 257, 1: 1343})

In [13]:
df_pred_xgboost1 = pd.DataFrame(xgboost1.predict_proba(X_test))
df_pred_xgboost2 = pd.DataFrame(xgboost2.predict_proba(X_test))

pred_xgboost1 = np.array(df_pred_xgboost1[1])
pred_xgboost2 = np.array(df_pred_xgboost2[1])

print('xgboost1, upsampled data:', roc_auc_score(target, pred_xgboost1))
print('xgboost2, SMOTE:', roc_auc_score(target, pred_xgboost2))

xgboost1, upsampled data: 0.64397032023665
xgboost2, SMOTE: 0.6405486294404478


### Experiment 2 - Tune with Hyperopt

In [14]:
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin

In [15]:
space = {
    'max_depth' : hp.choice('max_depth', range(5, 20, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.05),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.05),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.05)
}

In [23]:
def objective(space):
    from sklearn.model_selection import cross_val_score
    
    xgboost = xgb.XGBClassifier(
        max_depth = int(space['max_depth']),
        learning_rate = space['learning_rate'],
        min_child_weight = space['min_child_weight'],
        subsample = space['subsample'],
        colsample_bytree = space['colsample_bytree']
    )
    
    auroc = cross_val_score(xgboost, X_data_ups, y_data_ups, cv=10, scoring="roc_auc").mean()

    return{'loss': 1-auroc, 'status': STATUS_OK }

In [22]:
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

In [24]:
best = fmin(
    fn=objective,   
    space=space,       
    algo=tpe.suggest,       
    max_evals=5
)

100% 5/5 [00:33<00:00,  6.69s/trial, best loss: 0.0053530839125736085]


In [25]:
print("Best: ", best)

Best:  {'colsample_bytree': 0.55, 'learning_rate': 0.25, 'max_depth': 10, 'min_child_weight': 2.0, 'subsample': 0.75}


In [26]:
xgboost3 = xgb.XGBClassifier(
    max_depth = best['max_depth'],
    learning_rate = best['learning_rate'],
    min_child_weight = best['min_child_weight'],
    subsample = best['subsample'],
    colsample_bytree = best['colsample_bytree']
)

In [27]:
xgboost3.fit(X_data_ups, y_data_ups)

### Experiment 2 - AUROC for optimised XGBoost3 - OVERFITTING ON TRAINING DATA

In [32]:
df_pred_xgboost3 = pd.DataFrame(xgboost3.predict_proba(X_test))

pred_xgboost3 = np.array(df_pred_xgboost3[1])

print('xgboost3, upsampled training data, test:', roc_auc_score(target, pred_xgboost3))

xgboost3, upsampled training data, test: 0.6340384353514839


In [33]:
df_pred_xgboost3 = pd.DataFrame(xgboost3.predict_proba(X_data_ups))

pred_xgboost3 = np.array(df_pred_xgboost3[1])

print('xgboost3, upsampled training data, train set:', roc_auc_score(y_data_ups, pred_xgboost3))

xgboost3, upsampled training data, train set: 1.0


In [34]:
df_pred_xgboost1 = pd.DataFrame(xgboost1.predict_proba(X_data_ups))

pred_xgboost1 = np.array(df_pred_xgboost1[1])

print('xgboost1, upsampled data, train set:', roc_auc_score(y_data_ups, pred_xgboost1))

xgboost1, upsampled data: 0.9993739044208693


### Experiment 3 - XGBoost guess some parameters to reduce overfittig

In [39]:
xgboost4 = xgb.XGBClassifier(colsample_bytree = 0.3, subsample = 0.25, max_depth = 5, n_estimators = 1000, learning_rate = 0.1)

In [40]:
xgboost4.fit(X_data_ups, y_data_ups)

In [41]:
df_pred_xgboost4 = pd.DataFrame(xgboost4.predict_proba(X_test))

pred_xgboost4 = np.array(df_pred_xgboost4[1])

print('xgboost4, upsampled training data, test:', roc_auc_score(target, pred_xgboost4))

xgboost4, upsampled training data, test: 0.634134045678558


In [42]:
df_pred_xgboost4 = pd.DataFrame(xgboost4.predict_proba(X_data_ups))

pred_xgboost4 = np.array(df_pred_xgboost4[1])

print('xgboost4, upsampled training data, test:', roc_auc_score(y_data_ups, pred_xgboost4))

xgboost4, upsampled training data, test: 0.9999576962446534


In [43]:
xgboost5 = xgb.XGBClassifier(colsample_bytree = 0.2, subsample = 0.1, max_depth = 3, n_estimators = 1000, learning_rate = 0.1, min_child_weight = 5)

In [44]:
xgboost5.fit(X_data_ups, y_data_ups)

In [45]:
df_pred_xgboost5 = pd.DataFrame(xgboost5.predict_proba(X_test))

pred_xgboost5 = np.array(df_pred_xgboost5[1])

print('xgboost5, upsampled training data, test:', roc_auc_score(target, pred_xgboost5))

xgboost5, upsampled training data, test: 0.630538517924039


In [46]:
df_pred_xgboost5 = pd.DataFrame(xgboost5.predict_proba(X_data_ups))

pred_xgboost5 = np.array(df_pred_xgboost5[1])

print('xgboost5, upsampled training data, test:', roc_auc_score(y_data_ups, pred_xgboost5))

xgboost5, upsampled training data, test: 0.9068105379782669


### Manual grid style search of important hyperparameters

In [9]:
ups_rf10 = RandomForestClassifier(n_estimators = 1000, min_samples_leaf = 5)
ups_rf10.fit(X_data_ups, y_data_ups)

ups_rf20 = RandomForestClassifier(n_estimators = 1000, min_samples_leaf = 10)
ups_rf20.fit(X_data_ups, y_data_ups)

ups_rf30 = RandomForestClassifier(n_estimators = 1000, min_samples_leaf = 20)
ups_rf30.fit(X_data_ups, y_data_ups)

ups_rf40 = RandomForestClassifier(n_estimators = 1000, min_samples_leaf = 40)
ups_rf40.fit(X_data_ups, y_data_ups)

ups_rf50 = RandomForestClassifier(n_estimators = 5000, min_samples_leaf = 40)
ups_rf50.fit(X_data_ups, y_data_ups)

ups_rf60 = RandomForestClassifier(n_estimators = 5000, min_samples_leaf = 100)
ups_rf60.fit(X_data_ups, y_data_ups)

ups_rf70 = RandomForestClassifier(n_estimators = 5000, min_samples_leaf = 200)
ups_rf70.fit(X_data_ups, y_data_ups)

ups_rf80 = RandomForestClassifier(n_estimators = 5000, min_samples_leaf = 500)
ups_rf80.fit(X_data_ups, y_data_ups)

ups_rf90 = RandomForestClassifier(n_estimators = 5000, min_samples_leaf = 200, max_features = 4)
ups_rf90.fit(X_data_ups, y_data_ups)

ups_rf100 = RandomForestClassifier(n_estimators = 5000, min_samples_leaf = 200, max_features = 5)
ups_rf100.fit(X_data_ups, y_data_ups)

ups_rf110 = RandomForestClassifier(n_estimators = 5000, min_samples_leaf = 200, max_features = 6)
ups_rf110.fit(X_data_ups, y_data_ups)

ups_rf120 = RandomForestClassifier(n_estimators = 5000, min_samples_leaf = 200, max_features = 7)
ups_rf120.fit(X_data_ups, y_data_ups)

ups_rf130 = RandomForestClassifier(n_estimators = 5000, min_samples_leaf = 200, max_features = 8)
ups_rf130.fit(X_data_ups, y_data_ups)

ups_rf140 = RandomForestClassifier(n_estimators = 5000, min_samples_leaf = 200, max_features = 3) # note max_features = 3 here 
ups_rf140.fit(X_data_ups, y_data_ups)


In [10]:
dump(ups_rf10, '../models/rf10_ups.joblib')
dump(ups_rf20, '../models/rf20_ups.joblib')
dump(ups_rf30, '../models/rf30_ups.joblib')
dump(ups_rf40, '../models/rf40_ups.joblib')
dump(ups_rf50, '../models/rf50_ups.joblib')
dump(ups_rf60, '../models/rf60_ups.joblib')
dump(ups_rf70, '../models/rf70_ups.joblib')
dump(ups_rf80, '../models/rf80_ups.joblib')
dump(ups_rf90, '../models/rf90_ups.joblib')
dump(ups_rf100, '../models/rf100_ups.joblib')
dump(ups_rf110, '../models/rf110_ups.joblib')
dump(ups_rf120, '../models/rf120_ups.joblib')
dump(ups_rf130, '../models/rf130_ups.joblib')
dump(ups_rf140, '../models/rf140_ups.joblib')

['../models/rf140_ups.joblib']

### Calculate AUROC for all models

In [11]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [12]:
target = np.array(y_test)
Counter(target)

Counter({0: 257, 1: 1343})

In [13]:
df_pred_ups_10 = pd.DataFrame(ups_rf10.predict_proba(X_test))
df_pred_ups_20 = pd.DataFrame(ups_rf20.predict_proba(X_test))
df_pred_ups_30 = pd.DataFrame(ups_rf30.predict_proba(X_test))
df_pred_ups_40 = pd.DataFrame(ups_rf40.predict_proba(X_test))
df_pred_ups_50 = pd.DataFrame(ups_rf50.predict_proba(X_test))
df_pred_ups_60 = pd.DataFrame(ups_rf60.predict_proba(X_test))
df_pred_ups_70 = pd.DataFrame(ups_rf70.predict_proba(X_test))
df_pred_ups_80 = pd.DataFrame(ups_rf80.predict_proba(X_test))
df_pred_ups_90 = pd.DataFrame(ups_rf90.predict_proba(X_test))
df_pred_ups_100 = pd.DataFrame(ups_rf100.predict_proba(X_test))
df_pred_ups_110 = pd.DataFrame(ups_rf110.predict_proba(X_test))
df_pred_ups_120 = pd.DataFrame(ups_rf120.predict_proba(X_test))
df_pred_ups_130 = pd.DataFrame(ups_rf130.predict_proba(X_test))
df_pred_ups_140 = pd.DataFrame(ups_rf140.predict_proba(X_test))

pred_ups_probs10 = np.array(df_pred_ups_10[1])
pred_ups_probs20 = np.array(df_pred_ups_20[1])
pred_ups_probs30 = np.array(df_pred_ups_30[1])
pred_ups_probs40 = np.array(df_pred_ups_40[1])
pred_ups_probs50 = np.array(df_pred_ups_50[1])
pred_ups_probs60 = np.array(df_pred_ups_60[1])
pred_ups_probs70 = np.array(df_pred_ups_70[1])
pred_ups_probs80 = np.array(df_pred_ups_80[1])
pred_ups_probs90 = np.array(df_pred_ups_90[1])
pred_ups_probs100 = np.array(df_pred_ups_100[1])
pred_ups_probs110 = np.array(df_pred_ups_110[1])
pred_ups_probs120 = np.array(df_pred_ups_120[1])
pred_ups_probs130 = np.array(df_pred_ups_130[1])
pred_ups_probs140 = np.array(df_pred_ups_140[1])

print('ups_rf10:', roc_auc_score(target, pred_ups_probs10))
print('ups_rf20:', roc_auc_score(target, pred_ups_probs20))
print('ups_rf30:', roc_auc_score(target, pred_ups_probs30))
print('ups_rf40:', roc_auc_score(target, pred_ups_probs40))
print('ups_rf50:', roc_auc_score(target, pred_ups_probs50))
print('ups_rf60:', roc_auc_score(target, pred_ups_probs60))
print('ups_rf70:', roc_auc_score(target, pred_ups_probs70))
print('ups_rf80:', roc_auc_score(target, pred_ups_probs80))
print('ups_rf90:', roc_auc_score(target, pred_ups_probs90)) 
print('ups_rf100:', roc_auc_score(target, pred_ups_probs100)) # top performer with upsampled data, still ery close to SMOTE, but a small improvemet
print('ups_rf110:', roc_auc_score(target, pred_ups_probs110))
print('ups_rf120:', roc_auc_score(target, pred_ups_probs120))
print('ups_rf130:', roc_auc_score(target, pred_ups_probs130))
print('ups_rf140:', roc_auc_score(target, pred_ups_probs140))


ups_rf10: 0.6852826733806363
ups_rf20: 0.6886493158067049
ups_rf30: 0.692798224545199
ups_rf40: 0.6978365990537475
ups_rf50: 0.6974309794843416
ups_rf60: 0.7024722512755286
ups_rf70: 0.7048798931482163
ups_rf80: 0.7027011366039791
ups_rf90: 0.7048943795614093
ups_rf100: 0.7049552224968203
ups_rf110: 0.7043294094468798
ups_rf120: 0.703796309441375
ups_rf130: 0.7033356415018355
ups_rf140: 0.7037296719406867


### Make preditions on unlabelled kaggle data

In [14]:
df_te = pd.read_csv('../data/raw/2022_test.csv')

In [15]:
df_te_cleaned = df_te.copy()

In [16]:
df_te_cleaned.drop('Id', axis=1, inplace=True)

In [17]:
print(list(df_te_cleaned.columns))
print(df_te_cleaned.shape)

['GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3P Made', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV']
(3799, 19)


In [18]:
Counter(ups_rf100.predict(df_te_cleaned))



Counter({0: 1458, 1: 2341})

In [19]:
ups_rf100.predict_proba(df_te_cleaned)



array([[0.56389165, 0.43610835],
       [0.52406616, 0.47593384],
       [0.23989001, 0.76010999],
       ...,
       [0.64897364, 0.35102636],
       [0.26165944, 0.73834056],
       [0.52751007, 0.47248993]])

In [20]:
df_pred_ups_rf100 = pd.DataFrame(ups_rf100.predict_proba(df_te_cleaned))



In [21]:
df_pred_ups_rf100[1].to_csv('../data/processed/ups_rf100.csv')