In [2]:
import numpy as np
import warnings
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score

In [21]:
# df = pd.read_csv('cleaned_updated_balanced_data.csv')
df = pd.read_csv('df_FINAL-5.csv')

In [4]:
features = ['confidence score', 'experience score',
       'games played prior on current day',  'winner_streak',
       'favorite fruit_prume',
       'favorite fruit_strawberry',  'duration round_seconds',  'before3_mean',
       'before3_min', 'before3_max', 'before2_mean', 'before2_min',
       'before2_max', 'before1_mean', 'before1_min', 'before1_max',
       'after1_mean', 'after1_min', 'after1_max', 'after2_mean', 'after2_min',
       'after2_max', 'after3_mean', 'after3_min', 'after3_max',
       'max_acc_value',  'time_diff_max_acc',]


In [6]:
features = [
    "max_acc_value",
    "games played prior on current day",
    "experience score",
    "after2_mean",
    "after3_min",
    "before1_mean",
    "before2_mean",
    "after2_min",
    "time_diff_max_acc",
    "after2_max",
    "duration round_seconds",
    "before3_mean",
    "after3_mean",
    "before3_max",
    "after3_max",
    "winner_streak",
    "after1_mean",
    "before2_min",
    "after1_max",
    "before3_min",
    "before1_min",
    "after1_min",
    "before2_max",
    "before1_max",
    "favorite fruit_prume",
    "favorite fruit_strawberry"
]

"""
Underneath the best features that were selected with the step up model when using the simulated data.
"""
# features = [
#     "max_acc_value",
#     "games played prior on current day",
#     "after2_mean",
#     "after2_max",
#     "after1_mean",
#     "before1_min",
# ]

In [42]:
"""
This is XGBoost code.
"""

# Train-test split on the entire dataset
X = df[features]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Assign class weights
class_weights = df['target'].sum() / df['target'].value_counts()
class_weight_dict = dict(zip(np.unique(y_train), class_weights))
sample_weights = [class_weight_dict[y] for y in y_train]

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', )


"""
Turn the line underneath on to run the model WITH class weights
"""
xgb_model.fit(X_train, y_train, sample_weight=sample_weights) #

"""
Turn the line underneath on to run the model WITHOUT class weights
"""
# xgb_model.fit(X_train, y_train)

print("XGBOOST")
print(f'Feature(s): {features}')

# Evaluate the model
y_pred = xgb_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# If you want to calculate ROC-AUC for multiclass
y_test_bin = pd.get_dummies(y_test)
y_pred_bin = pd.get_dummies(y_pred)
roc_auc = roc_auc_score(y_test_bin, y_pred_bin, average='macro')
print(f'ROC-AUC Score: {roc_auc}')
print('')
print('')

# Calculate Weighted F1 Score
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Weighted F1 Score: {weighted_f1}')


XGBOOST
Feature(s): ['max_acc_value', 'games played prior on current day', 'after2_mean', 'after2_max', 'after1_mean', 'before1_min']

Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.30      0.30        20
           1       0.50      0.14      0.22         7
           2       0.88      0.88      0.88        16
           3       0.44      0.68      0.53        25
           4       0.00      0.00      0.00         9

    accuracy                           0.49        77
   macro avg       0.42      0.40      0.39        77
weighted avg       0.45      0.49      0.45        77

Confusion Matrix:
[[ 6  0  0 14  0]
 [ 4  1  0  2  0]
 [ 0  0 14  2  0]
 [ 6  1  1 17  0]
 [ 4  0  1  4  0]]
ROC-AUC Score: 0.6602616981451106


Weighted F1 Score: 0.4524260461760462


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [55]:
"""
This is the step up model - it only adds the given feature when it is improving the weighted F1 score of the model
"""

step_up_features = []
previous_model_f1 = 'test'
for feature in features:
    
    step_up_features_copy = step_up_features.copy()
    
    print(f'step_up_features = {step_up_features}')
    print(f'step_up_features_copy = {step_up_features_copy} \n')
    
    step_up_features_copy.append(feature)
    # Train-test split on the entire dataset
    X = df[step_up_features_copy]
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Class weights
    class_weights = df['target'].sum() / df['target'].value_counts()
    class_weight_dict = dict(zip(np.unique(y_train), class_weights))
    sample_weights = [class_weight_dict[y] for y in y_train]

    # XGBoost
    xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    
    """
    Turn the line underneath on to run the model WITH class weights
    """
    # xgb_model.fit(X_train, y_train, sample_weight=sample_weights) #
    
    """
    Turn the line underneath on to run the model WITHOUT class weights
    """
    xgb_model.fit(X_train, y_train)
    
    print("XGBOOST")
    print(f'Feature(s): {step_up_features_copy}')
    
    # Evaluate the model
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)  # Suppress UserWarnings if any
        y_pred = xgb_model.predict(X_test)
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

    
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # If you want to calculate ROC-AUC for multiclass
    y_test_bin = pd.get_dummies(y_test)
    y_pred_bin = pd.get_dummies(y_pred)
    roc_auc = roc_auc_score(y_test_bin, y_pred_bin, average='macro')
    print(f'ROC-AUC Score: {roc_auc}')
    print('')
    
    # Calculate Weighted F1 Score
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'Weighted F1 Score: {weighted_f1}')
    
    if previous_model_f1 == 'test':
        step_up_features = step_up_features_copy
        previous_model_f1 = weighted_f1
    else:
        if weighted_f1 <= previous_model_f1:
            print(f'F1 previous model ({previous_model_f1}) >= F1 of current model ({weighted_f1})')
            print(f'Therefore, {feature} IS NOT added to the model.\n \n')
        else:
            print(f'F1 previous model ({previous_model_f1}) <= F1 of current model ({weighted_f1})')
            print(f'Feature {feature} IS added to the model. \n \n')
            step_up_features = step_up_features_copy
            previous_model_f1 = weighted_f1

print(f'final features = {step_up_features}')

# Here the final model is printed once again
X = df[step_up_features]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Class weights
class_weights = df['target'].sum() / df['target'].value_counts()
class_weight_dict = dict(zip(np.unique(y_train), class_weights))
sample_weights = [class_weight_dict[y] for y in y_train]

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

"""
Turn the line underneath on to run the model WITH class weights
"""
# xgb_model.fit(X_train, y_train, sample_weight=sample_weights) #

"""
Turn the line underneath on to run the model WITHOUT class weights
"""
xgb_model.fit(X_train, y_train)

print("XGBOOST")
print(f'Feature(s): {step_up_features_copy}')

# Evaluate the model
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)  # Suppress UserWarnings if any
    y_pred = xgb_model.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# If you want to calculate ROC-AUC for multiclass
y_test_bin = pd.get_dummies(y_test)
y_pred_bin = pd.get_dummies(y_pred)
roc_auc = roc_auc_score(y_test_bin, y_pred_bin, average='macro')
print(f'ROC-AUC Score: {roc_auc}')
print('')

# Calculate Weighted F1 Score
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Weighted F1 Score: {weighted_f1}')

step_up_features = []
step_up_features_copy = [] 
XGBOOST
Feature(s): ['max_acc_value']

Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.10      0.13        20
           1       0.40      0.29      0.33         7
           2       0.75      0.56      0.64        16
           3       0.38      0.68      0.49        25
           4       0.40      0.22      0.29         9

    accuracy                           0.42        77
   macro avg       0.43      0.37      0.38        77
weighted avg       0.41      0.42      0.39        77

Confusion Matrix:
[[ 2  1  1 15  1]
 [ 1  2  0  4  0]
 [ 1  0  9  4  2]
 [ 5  1  2 17  0]
 [ 1  1  0  5  2]]
ROC-AUC Score: 0.6035468974497168

Weighted F1 Score: 0.38961038961038963
step_up_features = ['max_acc_value']
step_up_features_copy = ['max_acc_value'] 
XGBOOST
Feature(s): ['max_acc_value', 'games played prior on current day']

Classification Report:
              precision    recall  f1

In [8]:
"""
Parameter tuning code
NOTE. Make sure that if you want to tune the parameters on the model with the class weights, to also change that in this piece of code.
"""

# Train-test split on the entire dataset
X = df[features]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the model for tuning
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Define the grid of parameters to search
param_grid = {'learning_rate': [0.5, 0.6],
    'max_depth': [4, 5, 6],
    'min_child_weight': [1, 2],
    'gamma': [0, 0.1],
    'subsample': [1, 0.8],
    'colsample_bytree': [0.8, 1.0]
}


# Setup GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)

"""
Turn the line underneath on to run the model WITH class weights
"""
# grid_search.fit(X_train, y_train, sample_weight=sample_weights) #

"""
Turn the line underneath on to run the model WITHOUT class weights
"""
grid_search.fit(X_train, y_train)

# Get the best estimator
best_xgb_model = grid_search.best_estimator_

# Output best parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the best model
print("XGBOOST - Tuned Model")
print(f'Feature(s): {features}')

y_pred = best_xgb_model.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# If you want to calculate ROC-AUC for multiclass
y_test_bin = pd.get_dummies(y_test)
y_pred_bin = pd.get_dummies(y_pred)
roc_auc = roc_auc_score(y_test_bin, y_pred_bin, average='macro')
print(f'ROC-AUC Score: {roc_auc}')


Fitting 3 folds for each of 96 candidates, totalling 288 fits
Best parameters found:  {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.6, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 1}
XGBOOST - Tuned Model
Feature(s): ['max_acc_value', 'games played prior on current day', 'after2_mean', 'after2_max', 'after1_mean', 'before1_min']

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.50      0.40        20
           1       0.00      0.00      0.00         7
           2       0.76      0.81      0.79        16
           3       0.41      0.48      0.44        25
           4       0.00      0.00      0.00         9

    accuracy                           0.45        77
   macro avg       0.30      0.36      0.33        77
weighted avg       0.38      0.45      0.41        77

Confusion Matrix:
[[10  1  0  9  0]
 [ 4  0  0  3  0]
 [ 1  0 13  2  0]
 [11  0  2 12  0]
 [ 4  0  2  3  0]]
ROC-AUC Score: 0.6293550306

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
