## XGBoost Classifier

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings

In [3]:
warnings.filterwarnings('ignore')


In [4]:
# Load the engineered feature table and keep only rows without any missing value.
df = pd.read_csv('../../FeatureEngineering/MetaData/data6_&_odds.csv').dropna()

In [5]:
# Split by season: rows with 2007 <= season < 2016 form the training set,
# rows with season >= 2016 form the test set.
train_data = df.loc[(df.season >= 2007) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]

# Frames that keep every column except the target.
X = train_data.drop(columns=['home_team_wins'])
y = train_data.home_team_wins
test_X = test_data.drop(columns=['home_team_wins'])
test_y = test_data.home_team_wins

# Columns excluded from the model inputs (the target is among them).
dropped_columns = [
    "game_date_est", "season", "game_id", "home_team", "visitor_team",
    "home_team_id", "visitor_team_id", "home_team_wins",
    "conference", "conference_visitor",
]

# Split our data
X_train = train_data.drop(columns=dropped_columns)
y_train = train_data.home_team_wins
X_test = test_data.drop(columns=dropped_columns)
y_test = test_data.home_team_wins


In [6]:
def evaluate(model, X_train, X_test, y_train, y_test):
    """Print classification diagnostics for a fitted model on both splits.

    For the training split and then the testing split, prints a heading,
    the confusion matrix, the accuracy (four decimals) and the
    classification report of the model's predictions against the labels.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_train, X_test : feature inputs passed to ``model.predict``.
    y_train, y_test : true labels matching ``X_train`` / ``X_test``.

    Returns
    -------
    None. All results are written to stdout.
    """
    test_predictions = model.predict(X_test)
    train_predictions = model.predict(X_train)

    sections = (
        ("Training Results: \n===============================", y_train, train_predictions),
        ("Testing Results: \n===============================", y_test, test_predictions),
    )

    for heading, actual, predicted in sections:
        print(heading)
        report = classification_report(actual, predicted)
        print(f"Confusion Matrix:\n{confusion_matrix(actual, predicted)}")
        print(f"Accuracy Score:\n{accuracy_score(actual, predicted):.4f}")
        print(f"Classification Report:\n{report}")

### XGB with Lasso-selected features

In [7]:
# Feature subset used for this experiment; defined once so that the train and
# test frames are guaranteed to share the same columns in the same order.
lasso_columns = [
    'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
    'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
    'home_elo',
]

X_train_lasso = train_data[lasso_columns]
X_test_lasso = test_data[lasso_columns]

# The targets are the same series as in the full train/test split.
y_train_lasso = y_train
y_test_lasso = y_test

In [8]:
# XG Boost classifier
model = xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, 
min_child_weight=3, n_estimators=100)
model.fit(X_train_lasso, y_train_lasso)

start_time = time.time()

preds = model.predict(X_test_lasso)
test_score = model.score(X_test_lasso, y_test_lasso)



In [9]:
# Display names for the two classes, passed to the classification report.
target_names = ['home_loss', 'home_win']

# Seconds elapsed since start_time was recorded in the model cell.
elapsed_seconds = time.time() - start_time

print("Συνολικός χρόνος fit και predict: %s seconds" % elapsed_seconds)
print(classification_report(y_test_lasso, preds, target_names=target_names))
print("test score", test_score)


Συνολικός χρόνος fit και predict: 2.881075382232666 seconds
              precision    recall  f1-score   support

   home_loss       0.64      0.55      0.59      1935
    home_win       0.70      0.77      0.73      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.68      0.67      4583

test score 0.6779402138337334


In [10]:
# Train/test diagnostics for the model fitted on the Lasso feature subset.
evaluate(
    model=model,
    X_train=X_train_lasso,
    X_test=X_test_lasso,
    y_train=y_train_lasso,
    y_test=y_test_lasso,
)

Training Results: 
Confusion Matrix:
[[2143 1652]
 [1019 4506]]
Accuracy Score:
0.7134
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.56      0.62      3795
           1       0.73      0.82      0.77      5525

    accuracy                           0.71      9320
   macro avg       0.70      0.69      0.69      9320
weighted avg       0.71      0.71      0.71      9320

Testing Results: 
Confusion Matrix:
[[1063  872]
 [ 604 2044]]
Accuracy Score:
0.6779
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.55      0.59      1935
           1       0.70      0.77      0.73      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.68      0.67      4583



### XGB with Sequential Feature Selection

#### Forward

In [9]:
# Columns used for the forward-selection experiment; defined once so that the
# train and test frames cannot drift apart.
forward_sfs_columns = [
    'num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
    'back2back_visitor', 'missing_players', 'missing_players_visitor',
    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
    'diff_home_record_last_season', 'diff_road_record_last_season',
    'diff_curr_win_pct',
]

X_train_for_sfs = train_data[forward_sfs_columns]
X_test_for_sfs = test_data[forward_sfs_columns]

# The targets are the same series as in the full train/test split.
y_train_for_sfs = y_train
y_test_for_sfs = y_test


In [10]:
# XG Boost classifier
model = xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, 
min_child_weight=1, n_estimators=100)
model.fit(X_train_for_sfs, y_train_for_sfs)


start_time = time.time()

preds = model.predict(X_test_for_sfs)
test_score = model.score(X_test_for_sfs, y_test_for_sfs)



In [11]:
# Display names for the two classes, passed to the classification report.
target_names = ['home loss', 'home win']

# Seconds elapsed since start_time was recorded in the model cell.
elapsed_seconds = time.time() - start_time

print("Συνολικός χρόνος fit και predict: %s seconds" % elapsed_seconds)
print(classification_report(y_test_for_sfs, preds, target_names=target_names))

# Second prediction pass over the same test features; not read again in this cell.
y_fit = model.predict(X_test_for_sfs)

print("test score", test_score)

Συνολικός χρόνος fit και predict: 7.852690696716309 seconds
              precision    recall  f1-score   support

   home loss       0.63      0.53      0.58      1935
    home win       0.69      0.78      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.65      4583
weighted avg       0.67      0.67      0.67      4583

test score 0.672267073969016


#### Backward

In [12]:
# Columns used for the backward-selection experiment; defined once so that the
# train and test frames cannot drift apart.
backward_sfs_columns = [
    'num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
    'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
    'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
    'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
    'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
    'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
    'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
    'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
    'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
    'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
    'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
    'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
    'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
    'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
    'diff_ast_last_7_games', 'diff_reb_last_3_games',
    'diff_win_pct_3_last_games',
]

X_train_back_sfs = train_data[backward_sfs_columns]
X_test_back_sfs = test_data[backward_sfs_columns]

# The targets are the same series as in the full train/test split.
y_train_back_sfs = y_train
y_test_back_sfs = y_test

In [13]:
# XG Boost classifier
model = xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, 
min_child_weight=1, n_estimators=100)
model.fit(X_train_back_sfs, y_train_back_sfs)


start_time = time.time()

preds = model.predict(X_test_back_sfs)
test_score = model.score(X_test_back_sfs, y_test_back_sfs)



In [14]:
# Display names for the two classes, passed to the classification report.
target_names = ['home loss', 'home win']

# Seconds elapsed since start_time was recorded in the model cell.
elapsed_seconds = time.time() - start_time

print("Συνολικός χρόνος fit και predict: %s seconds" % elapsed_seconds)
print(classification_report(y_test_back_sfs, preds, target_names=target_names))

# Second prediction pass over the same test features; not read again in this cell.
y_fit = model.predict(X_test_back_sfs)

print("test score", test_score)

Συνολικός χρόνος fit και predict: 9.542069911956787 seconds
              precision    recall  f1-score   support

   home loss       0.63      0.54      0.58      1935
    home win       0.70      0.77      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.66      4583
weighted avg       0.67      0.67      0.67      4583

test score 0.6724852716561205
