## XGBoost Classifier

In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings

In [2]:
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [4]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_val, y_val = valid_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), valid_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins



### XGB Classifier with GridSearchCV

In [7]:
# XG Boost classifier
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# defining parameter range
param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.3],
        'n_estimators': [100, 200, 300],
        'min_child_weight': [1, 3],
        'gamma': [x/10 for x in range(0, 5)]
        }
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train, y_train)
val_score = grid.score(X_val, y_val)

preds = grid.predict(X_val)
test_score = grid.score(X_val, y_val)



In [8]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 18942.409309387207 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.56      0.60      1011
    home_win       0.71      0.79      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.67      0.68      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.6917355371900826
{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100}
best score: 0.6863768115942028
test score 0.6917355371900826


### XGB Classifier with RFE

In [1]:
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline


In [6]:
X_train_rfe = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'W_PCT_home',
            'HOME_RECORD_home', 'ROAD_RECORD_home', 'ROAD_RECORD_away',
            'WIN_PRCT_home_3g', 'PTS_home_3g', 'REB_home_3g', 'WIN_PRCT_away_3g',
            'AST_away_3g', 'REB_away_3g', 'PTS_home_7g', 'AST_home_7g',
            'REB_home_7g', 'AST_away_7g', 'diff_avg_pts_home', 'diff_avg_ast_home',
            'diff_avg_ast_away', 'diff_avg_fg3_pct_home', 'diff_avg_reb_home',
            'diff_avg_reb_away', 'top_players', 'eff_visitor', 'HG_7days',
            'AG_7days', 'G_7days', 'back2back', 'HG_7days_VISITOR',
            'G_7days_VISITOR', 'back2back_visitor', 'missing_players',
            'missing_players_visitor', 'home_elo', 'visitor_elo', 'elo_diff',
            'top_player_diff', 'missing_player_diff', 'eff_diff', 'month',
            'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
            'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_REB_away',
            'Home_Last_5_Avg_AST_away', 'Away_Last_5_Avg_FG3_PCT_home',
            'Away_Last_5_Avg_AST_home', 'Away_Last_5_Avg_PTS_away',
            'Away_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_AST_away',
            'diff_ast_last_3_games', 'diff_ast_last_7_games',
            'diff_reb_last_3_games', 'diff_reb_last_7_games',
            'diff_win_pct_3_last_games', 'diff_curr_win_pct',
            'diff_curr_home_record', 'diff_curr_away_record']]

y_train_rfe = y_train

X_val_rfe = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'W_PCT_home',
            'HOME_RECORD_home', 'ROAD_RECORD_home', 'ROAD_RECORD_away',
            'WIN_PRCT_home_3g', 'PTS_home_3g', 'REB_home_3g', 'WIN_PRCT_away_3g',
            'AST_away_3g', 'REB_away_3g', 'PTS_home_7g', 'AST_home_7g',
            'REB_home_7g', 'AST_away_7g', 'diff_avg_pts_home', 'diff_avg_ast_home',
            'diff_avg_ast_away', 'diff_avg_fg3_pct_home', 'diff_avg_reb_home',
            'diff_avg_reb_away', 'top_players', 'eff_visitor', 'HG_7days',
            'AG_7days', 'G_7days', 'back2back', 'HG_7days_VISITOR',
            'G_7days_VISITOR', 'back2back_visitor', 'missing_players',
            'missing_players_visitor', 'home_elo', 'visitor_elo', 'elo_diff',
            'top_player_diff', 'missing_player_diff', 'eff_diff', 'month',
            'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
            'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_REB_away',
            'Home_Last_5_Avg_AST_away', 'Away_Last_5_Avg_FG3_PCT_home',
            'Away_Last_5_Avg_AST_home', 'Away_Last_5_Avg_PTS_away',
            'Away_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_AST_away',
            'diff_ast_last_3_games', 'diff_ast_last_7_games',
            'diff_reb_last_3_games', 'diff_reb_last_7_games',
            'diff_win_pct_3_last_games', 'diff_curr_win_pct',
            'diff_curr_home_record', 'diff_curr_away_record']]

y_val_rfe = y_val

In [7]:
# XG Boost classifier
model = xgb.XGBClassifier()
model.fit(X_train_rfe, y_train_rfe)

# defining parameter range
param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.3],
        'n_estimators': [100, 200, 300],
        'min_child_weight': [1, 3],
        'gamma': [x/10 for x in range(0, 5)]
        }
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train_rfe, y_train_rfe)
val_score = grid.score(X_val_rfe, y_val_rfe)

preds = grid.predict(X_val_rfe)
test_score = grid.score(X_val_rfe, y_val_rfe)



In [8]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_rfe, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_rfe)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 1681.9006819725037 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.56      0.60      1011
    home_win       0.71      0.78      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.67      0.67      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.6900826446280992
{'gamma': 0.4, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100}
best score: 0.6905797101449276
test score 0.6900826446280992


### XGB with RFECV

In [9]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train

X_val_rcv = valid_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_val_rcv = y_val

In [10]:
# XG Boost classifier
model = xgb.XGBClassifier()
model.fit(X_train_rcv, y_train_rcv)

# defining parameter range
param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.3],
        'n_estimators': [100, 200, 300],
        'min_child_weight': [1, 3],
        'gamma': [x/10 for x in range(0, 5)]
        }
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train_rcv, y_train_rcv)
val_score = grid.score(X_val_rcv, y_val_rcv)

preds = grid.predict(X_val_rcv)
test_score = grid.score(X_val_rcv, y_val_rcv)



In [11]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_rcv, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_rcv)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 171.41004133224487 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.57      0.61      1011
    home_win       0.72      0.78      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.67      0.68      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.6913223140495868
{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100}
best score: 0.6960869565217391
test score 0.6913223140495868


### XGB with Univariate Feature Selection

In [12]:
X_train_uni = train_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_train_uni = y_train

X_val_uni = valid_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_val_uni = y_val

In [13]:
# XG Boost classifier
model = xgb.XGBClassifier()
model.fit(X_train_uni, y_train_uni)

# defining parameter range
param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.3],
        'n_estimators': [100, 200, 300],
        'min_child_weight': [1, 3],
        'gamma': [x/10 for x in range(0, 5)]
        }
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train_uni, y_train_uni)
val_score = grid.score(X_val_uni, y_val_uni)

preds = grid.predict(X_val_uni)
test_score = grid.score(X_val_uni, y_val_uni)



In [14]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_uni, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_uni)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 265.21026492118835 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.58      0.61      1011
    home_win       0.72      0.77      0.74      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.6917355371900826
{'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100}
best score: 0.6978260869565218
test score 0.6917355371900826


### XGB with Extra Trees Classifier

In [15]:
X_train_extra = train_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_train_extra = y_train

X_val_extra = valid_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_val_extra = y_val

In [16]:
# XG Boost classifier
model = xgb.XGBClassifier()
model.fit(X_train_extra, y_train_extra)

# defining parameter range
param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.3],
        'n_estimators': [100, 200, 300],
        'min_child_weight': [1, 3],
        'gamma': [x/10 for x in range(0, 5)]
        }
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train_extra, y_train_extra)
val_score = grid.score(X_val_extra, y_val_extra)

preds = grid.predict(X_val_extra)
test_score = grid.score(X_val_extra, y_val_extra)



In [18]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_extra, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_extra)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 571.7195553779602 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.56      0.60      1011
    home_win       0.71      0.78      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.67      0.67      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.6896694214876034
{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100}
best score: 0.6904347826086956
test score 0.6896694214876034


### XGB with PCA

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [19]:
scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train)
X_val_standard = scaler.transform(X_val)
X_test_standard = scaler.transform(X_test)

In [20]:
n_components = 30
pca = PCA(n_components=n_components).fit(X_train)
X_train_pca = pca.transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

In [21]:
# XG Boost classifier
model = xgb.XGBClassifier()
model.fit(X_train_pca, y_train)

# defining parameter range
param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.3],
        'n_estimators': [100, 200, 300],
        'min_child_weight': [1, 3],
        'gamma': [x/10 for x in range(0, 5)]
        }
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train_pca, y_train)
val_score = grid.score(X_val_pca, y_val)

preds = grid.predict(X_val_pca)
test_score = grid.score(X_val_pca, y_val)



In [22]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_pca)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 1514.1459152698517 seconds
              precision    recall  f1-score   support

   home_loss       0.67      0.53      0.59      1011
    home_win       0.71      0.81      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.69      0.67      0.67      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.6925619834710743
{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100}
best score: 0.6814492753623188
test score 0.6925619834710743


### XGB with SelectFromModel(LassoCV)

In [5]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline

In [8]:
pipe_model = Pipeline([
  ('feature_selection', SelectFromModel(LassoCV())),
  ('classification', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100))
])
pipe_model.fit(X_train, y_train)

start_time = time.time()


In [9]:
# Predict
preds = pipe_model.predict(X_val)
test_score = pipe_model.score(X_val, y_val)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))

print("test score", test_score)


Συνολικός χρόνος fit και predict: 1.3265442848205566 seconds
              precision    recall  f1-score   support

   home loss       0.67      0.53      0.59      1011
    home win       0.71      0.81      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.69      0.67      0.67      2420
weighted avg       0.69      0.69      0.69      2420

test score 0.6921487603305785


#### Use lasso_data

In [10]:
X_train_lasso = train_data[['elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_lasso = y_train

X_val_lasso = valid_data[['elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_val_lasso = y_val

In [11]:
# XG Boost classifier
model = xgb.XGBClassifier()
model.fit(X_train_lasso, y_train_lasso)

# defining parameter range
param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.3],
        'n_estimators': [100, 200, 300],
        'min_child_weight': [1, 3],
        'gamma': [x/10 for x in range(0, 5)]
        }
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train_lasso, y_train_lasso)
val_score = grid.score(X_val_lasso, y_val_lasso)

preds = grid.predict(X_val_lasso)
test_score = grid.score(X_val_lasso, y_val_lasso)



In [12]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_lasso, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_lasso)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 408.21502900123596 seconds
              precision    recall  f1-score   support

   home_loss       0.66      0.58      0.62      1011
    home_win       0.72      0.78      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420

val score: 0.6979338842975207
{'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100}
best score: 0.6914492753623189
test score 0.6979338842975207


### XGB with Sequential Feature Selection

#### Forward

In [13]:
X_train_for_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_train_for_sfs = y_train

X_val_for_sfs = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_val_for_sfs = y_val


In [14]:
# XG Boost classifier
model = xgb.XGBClassifier()
model.fit(X_train_for_sfs, y_train_for_sfs)

# defining parameter range
param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.3],
        'n_estimators': [100, 200, 300],
        'min_child_weight': [1, 3],
        'gamma': [x/10 for x in range(0, 5)]
        }
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train_for_sfs, y_train_for_sfs)
val_score = grid.score(X_val_for_sfs, y_val_for_sfs)

preds = grid.predict(X_val_for_sfs)
test_score = grid.score(X_val_for_sfs, y_val_for_sfs)



In [15]:
target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_for_sfs, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_for_sfs)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 1268.6099355220795 seconds
              precision    recall  f1-score   support

   home loss       0.66      0.58      0.62      1011
    home win       0.72      0.78      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420

val score: 0.6979338842975207
{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100}
best score: 0.691159420289855
test score 0.6979338842975207


#### Backwards

In [16]:
X_train_back_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_train_back_sfs = y_train

X_val_back_sfs = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_val_back_sfs = y_val

In [17]:
# XG Boost classifier
model = xgb.XGBClassifier()
model.fit(X_train_back_sfs, y_train_back_sfs)

# defining parameter range
param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.3],
        'n_estimators': [100, 200, 300],
        'min_child_weight': [1, 3],
        'gamma': [x/10 for x in range(0, 5)]
        }
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

start_time = time.time()
grid.fit(X_train_back_sfs, y_train_back_sfs)
val_score = grid.score(X_val_back_sfs, y_val_back_sfs)

preds = grid.predict(X_val_back_sfs)
test_score = grid.score(X_val_back_sfs, y_val_back_sfs)



In [18]:
target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_back_sfs, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_back_sfs)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 995.6403968334198 seconds
              precision    recall  f1-score   support

   home loss       0.66      0.58      0.62      1011
    home win       0.72      0.78      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420

val score: 0.6979338842975207
{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100}
best score: 0.691159420289855
test score 0.6979338842975207
