## Random Forest Classifier

In [78]:
# Importing libraries
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

In [11]:
warnings.filterwarnings('ignore')

In [12]:
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [13]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_val, y_val = valid_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), valid_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins



### Simple RF Classifier with GridSearchCV

In [9]:
# Random Forest Classifier

model_first_split = RandomForestClassifier()
model_first_split.fit(X_train, y_train)

# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(model_first_split, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train, y_train)



GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [14]:
# Predict on the validation set
preds = grid.predict(X_val)
test_score = grid.score(X_val, y_val)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_val)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 550.8153944015503 seconds
              precision    recall  f1-score   support

   home loss       0.67      0.54      0.60      1011
    home win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

{'bootstrap': True, 'max_depth': 9, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 100}
best score: 0.6931884057971015
test score 0.6991735537190082


#### Use Bagging Classifier

In [15]:
rf_model = RandomForestClassifier(n_estimators=100)

bag_model = BaggingClassifier(base_estimator=rf_model, random_state=42)
start_time = time.time()
bag_model.fit(X_train, y_train)
val_score = bag_model.score(X_val, y_val)

preds = bag_model.predict(X_val)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

Συνολικός χρόνος fit και predict: 33.00279450416565 seconds
              precision    recall  f1-score   support

   home_loss       0.68      0.54      0.60      1011
    home_win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

val score: 0.6991735537190082


### RF Classifier with SelectFromModel Function

In [16]:
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [17]:
# Random Forest Classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier()

In [18]:
# Create a selector object that will use the random forest classifier to identify
sfm = SelectFromModel(clf)

# Train the selector
sfm.fit(X_train, y_train)

sfm.get_support()


array([False,  True,  True, False,  True,  True, False, False,  True,
        True,  True, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False, False, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True, False, False, False,  True,  True, False, False,
       False, False, False, False,  True, False, False,  True,  True,
        True, False, False, False, False,  True,  True, False, False,
       False, False, False, False, False, False, False, False,  True,
        True,  True, False, False,  True, False, False,  True,  True,
        True, False, False, False, False,  True, False, False, False,
       False, False,  True,  True, False, False, False, False,  True,
       False,  True, False, False,  True, False, False,  True,  True,
        True, False, False, False, False,  True, False,  True, False,
        True,  True,  True,  True,  True])

In [19]:
selected_feat = X_train.columns[(sfm.get_support())]
len(selected_feat)
print(selected_feat)

Index(['odds_home', 'odds_away', 'W_PCT_home', 'HOME_RECORD_home',
       'W_PCT_away', 'HOME_RECORD_away', 'ROAD_RECORD_away', 'FT_PCT_home_3g',
       'FG3_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g',
       'FT_PCT_home_7g', 'REB_home_7g', 'FT_PCT_away_7g', 'FG3_PCT_away_7g',
       'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_ft_pct_home',
       'diff_avg_ft_pct_away', 'eff', 'eff_visitor', 'home_elo', 'visitor_elo',
       'elo_diff', 'eff_diff', 'Home_Last_5_Avg_FG_PCT_home',
       'Home_Last_5_Avg_FT_PCT_home', 'Home_Last_5_Avg_FG3_PCT_home',
       'Home_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_FT_PCT_home',
       'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
       'Away_Last_5_Avg_FG3_PCT_away', 'diff_fg_pct_last_3_games',
       'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
       'diff_ft_pct_last_7_games', 'diff_win_pct_prev_season',
       'diff_win_pct_7_last_games', 'diff_road_record_last_season',
       'diff_curr_win

In [20]:
X_sfm = train_data[['odds_home', 'odds_away', 'W_PCT_home', 'HOME_RECORD_home',
       'ROAD_RECORD_home', 'W_PCT_away', 'HOME_RECORD_away',
       'ROAD_RECORD_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g',
       'FT_PCT_away_3g', 'FT_PCT_home_7g', 'FT_PCT_away_7g',
       'diff_avg_fg3_pct_home', 'diff_avg_fg3_pct_away',
       'diff_avg_ft_pct_home', 'diff_avg_ft_pct_away', 'diff_avg_reb_away',
       'eff', 'eff_visitor', 'home_elo', 'visitor_elo', 'elo_diff', 'eff_diff',
       'Home_Last_5_Avg_FT_PCT_home', 'Home_Last_5_Avg_FG3_PCT_home',
       'Home_Last_5_Avg_FG_PCT_away', 'Home_Last_5_Avg_FT_PCT_away',
       'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_FG3_PCT_home',
       'Away_Last_5_Avg_FG_PCT_away', 'Away_Last_5_Avg_FT_PCT_away',
       'Away_Last_5_Avg_FG3_PCT_away', 'diff_fg_pct_last_7_games',
       'diff_ft_pct_last_3_games', 'diff_ft_pct_last_7_games',
       'diff_win_pct_prev_season', 'diff_home_record_last_season',
       'diff_road_record_last_season', 'diff_curr_win_pct',
       'diff_curr_home_record', 'diff_curr_away_record']]
y_sfm = y_train

X_val_sfm = valid_data[['odds_home', 'odds_away', 'W_PCT_home', 'HOME_RECORD_home',
       'ROAD_RECORD_home', 'W_PCT_away', 'HOME_RECORD_away',
       'ROAD_RECORD_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g',
       'FT_PCT_away_3g', 'FT_PCT_home_7g', 'FT_PCT_away_7g',
       'diff_avg_fg3_pct_home', 'diff_avg_fg3_pct_away',
       'diff_avg_ft_pct_home', 'diff_avg_ft_pct_away', 'diff_avg_reb_away',
       'eff', 'eff_visitor', 'home_elo', 'visitor_elo', 'elo_diff', 'eff_diff',
       'Home_Last_5_Avg_FT_PCT_home', 'Home_Last_5_Avg_FG3_PCT_home',
       'Home_Last_5_Avg_FG_PCT_away', 'Home_Last_5_Avg_FT_PCT_away',
       'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_FG3_PCT_home',
       'Away_Last_5_Avg_FG_PCT_away', 'Away_Last_5_Avg_FT_PCT_away',
       'Away_Last_5_Avg_FG3_PCT_away', 'diff_fg_pct_last_7_games',
       'diff_ft_pct_last_3_games', 'diff_ft_pct_last_7_games',
       'diff_win_pct_prev_season', 'diff_home_record_last_season',
       'diff_road_record_last_season', 'diff_curr_win_pct',
       'diff_curr_home_record', 'diff_curr_away_record']]

y_val_sfm = y_val

# Split our data
X_train_sfm, y_train_sfm = X_sfm, y_sfm

# Split Data to Train and Validation
# X_train_sfm, X_val_sfm, y_train_sfm, y_val_sfm = train_test_split(X_train_sfm, y_train_sfm, test_size=0.2, random_state=1)


In [21]:
# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train_sfm, y_train_sfm)

GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [22]:
# Predict
preds = grid.predict(X_val_sfm)
test_score = grid.score(X_val_sfm, y_val_sfm)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_sfm, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_val_sfm)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 445.25380182266235 seconds
              precision    recall  f1-score   support

   home loss       0.67      0.57      0.62      1011
    home win       0.72      0.80      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.69      2420
weighted avg       0.70      0.70      0.70      2420

{'bootstrap': True, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 100}
best score: 0.6956521739130435
test score 0.7024793388429752


### RF Classifier with RFECV

In [23]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train

X_val_rcv = valid_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_val_rcv = y_val

X_test_rcv = test_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_test_rcv = y_test

In [24]:
estimator = RandomForestClassifier()
estimator.fit(X_train_rcv, y_train_rcv)

# defining parameter range
param_grid = [{'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}]           

grid = GridSearchCV(estimator, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_rcv, y_train_rcv)

GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid=[{'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                          'max_features': ['auto', 'sqrt', 'log2'],
                          'min_samples_leaf': [3, 4, 5],
                          'min_samples_split': [8, 10, 12],
                          'n_estimators': [100]}],
             scoring='accuracy')

In [25]:
# Predict
preds = grid.predict(X_val_rcv)
test_score = grid.score(X_val_rcv, y_val_rcv)

target_names=['home loss', 'home win']

start_time = time.time()

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_rcv, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_val_rcv)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 0.0 seconds
              precision    recall  f1-score   support

   home loss       0.65      0.58      0.61      1011
    home win       0.72      0.77      0.74      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420

{'bootstrap': True, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 100}
best score: 0.6988405797101449
test score 0.6913223140495868


### RF Classifier with RFE

In [26]:
estimator = RandomForestClassifier()
estimator.fit(X_train, y_train)

RandomForestClassifier()

In [27]:
X_train_rfe = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'W_PCT_home',
            'HOME_RECORD_home', 'ROAD_RECORD_home', 'ROAD_RECORD_away',
            'WIN_PRCT_home_3g', 'PTS_home_3g', 'REB_home_3g', 'WIN_PRCT_away_3g',
            'AST_away_3g', 'REB_away_3g', 'PTS_home_7g', 'AST_home_7g',
            'REB_home_7g', 'AST_away_7g', 'diff_avg_pts_home', 'diff_avg_ast_home',
            'diff_avg_ast_away', 'diff_avg_fg3_pct_home', 'diff_avg_reb_home',
            'diff_avg_reb_away', 'top_players', 'eff_visitor', 'HG_7days',
            'AG_7days', 'G_7days', 'back2back', 'HG_7days_VISITOR',
            'G_7days_VISITOR', 'back2back_visitor', 'missing_players',
            'missing_players_visitor', 'home_elo', 'visitor_elo', 'elo_diff',
            'top_player_diff', 'missing_player_diff', 'eff_diff', 'month',
            'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
            'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_REB_away',
            'Home_Last_5_Avg_AST_away', 'Away_Last_5_Avg_FG3_PCT_home',
            'Away_Last_5_Avg_AST_home', 'Away_Last_5_Avg_PTS_away',
            'Away_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_AST_away',
            'diff_ast_last_3_games', 'diff_ast_last_7_games',
            'diff_reb_last_3_games', 'diff_reb_last_7_games',
            'diff_win_pct_3_last_games', 'diff_curr_win_pct',
            'diff_curr_home_record', 'diff_curr_away_record']]

y_train_rfe = y_train

X_val_rfe = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'W_PCT_home',
            'HOME_RECORD_home', 'ROAD_RECORD_home', 'ROAD_RECORD_away',
            'WIN_PRCT_home_3g', 'PTS_home_3g', 'REB_home_3g', 'WIN_PRCT_away_3g',
            'AST_away_3g', 'REB_away_3g', 'PTS_home_7g', 'AST_home_7g',
            'REB_home_7g', 'AST_away_7g', 'diff_avg_pts_home', 'diff_avg_ast_home',
            'diff_avg_ast_away', 'diff_avg_fg3_pct_home', 'diff_avg_reb_home',
            'diff_avg_reb_away', 'top_players', 'eff_visitor', 'HG_7days',
            'AG_7days', 'G_7days', 'back2back', 'HG_7days_VISITOR',
            'G_7days_VISITOR', 'back2back_visitor', 'missing_players',
            'missing_players_visitor', 'home_elo', 'visitor_elo', 'elo_diff',
            'top_player_diff', 'missing_player_diff', 'eff_diff', 'month',
            'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
            'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_REB_away',
            'Home_Last_5_Avg_AST_away', 'Away_Last_5_Avg_FG3_PCT_home',
            'Away_Last_5_Avg_AST_home', 'Away_Last_5_Avg_PTS_away',
            'Away_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_AST_away',
            'diff_ast_last_3_games', 'diff_ast_last_7_games',
            'diff_reb_last_3_games', 'diff_reb_last_7_games',
            'diff_win_pct_3_last_games', 'diff_curr_win_pct',
            'diff_curr_home_record', 'diff_curr_away_record']]

y_val_rfe = y_val



In [28]:
# # Random Forest Classifier
# pipe_model = Pipeline([
#   ('feature_selection', RFECV(RandomForestClassifier())),
#   ('classification', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8,
#                                             n_estimators=100))
# ])
# pipe_model.fit(X_train, y_train)

In [29]:
# defining parameter range
param_grid = [{'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}]           

grid = GridSearchCV(estimator, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_rfe, y_train_rfe)

GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid=[{'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                          'max_features': ['auto', 'sqrt', 'log2'],
                          'min_samples_leaf': [3, 4, 5],
                          'min_samples_split': [8, 10, 12],
                          'n_estimators': [100]}],
             scoring='accuracy')

In [30]:
# Predict
preds = grid.predict(X_val_rfe)
test_score = grid.score(X_val_rfe, y_val_rfe)

target_names=['home loss', 'home win']

start_time = time.time()

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_rfe, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_val_rfe)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 0.0 seconds
              precision    recall  f1-score   support

   home loss       0.67      0.56      0.61      1011
    home win       0.72      0.80      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.70      2420

{'bootstrap': True, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 100}
best score: 0.6971014492753623
test score 0.7


### RF Classifier with LassoCV

In [31]:
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline

In [32]:
pipe_model = Pipeline([
  ('feature_selection', SelectFromModel(LassoCV())),
  ('pca', PCA()),
  ('classification', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8,
                                            n_estimators=100))
])
pipe_model.fit(X_train, y_train)

Pipeline(steps=[('feature_selection', SelectFromModel(estimator=LassoCV())),
                ('classification',
                 RandomForestClassifier(max_depth=11, min_samples_leaf=4,
                                        min_samples_split=8))])

In [33]:
# Predict
preds = pipe_model.predict(X_val)
test_score = pipe_model.score(X_val, y_val)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))

print("test score", test_score)


Συνολικός χρόνος fit και predict: 2.623943567276001 seconds
              precision    recall  f1-score   support

   home loss       0.66      0.52      0.58      1011
    home win       0.70      0.81      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.66      0.67      2420
weighted avg       0.68      0.69      0.68      2420

test score 0.6867768595041323


### RF with ExtraTreesClassifier

In [34]:
X_train_extra = train_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_train_extra = y_train

X_val_extra = valid_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_val_extra = y_val

In [35]:
# Random Forest Classifier

model_first_split = RandomForestClassifier()
model_first_split.fit(X_train_extra, y_train_extra)

# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(model_first_split, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train_extra, y_train_extra)



GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [36]:
# Predict on the validation set
preds = grid.predict(X_val_extra)
test_score = grid.score(X_val_extra, y_val_extra)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_extra, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_val_extra)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 311.9265122413635 seconds
              precision    recall  f1-score   support

   home loss       0.66      0.57      0.61      1011
    home win       0.72      0.79      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420

{'bootstrap': True, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 100}
best score: 0.6933333333333332
test score 0.6958677685950413


### RF Classifier with PCA

In [37]:
from sklearn.decomposition import PCA

In [88]:
# Choose number of components = 30 based on a feature selection coding process we already executed
n_components = 7

pca = PCA(n_components=n_components).fit(X_train)
X_train_pca = pca.transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

In [89]:
# Random Forest Classifier

model_first_split = RandomForestClassifier()
model_first_split.fit(X_train_pca, y_train)

# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(model_first_split, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train_pca, y_train)



GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [90]:
# Predict on the validation set
preds = grid.predict(X_val_pca)
test_score = grid.score(X_val_pca, y_val)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_val_pca)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 311.37569785118103 seconds
              precision    recall  f1-score   support

   home loss       0.65      0.53      0.58      1011
    home win       0.70      0.80      0.75      1409

    accuracy                           0.68      2420
   macro avg       0.68      0.66      0.66      2420
weighted avg       0.68      0.68      0.68      2420

{'bootstrap': True, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 100}
best score: 0.6828985507246376
test score 0.6847107438016529


### RF Classifier with Univariate Feature Selection

In [41]:
X_train_uni = train_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_train_uni = y_train

X_val_uni = valid_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_val_uni = y_val

In [42]:
# Random Forest Classifier

model_first_split = RandomForestClassifier()
model_first_split.fit(X_train_uni, y_train_uni)

# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(model_first_split, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train_uni, y_train_uni)



GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [43]:
# Predict on the validation set
preds = grid.predict(X_val_uni)
test_score = grid.score(X_val_uni, y_val_uni)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_uni, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_val_uni)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 211.1590292453766 seconds
              precision    recall  f1-score   support

   home loss       0.65      0.58      0.61      1011
    home win       0.72      0.77      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420

{'bootstrap': True, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
best score: 0.6959420289855072
test score 0.6929752066115702


### RF Classifier with SFS

#### Forward

In [44]:
X_train_for_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_train_for_sfs = y_train

X_val_for_sfs = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_val_for_sfs = y_val


In [45]:
# Random Forest Classifier

model_first_split = RandomForestClassifier()
model_first_split.fit(X_train_for_sfs, y_train_for_sfs)

# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(model_first_split, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train_for_sfs, y_train_for_sfs)



GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [46]:
# Predict on the validation set
preds = grid.predict(X_val_for_sfs)
test_score = grid.score(X_val_for_sfs, y_val_for_sfs)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_for_sfs, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_val_for_sfs)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 404.76044750213623 seconds
              precision    recall  f1-score   support

   home loss       0.66      0.58      0.61      1011
    home win       0.72      0.79      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.70      2420

{'bootstrap': True, 'max_depth': 11, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
best score: 0.698695652173913
test score 0.6987603305785124


#### Backwards

In [47]:
X_train_back_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_train_back_sfs = y_train

X_val_back_sfs = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_val_back_sfs = y_val

In [48]:
# Random Forest Classifier

model_first_split = RandomForestClassifier()
model_first_split.fit(X_train_back_sfs, y_train_back_sfs)

# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(model_first_split, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train_back_sfs, y_train_back_sfs)



GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [49]:
# Predict on the validation set
preds = grid.predict(X_val_back_sfs)
test_score = grid.score(X_val_back_sfs, y_val_back_sfs)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_back_sfs, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_val_back_sfs)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 407.9315481185913 seconds
              precision    recall  f1-score   support

   home loss       0.66      0.58      0.62      1011
    home win       0.72      0.78      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

{'bootstrap': True, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
best score: 0.6965217391304348
test score 0.6983471074380165


### Make some combinations

In [82]:
pca = PCA().fit(X_train_)
X_train_pca = pca.transform(X_train_)
X_val_pca = pca.transform(X_val_)
# X_test_pca = pca.transform(X_test_for_sfs)

In [83]:
# Random Forest Classifier

model_first_split = RandomForestClassifier()
model_first_split.fit(X_train_pca, y_train)

# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(model_first_split, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train_pca, y_train)



GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [84]:
# Predict on the validation set
preds = grid.predict(X_val_pca)
test_score = grid.score(X_val_pca, y_val)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_val_pca)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 524.8731763362885 seconds
              precision    recall  f1-score   support

   home loss       0.66      0.51      0.58      1011
    home win       0.70      0.82      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.66      0.66      2420
weighted avg       0.68      0.69      0.68      2420

{'bootstrap': True, 'max_depth': 9, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 12, 'n_estimators': 100}
best score: 0.6847826086956522
test score 0.687603305785124
