## Random Forest Classifier

In [1]:
# Importing libraries
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [4]:
train_data = df.loc[(df.season < 2016) & (df.season >= 2007)]
test_data = df.loc[df.season >= 2016]

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins


In [5]:
def evaluate(model, X_train, X_test, y_train, y_test):
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    print("Training Results: \n===============================")
    clf_report = classification_report(y_train, y_train_pred)
    print(f"Confusion Matrix:\n{confusion_matrix(y_train, y_train_pred)}")
    print(f"Accuracy Score:\n{accuracy_score(y_train, y_train_pred):.4f}")
    print(f"Classification Report:\n{clf_report}")

    print("Testing Results: \n===============================")
    clf_report = classification_report(y_test, y_test_pred)
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
    print(f"Accuracy Score:\n{accuracy_score(y_test, y_test_pred):.4f}")
    print(f"Classification Report:\n{clf_report}")

### Simple RF Classifier with GridSearchCV

In [6]:
# Random Forest Classifier

model = RandomForestClassifier(bootstrap=True, max_depth=9, max_features='log2', 
min_samples_leaf=4, min_samples_split=8, n_estimators=100)
model.fit(X_train, y_train)
start_time = time.time()


In [7]:
# Predict on the validation set
preds = model.predict(X_test)
test_score = model.score(X_test, y_test)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))

y_fit = model.predict(X_test)

print("test score", test_score)


Συνολικός χρόνος fit και predict: 4.882997751235962 seconds
              precision    recall  f1-score   support

   home loss       0.65      0.50      0.56      1935
    home win       0.69      0.80      0.74      2648

    accuracy                           0.67      4583
   macro avg       0.67      0.65      0.65      4583
weighted avg       0.67      0.67      0.66      4583

test score 0.673139864717434


### RF Classifier with SelectFromModel Function

In [7]:
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [8]:
# Random Forest Classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier()

In [16]:
X_sfm = train_data[['odds_home', 'odds_away', 'W_PCT_home', 'HOME_RECORD_home',
       'ROAD_RECORD_home', 'W_PCT_away', 'HOME_RECORD_away',
       'ROAD_RECORD_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g',
       'FT_PCT_away_3g', 'FT_PCT_home_7g', 'FT_PCT_away_7g',
       'diff_avg_fg3_pct_home', 'diff_avg_fg3_pct_away',
       'diff_avg_ft_pct_home', 'diff_avg_ft_pct_away', 'diff_avg_reb_away',
       'eff', 'eff_visitor', 'home_elo', 'visitor_elo', 'elo_diff', 'eff_diff',
       'Home_Last_5_Avg_FT_PCT_home', 'Home_Last_5_Avg_FG3_PCT_home',
       'Home_Last_5_Avg_FG_PCT_away', 'Home_Last_5_Avg_FT_PCT_away',
       'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_FG3_PCT_home',
       'Away_Last_5_Avg_FG_PCT_away', 'Away_Last_5_Avg_FT_PCT_away',
       'Away_Last_5_Avg_FG3_PCT_away', 'diff_fg_pct_last_7_games',
       'diff_ft_pct_last_3_games', 'diff_ft_pct_last_7_games',
       'diff_win_pct_prev_season', 'diff_home_record_last_season',
       'diff_road_record_last_season', 'diff_curr_win_pct',
       'diff_curr_home_record', 'diff_curr_away_record']]
y_sfm = y_train


X_test_sfm = test_data[['odds_home', 'odds_away', 'W_PCT_home', 'HOME_RECORD_home',
       'ROAD_RECORD_home', 'W_PCT_away', 'HOME_RECORD_away',
       'ROAD_RECORD_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g',
       'FT_PCT_away_3g', 'FT_PCT_home_7g', 'FT_PCT_away_7g',
       'diff_avg_fg3_pct_home', 'diff_avg_fg3_pct_away',
       'diff_avg_ft_pct_home', 'diff_avg_ft_pct_away', 'diff_avg_reb_away',
       'eff', 'eff_visitor', 'home_elo', 'visitor_elo', 'elo_diff', 'eff_diff',
       'Home_Last_5_Avg_FT_PCT_home', 'Home_Last_5_Avg_FG3_PCT_home',
       'Home_Last_5_Avg_FG_PCT_away', 'Home_Last_5_Avg_FT_PCT_away',
       'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_FG3_PCT_home',
       'Away_Last_5_Avg_FG_PCT_away', 'Away_Last_5_Avg_FT_PCT_away',
       'Away_Last_5_Avg_FG3_PCT_away', 'diff_fg_pct_last_7_games',
       'diff_ft_pct_last_3_games', 'diff_ft_pct_last_7_games',
       'diff_win_pct_prev_season', 'diff_home_record_last_season',
       'diff_road_record_last_season', 'diff_curr_win_pct',
       'diff_curr_home_record', 'diff_curr_away_record']]
y_test_sfm = y_test

# Split our data
X_train_sfm, y_train_sfm = X_sfm, y_sfm


In [14]:
# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(clf, param_grid, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train_sfm, y_train_sfm)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [17]:
# Predict
preds = grid.predict(X_test_sfm)
test_score = grid.score(X_test_sfm, y_test_sfm)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_sfm, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_test_sfm)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 367.466255903244 seconds
              precision    recall  f1-score   support

   home loss       0.63      0.54      0.58      1935
    home win       0.70      0.77      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.67      0.67      4583

{'bootstrap': True, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 100}
best score: 0.6947826086956521
test score 0.6740126554658521


### RF Classifier with RFE

In [18]:
estimator = RandomForestClassifier()
estimator.fit(X_train, y_train)

RandomForestClassifier()

In [19]:
X_train_rfe = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'W_PCT_home',
            'HOME_RECORD_home', 'ROAD_RECORD_home', 'ROAD_RECORD_away',
            'WIN_PRCT_home_3g', 'PTS_home_3g', 'REB_home_3g', 'WIN_PRCT_away_3g',
            'AST_away_3g', 'REB_away_3g', 'PTS_home_7g', 'AST_home_7g',
            'REB_home_7g', 'AST_away_7g', 'diff_avg_pts_home', 'diff_avg_ast_home',
            'diff_avg_ast_away', 'diff_avg_fg3_pct_home', 'diff_avg_reb_home',
            'diff_avg_reb_away', 'top_players', 'eff_visitor', 'HG_7days',
            'AG_7days', 'G_7days', 'back2back', 'HG_7days_VISITOR',
            'G_7days_VISITOR', 'back2back_visitor', 'missing_players',
            'missing_players_visitor', 'home_elo', 'visitor_elo', 'elo_diff',
            'top_player_diff', 'missing_player_diff', 'eff_diff', 'month',
            'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
            'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_REB_away',
            'Home_Last_5_Avg_AST_away', 'Away_Last_5_Avg_FG3_PCT_home',
            'Away_Last_5_Avg_AST_home', 'Away_Last_5_Avg_PTS_away',
            'Away_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_AST_away',
            'diff_ast_last_3_games', 'diff_ast_last_7_games',
            'diff_reb_last_3_games', 'diff_reb_last_7_games',
            'diff_win_pct_3_last_games', 'diff_curr_win_pct',
            'diff_curr_home_record', 'diff_curr_away_record']]

y_train_rfe = y_train



X_test_rfe = test_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'W_PCT_home',
            'HOME_RECORD_home', 'ROAD_RECORD_home', 'ROAD_RECORD_away',
            'WIN_PRCT_home_3g', 'PTS_home_3g', 'REB_home_3g', 'WIN_PRCT_away_3g',
            'AST_away_3g', 'REB_away_3g', 'PTS_home_7g', 'AST_home_7g',
            'REB_home_7g', 'AST_away_7g', 'diff_avg_pts_home', 'diff_avg_ast_home',
            'diff_avg_ast_away', 'diff_avg_fg3_pct_home', 'diff_avg_reb_home',
            'diff_avg_reb_away', 'top_players', 'eff_visitor', 'HG_7days',
            'AG_7days', 'G_7days', 'back2back', 'HG_7days_VISITOR',
            'G_7days_VISITOR', 'back2back_visitor', 'missing_players',
            'missing_players_visitor', 'home_elo', 'visitor_elo', 'elo_diff',
            'top_player_diff', 'missing_player_diff', 'eff_diff', 'month',
            'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
            'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_REB_away',
            'Home_Last_5_Avg_AST_away', 'Away_Last_5_Avg_FG3_PCT_home',
            'Away_Last_5_Avg_AST_home', 'Away_Last_5_Avg_PTS_away',
            'Away_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_AST_away',
            'diff_ast_last_3_games', 'diff_ast_last_7_games',
            'diff_reb_last_3_games', 'diff_reb_last_7_games',
            'diff_win_pct_3_last_games', 'diff_curr_win_pct',
            'diff_curr_home_record', 'diff_curr_away_record']]

y_test_rfe = y_test


In [20]:
# defining parameter range
param_grid = [{'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}]           

grid = GridSearchCV(estimator, param_grid, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_rfe, y_train_rfe)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid=[{'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                          'max_features': ['auto', 'sqrt', 'log2'],
                          'min_samples_leaf': [3, 4, 5],
                          'min_samples_split': [8, 10, 12],
                          'n_estimators': [100]}],
             scoring='accuracy')

In [21]:
# Predict
preds = grid.predict(X_test_rfe)
test_score = grid.score(X_test_rfe, y_test_rfe)

target_names=['home loss', 'home win']

start_time = time.time()

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_rfe, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_test_rfe)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 0.0 seconds
              precision    recall  f1-score   support

   home loss       0.62      0.54      0.58      1935
    home win       0.69      0.76      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.65      4583
weighted avg       0.67      0.67      0.66      4583

{'bootstrap': True, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}
best score: 0.695072463768116
test score 0.6689941086624481


### RF with RFECV

In [6]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train

X_test_rcv = test_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_test_rcv = y_test

In [7]:
estimator = RandomForestClassifier(bootstrap=True, max_depth=8, max_features='auto', 
min_samples_leaf=4, min_samples_split=8, n_estimators=100)
estimator.fit(X_train_rcv, y_train_rcv)

RandomForestClassifier(max_depth=8, min_samples_leaf=4, min_samples_split=8)

In [9]:
# Predict
preds = estimator.predict(X_test_rcv)
test_score = estimator.score(X_test_rcv, y_test_rcv)

target_names=['home loss', 'home win']

start_time = time.time()

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_rcv, preds, target_names=target_names))

print("test score", test_score)

Συνολικός χρόνος fit και predict: 0.0 seconds
              precision    recall  f1-score   support

   home loss       0.63      0.55      0.59      1935
    home win       0.70      0.77      0.73      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.68      0.67      4583

test score 0.6755400392755837


In [8]:
evaluate(estimator, X_train_rcv, X_test_rcv, y_train_rcv, y_test_rcv)

Training Results: 
Confusion Matrix:
[[2228 1567]
 [ 969 4556]]
Accuracy Score:
0.7279
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.59      0.64      3795
           1       0.74      0.82      0.78      5525

    accuracy                           0.73      9320
   macro avg       0.72      0.71      0.71      9320
weighted avg       0.72      0.73      0.72      9320

Testing Results: 
Confusion Matrix:
[[1056  879]
 [ 608 2040]]
Accuracy Score:
0.6755
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.55      0.59      1935
           1       0.70      0.77      0.73      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.68      0.67      4583



### RF with ExtraTreesClassifier

In [15]:
X_train_extra = train_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_train_extra = y_train

X_test_extra = test_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_test_extra = y_test


In [16]:
# Random Forest Classifier

model_first_split = RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', 
min_samples_leaf=3, min_samples_split=12, n_estimators=100)
model.fit(X_train_extra, y_train_extra)

start_time = time.time()
model.fit(X_train_extra, y_train_extra)



RandomForestClassifier(max_depth=9, max_features='log2', min_samples_leaf=4,
                       min_samples_split=8)

In [17]:
# Predict on the validation set
preds = model.predict(X_test_extra)
test_score = model.score(X_test_extra, y_test_extra)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_extra, preds, target_names=target_names))

print("test score", test_score)


Συνολικός χρόνος fit και predict: 23.44558358192444 seconds
              precision    recall  f1-score   support

   home loss       0.63      0.54      0.58      1935
    home win       0.69      0.77      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.65      4583
weighted avg       0.67      0.67      0.67      4583

test score 0.6703032947850753
