## Random Forest Classifier

### Simple RF Classifier with GridSearchCV

In [4]:
# Importing libraries
import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report


In [5]:
warnings.filterwarnings('ignore')

In [6]:
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [7]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

X1 = df.drop(["game_date_est", "season", "game_id", "home_team", "visitor_team", "home_team_id", "visitor_team_id",
              "home_team_wins", "conference", "conference_visitor"], axis=1)
y1 = df["home_team_wins"]

# Split our data
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.4)

# Split Data to Train and Validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)


In [13]:
# Random Forest Classifier

model_first_split = RandomForestClassifier()
model_first_split.fit(X_train, y_train)

# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(model_first_split, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train, y_train)



GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [14]:
# Predict
preds = grid.predict(X_test)
test_score = grid.score(X_test, y_test)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_test)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 240.56514835357666 seconds
              precision    recall  f1-score   support

   home loss       0.85      0.74      0.79      2329
    home win       0.83      0.90      0.87      3233

    accuracy                           0.84      5562
   macro avg       0.84      0.82      0.83      5562
weighted avg       0.84      0.84      0.83      5562

{'bootstrap': True, 'max_depth': 11, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 100}
best score: 0.8493721678486601
test score 0.8365695792880259


### RF Classifier with SelectFromModel Function

In [12]:
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [13]:
# Random Forest Classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier()

In [14]:
# Create a selector object that will use the random forest classifier to identify
sfm = SelectFromModel(clf)

# Train the selector
sfm.fit(X_train, y_train)

sfm.get_support()


array([False,  True,  True,  True,  True, False,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False,  True, False,  True, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True,  True])

In [15]:
selected_feat = X_train.columns[(sfm.get_support())]
len(selected_feat)
print(selected_feat)

Index(['odds_home', 'odds_away', 'score_home', 'score_away', 'W_PCT_home',
       'ROAD_RECORD_home', 'eff_visitor', 'home_elo', 'elo_diff', 'eff_diff',
       'diff_curr_win_pct', 'diff_curr_home_record', 'diff_curr_away_record'],
      dtype='object')


In [22]:
X_sfm = df[['odds_home', 'odds_away', 'score_home', 'score_away', 'W_PCT_home',
       'HOME_RECORD_home', 'eff_visitor', 'home_elo', 'visitor_elo',
       'elo_diff', 'eff_diff', 'diff_win_pct_prev_season',
       'diff_home_record_last_season', 'diff_curr_win_pct',
       'diff_curr_home_record', 'diff_curr_away_record']]
y_sfm = df["home_team_wins"]

# Split our data
X_train_sfm, X_test_sfm, y_train_sfm, y_test_sfm = train_test_split(X_sfm, y_sfm, test_size=0.4)

# Split Data to Train and Validation
X_train_sfm, X_val_sfm, y_train_sfm, y_val_sfm = train_test_split(X_train_sfm, y_train_sfm, test_size=0.2, random_state=1)


In [23]:
# defining parameter range
param_grid = {'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}

grid = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.time()
grid.fit(X_train_sfm, y_train_sfm)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100]},
             scoring='accuracy')

In [25]:
# Predict
preds = grid.predict(X_test_sfm)
test_score = grid.score(X_test_sfm, y_test_sfm)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_sfm, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_test_sfm)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 201.1356794834137 seconds
              precision    recall  f1-score   support

   home loss       0.97      0.95      0.96      2293
    home win       0.97      0.98      0.98      3269

    accuracy                           0.97      5562
   macro avg       0.97      0.97      0.97      5562
weighted avg       0.97      0.97      0.97      5562

{'bootstrap': True, 'max_depth': 11, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 100}
best score: 0.971523002543672
test score 0.970873786407767


### RF Classifier with RFE

In [8]:
estimator = RandomForestClassifier()
estimator.fit(X_train, y_train)

RandomForestClassifier()

In [9]:
X_rfe = df[['num_possible_outcomes', 'odds_home', 'odds_away', 'score_home',
       'score_away', 'G_home', 'W_PCT_home', 'HOME_RECORD_home',
       'ROAD_RECORD_home', 'G_away', 'W_PCT_away', 'ROAD_RECORD_away',
       'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
       'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'AST_home_3g',
       'REB_home_3g', 'WIN_PRCT_away_3g', 'PTS_away_3g', 'FG3_PCT_away_3g',
       'AST_home_7g', 'REB_home_7g', 'PTS_away_7g', 'AST_away_7g',
       'diff_avg_pts_home', 'diff_avg_pts_away', 'diff_avg_ast_home',
       'diff_avg_fg3_pct_home', 'diff_avg_reb_home', 'diff_avg_reb_away',
       'top_players', 'HG_7days', 'AG_7days', 'G_7days', 'HG_7days_VISITOR',
       'back2back_visitor', 'missing_players', 'missing_players_visitor',
       'top_player_diff', 'missing_player_diff', 'month',
       'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
       'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_FT_PCT_away',
       'Home_Last_5_Avg_REB_away', 'Home_Last_5_Avg_AST_away',
       'Away_Last_5_Avg_REB_home', 'Away_Last_5_Avg_REB_away',
       'Away_Last_5_Avg_AST_away', 'diff_pts_last_3_games',
       'diff_fg3_pct_last_3_games', 'diff_ft_pct_last_3_games',
       'diff_ast_last_3_games', 'diff_ast_last_7_games',
       'diff_reb_last_7_games', 'diff_win_pct_3_last_games',
       'diff_curr_win_pct', 'diff_curr_home_record']]

y_rfe = df["home_team_wins"]

# Split our data
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = train_test_split(X_rfe, y_rfe, test_size=0.4)

# Split Data to Train and Validation
X_train_rfe, X_val_rfe, y_train_rfe, y_val_rfe = train_test_split(X_train_rfe, y_train_rfe, test_size=0.2, random_state=1)


In [7]:
# # Random Forest Classifier
# pipe_model = Pipeline([
#   ('feature_selection', RFECV(RandomForestClassifier())),
#   ('classification', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8,
#                                             n_estimators=100))
# ])
# pipe_model.fit(X_train, y_train)

Pipeline(steps=[('feature_selection',
                 RFECV(estimator=RandomForestClassifier())),
                ('classification',
                 RandomForestClassifier(max_depth=11, min_samples_leaf=4,
                                        min_samples_split=8))])

In [10]:
# defining parameter range
param_grid = [{'bootstrap': [True],
              'max_depth': [8, 9, 10, 11],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100]}]           

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_rfe, y_train_rfe)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid=[{'bootstrap': [True], 'max_depth': [8, 9, 10, 11],
                          'max_features': ['auto', 'sqrt', 'log2'],
                          'min_samples_leaf': [3, 4, 5],
                          'min_samples_split': [8, 10, 12],
                          'n_estimators': [100]}],
             scoring='accuracy')

In [12]:
# Predict
preds = grid.predict(X_test_rfe)
test_score = grid.score(X_test_rfe, y_test_rfe)

target_names=['home loss', 'home win']

start_time = time.time()

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_rfe, preds, target_names=target_names))


model = grid.best_estimator_
y_fit = model.predict(X_test_rfe)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 0.0 seconds
              precision    recall  f1-score   support

   home loss       0.91      0.83      0.87      2313
    home win       0.88      0.94      0.91      3249

    accuracy                           0.89      5562
   macro avg       0.90      0.88      0.89      5562
weighted avg       0.90      0.89      0.89      5562

{'bootstrap': True, 'max_depth': 11, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 100}
best score: 0.8910407717489569
test score 0.8946422150305645


### RF Classifier with LassoCV

In [29]:
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline


In [31]:
pipe_model = Pipeline([
  ('feature_selection', SelectFromModel(LassoCV())),
  ('classification', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8,
                                            n_estimators=100))
])
pipe_model.fit(X_train, y_train)

Pipeline(steps=[('feature_selection', SelectFromModel(estimator=LassoCV())),
                ('classification',
                 RandomForestClassifier(max_depth=11, min_samples_leaf=4,
                                        min_samples_split=8))])

In [35]:
# Predict
preds = pipe_model.predict(X_test)
test_score = pipe_model.score(X_test, y_test)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))

print("test score", test_score)


Συνολικός χρόνος fit και predict: 1529.3399500846863 seconds
              precision    recall  f1-score   support

   home loss       0.99      0.97      0.98      2281
    home win       0.98      0.99      0.98      3281

    accuracy                           0.98      5562
   macro avg       0.98      0.98      0.98      5562
weighted avg       0.98      0.98      0.98      5562

test score 0.9814814814814815
