## MLP with GridSearchCV

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

import time

import warnings
warnings.filterwarnings('ignore')


In [4]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_val, y_val = valid_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), valid_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins



In [7]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train, y_train)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [9]:
start_time = time.time()
grid.fit(X_train, y_train)
val_score = grid.score(X_val, y_val)

preds = grid.predict(X_val)
test_score = grid.score(X_val, y_val)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 25.180410623550415 seconds
              precision    recall  f1-score   support

   home_loss       0.68      0.52      0.59      1011
    home_win       0.71      0.83      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.67      0.68      2420
weighted avg       0.70      0.70      0.69      2420

val score: 0.6983471074380165
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 150, 'solver': 'lbfgs'}
best score: 0.666231884057971
test score 0.6983471074380165


### MLP Classifier with Univariate Feature Selection

In [10]:
X_train_uni = train_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_train_uni = y_train

X_val_uni = valid_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_val_uni = y_val

In [11]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train_uni, y_train_uni)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [12]:
start_time = time.time()
grid.fit(X_train_uni, y_train_uni)
val_score = grid.score(X_val_uni, y_val_uni)

preds = grid.predict(X_val_uni)
test_score = grid.score(X_val_uni, y_val_uni)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_uni, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_uni)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 23.472278118133545 seconds
              precision    recall  f1-score   support

   home_loss       0.68      0.54      0.60      1011
    home_win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.70      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

val score: 0.7012396694214876
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 250, 'solver': 'lbfgs'}
best score: 0.6920289855072463
test score 0.7012396694214876


### MLP Classifier with SelectFromModel(LassoCV)

In [33]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline

In [35]:
pipe_model = Pipeline([
  ('feature_selection', SelectFromModel(LassoCV())),
  ('classification', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs'))
])
pipe_model.fit(X_train, y_train)

Pipeline(steps=[('feature_selection', SelectFromModel(estimator=LassoCV())),
                ('classification',
                 MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5),
                               max_iter=150, solver='lbfgs'))])

In [36]:
# Predict
preds = pipe_model.predict(X_val)
test_score = pipe_model.score(X_val, y_val)

target_names=['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))

print("test score", test_score)


Συνολικός χρόνος fit και predict: 582.035861492157 seconds
              precision    recall  f1-score   support

   home loss       0.68      0.54      0.60      1011
    home win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.70      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

test score 0.7012396694214876


### MLP Classifier with RFECV

In [22]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train

X_val_rcv = valid_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_val_rcv = y_val

In [23]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train_rcv, y_train_rcv)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [24]:
start_time = time.time()
grid.fit(X_train_rcv, y_train_rcv)
val_score = grid.score(X_val_rcv, y_val_rcv)

preds = grid.predict(X_val_rcv)
test_score = grid.score(X_val_rcv, y_val_rcv)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_rcv, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_rcv)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 14.21740198135376 seconds
              precision    recall  f1-score   support

   home_loss       0.63      0.61      0.62      1011
    home_win       0.73      0.75      0.74      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.6900826446280992
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 150, 'solver': 'lbfgs'}
best score: 0.656231884057971
test score 0.6900826446280992


### MLP Classifier with RFE

In [37]:
X_train_rfe = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'W_PCT_home',
            'HOME_RECORD_home', 'ROAD_RECORD_home', 'ROAD_RECORD_away',
            'WIN_PRCT_home_3g', 'PTS_home_3g', 'REB_home_3g', 'WIN_PRCT_away_3g',
            'AST_away_3g', 'REB_away_3g', 'PTS_home_7g', 'AST_home_7g',
            'REB_home_7g', 'AST_away_7g', 'diff_avg_pts_home', 'diff_avg_ast_home',
            'diff_avg_ast_away', 'diff_avg_fg3_pct_home', 'diff_avg_reb_home',
            'diff_avg_reb_away', 'top_players', 'eff_visitor', 'HG_7days',
            'AG_7days', 'G_7days', 'back2back', 'HG_7days_VISITOR',
            'G_7days_VISITOR', 'back2back_visitor', 'missing_players',
            'missing_players_visitor', 'home_elo', 'visitor_elo', 'elo_diff',
            'top_player_diff', 'missing_player_diff', 'eff_diff', 'month',
            'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
            'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_REB_away',
            'Home_Last_5_Avg_AST_away', 'Away_Last_5_Avg_FG3_PCT_home',
            'Away_Last_5_Avg_AST_home', 'Away_Last_5_Avg_PTS_away',
            'Away_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_AST_away',
            'diff_ast_last_3_games', 'diff_ast_last_7_games',
            'diff_reb_last_3_games', 'diff_reb_last_7_games',
            'diff_win_pct_3_last_games', 'diff_curr_win_pct',
            'diff_curr_home_record', 'diff_curr_away_record']]

y_train_rfe = y_train

X_val_rfe = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'W_PCT_home',
            'HOME_RECORD_home', 'ROAD_RECORD_home', 'ROAD_RECORD_away',
            'WIN_PRCT_home_3g', 'PTS_home_3g', 'REB_home_3g', 'WIN_PRCT_away_3g',
            'AST_away_3g', 'REB_away_3g', 'PTS_home_7g', 'AST_home_7g',
            'REB_home_7g', 'AST_away_7g', 'diff_avg_pts_home', 'diff_avg_ast_home',
            'diff_avg_ast_away', 'diff_avg_fg3_pct_home', 'diff_avg_reb_home',
            'diff_avg_reb_away', 'top_players', 'eff_visitor', 'HG_7days',
            'AG_7days', 'G_7days', 'back2back', 'HG_7days_VISITOR',
            'G_7days_VISITOR', 'back2back_visitor', 'missing_players',
            'missing_players_visitor', 'home_elo', 'visitor_elo', 'elo_diff',
            'top_player_diff', 'missing_player_diff', 'eff_diff', 'month',
            'Home_Last_5_Avg_AST_home', 'Home_Last_5_Avg_REB_home',
            'Home_Last_5_Avg_PTS_away', 'Home_Last_5_Avg_REB_away',
            'Home_Last_5_Avg_AST_away', 'Away_Last_5_Avg_FG3_PCT_home',
            'Away_Last_5_Avg_AST_home', 'Away_Last_5_Avg_PTS_away',
            'Away_Last_5_Avg_FT_PCT_away', 'Away_Last_5_Avg_AST_away',
            'diff_ast_last_3_games', 'diff_ast_last_7_games',
            'diff_reb_last_3_games', 'diff_reb_last_7_games',
            'diff_win_pct_3_last_games', 'diff_curr_win_pct',
            'diff_curr_home_record', 'diff_curr_away_record']]

y_val_rfe = y_val



In [38]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train_rfe, y_train_rfe)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [39]:
start_time = time.time()
grid.fit(X_train_rfe, y_train_rfe)
val_score = grid.score(X_val_rfe, y_val_rfe)

preds = grid.predict(X_val_rfe)
test_score = grid.score(X_val_rfe, y_val_rfe)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_rfe, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_rfe)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 24.183472394943237 seconds
              precision    recall  f1-score   support

   home_loss       0.68      0.50      0.58      1011
    home_win       0.70      0.83      0.76      1409

    accuracy                           0.69      2420
   macro avg       0.69      0.67      0.67      2420
weighted avg       0.69      0.69      0.68      2420

val score: 0.6942148760330579
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 200, 'solver': 'lbfgs'}
best score: 0.6843478260869564
test score 0.6942148760330579


### MLP Classifier with ExtraTreesClassifier

In [40]:
X_train_extra = train_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_train_extra = y_train

X_val_extra = valid_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_val_extra = y_val

In [41]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train_extra, y_train_extra)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [42]:
start_time = time.time()
grid.fit(X_train_extra, y_train_extra)
val_score = grid.score(X_val_extra, y_val_extra)

preds = grid.predict(X_val_extra)
test_score = grid.score(X_val_extra, y_val_extra)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_extra, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_extra)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 13.973390340805054 seconds
              precision    recall  f1-score   support

   home_loss       0.00      0.00      0.00      1011
    home_win       0.58      1.00      0.74      1409

    accuracy                           0.58      2420
   macro avg       0.29      0.50      0.37      2420
weighted avg       0.34      0.58      0.43      2420

val score: 0.5822314049586776
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 100, 'solver': 'lbfgs'}
best score: 0.6326086956521739
test score 0.5822314049586776


### Standard Scaler

In [5]:
from sklearn.preprocessing import StandardScaler

# Scaling features
scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train)
X_val_standard = scaler.transform(X_val)
X_test_standard = scaler.transform(X_test)

In [44]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train_standard, y_train)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [45]:
start_time = time.time()
grid.fit(X_train_standard, y_train)
val_score = grid.score(X_val_standard, y_val)

preds = grid.predict(X_val_standard)
test_score = grid.score(X_val_standard, y_val)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_standard)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 108.6168303489685 seconds
              precision    recall  f1-score   support

   home loss       0.67      0.50      0.57      1011
    home win       0.70      0.83      0.76      1409

    accuracy                           0.69      2420
   macro avg       0.69      0.66      0.67      2420
weighted avg       0.69      0.69      0.68      2420

val score: 0.6900826446280992
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 150, 'solver': 'sgd'}
best score: 0.6791304347826087
test score 0.6900826446280992


### MinMax Scaler

In [46]:
from sklearn.preprocessing import MinMaxScaler

# Scaling features
scaler = MinMaxScaler()
X_train_minmax = scaler.fit_transform(X_train)
X_val_minmax = scaler.transform(X_val)
X_test_minmax = scaler.transform(X_test)

In [10]:
# # Split our data
# X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.4)

# # Split Data to Train and Validation
# X_train, X_val, y_train, y_val = train_test_split(X_train_minmax, y_train, test_size=0.2)


In [47]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train_minmax, y_train)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [48]:
start_time = time.time()
grid.fit(X_train_minmax, y_train)
val_score = grid.score(X_val_minmax, y_val)

preds = grid.predict(X_val_minmax)
test_score = grid.score(X_val_minmax, y_val)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_minmax)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 86.82926440238953 seconds
              precision    recall  f1-score   support

   home loss       0.66      0.56      0.60      1011
    home win       0.71      0.79      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.69      0.67      0.68      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.6933884297520662
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 100, 'solver': 'adam'}
best score: 0.6884057971014493
test score 0.6933884297520662


### PCA 

In [6]:
from sklearn.decomposition import PCA

# PCA selector
pca = PCA(n_components=30)

# Εφαρμόζουμε στα δεδομένα εκπαίδευσης και ελέγχου τον *ΙΔΙΟ* μετασχηματισμό
# Οι κύριες συνιστώσες υπολογίζονται στο train set
# Στα train κάνουμε fit_transform στο test μόνο transform:
trainPCA = pca.fit_transform(X_train_standard)
valPCA = pca.transform(X_val_standard)
testPCA = pca.transform(X_test_standard)

In [7]:
# # Θα τυπωσουμε το συσσωρευτικό ποσοστό διασποράς που εξηγείται από τις κύριες συνιστώσες
# evar = pca.explained_variance_ratio_
# cum_evar = np.cumsum(evar)
# print(cum_evar)
# plt.figure(1, figsize=(5, 5))
# plt.xlabel("Principal Component number")
# plt.ylabel('Cumulative Variance')
# plt.plot(cum_evar, linewidth=2)
# plt.show()

In [8]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(trainPCA, y_train)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [9]:
start_time = time.time()
grid.fit(trainPCA, y_train)
val_score = grid.score(valPCA, y_val)

preds = grid.predict(valPCA)
test_score = grid.score(valPCA, y_val)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(valPCA)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)

Συνολικός χρόνος fit και predict: 62.85004949569702 seconds
              precision    recall  f1-score   support

   home loss       0.67      0.51      0.58      1011
    home win       0.70      0.82      0.76      1409

    accuracy                           0.69      2420
   macro avg       0.69      0.67      0.67      2420
weighted avg       0.69      0.69      0.68      2420

val score: 0.6921487603305785
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 250, 'solver': 'sgd'}
best score: 0.68
test score 0.6921487603305785


### MLP with Sequential Feature Selection

#### Forward

In [10]:
X_train_for_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_train_for_sfs = y_train

X_val_for_sfs = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_val_for_sfs = y_val


In [11]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train_for_sfs, y_train_for_sfs)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [13]:
start_time = time.time()
grid.fit(X_train_for_sfs, y_train_for_sfs)
val_score = grid.score(X_val_for_sfs, y_val_for_sfs)

preds = grid.predict(X_val_for_sfs)
test_score = grid.score(X_val_for_sfs, y_val_for_sfs)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_for_sfs, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_for_sfs)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 33.427122354507446 seconds
              precision    recall  f1-score   support

   home loss       0.67      0.55      0.60      1011
    home win       0.71      0.80      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420

val score: 0.6962809917355371
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 200, 'solver': 'lbfgs'}
best score: 0.6671014492753622
test score 0.6962809917355371


#### Backwards

In [14]:
X_train_back_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_train_back_sfs = y_train

X_val_back_sfs = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_val_back_sfs = y_val

In [15]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train_back_sfs, y_train_back_sfs)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [18]:
start_time = time.time()
grid.fit(X_train_back_sfs, y_train_back_sfs)
val_score = grid.score(X_val_back_sfs, y_val_back_sfs)

preds = grid.predict(X_val_back_sfs)
test_score = grid.score(X_val_back_sfs, y_val_for_sfs)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_back_sfs, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_val_back_sfs)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 36.65116882324219 seconds
              precision    recall  f1-score   support

   home loss       0.68      0.54      0.60      1011
    home win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

val score: 0.7008264462809918
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 250, 'solver': 'lbfgs'}
best score: 0.6684057971014492
test score 0.7008264462809918
