## MLP with GridSearchCV

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Read the feature/odds table and keep only rows that have no missing values.
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv').dropna()

In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

import time

import warnings
# Silence every warning category for the rest of the session (this also hides
# any warnings raised by the estimators fitted below).
warnings.simplefilter('ignore')


In [3]:

# --- Season-based partitions --------------------------------------------------
# Seasons 2007-2013 -> train, 2014-2015 -> validation, 2016 onwards -> test.
# NOTE(review): none of the cells visible in this notebook consume these frames;
# the models are trained on the random split built further down. Confirm whether
# the season-based split is still needed.
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# --- Random split actually used by the models ---------------------------------
# Drop identifier/metadata columns and the target itself from the feature matrix.
X1 = df.drop(["game_date_est", "season", "game_id", "home_team", "visitor_team", "home_team_id", "visitor_team_id",
              "home_team_wins", "conference", "conference_visitor"], axis=1)
y1 = df["home_team_wins"]

# Split our data: 60% train(+validation) / 40% test.
# random_state is fixed so that a Restart & Run All reproduces the same partitions
# (the second split below was already seeded; this one was not).
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.4, random_state=1)

# Split Data to Train and Validation: 80% / 20% of the remaining training portion.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)


In [4]:
# Multilayer Perceptron
# Base estimator handed to the grid search. It is intentionally NOT fitted here:
# GridSearchCV clones the estimator for every candidate and fold, so a prior fit
# would be thrown away and only cost time. random_state pins the weight
# initialisation so repeated runs give the same result.
model = MLPClassifier(random_state=1)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),  # 100, 150, 200, 250
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd'],
              'alpha': [1e-05],
              }

# 5-fold cross-validated search scored by accuracy, using all available cores.
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [5]:
# Wall-clock timer covering the grid search and the scoring/prediction calls below.
start_time = time.time()

# Run the search on the unscaled training features.
grid.fit(X_train, y_train)

# Score the searched model on the validation and test splits.
val_score = grid.score(X_val, y_val)
preds = grid.predict(X_test)
test_score = grid.score(X_test, y_test)

# Display names for the two classes in the report.
target_names = ['home_loss', 'home_win']

elapsed = time.time() - start_time
print("Συνολικός χρόνος fit και predict: %s seconds" % elapsed)
report = classification_report(y_test, preds, target_names=target_names)
print(report)
print("val score:", val_score)

# Keep a handle on the winning estimator and its test-set predictions.
model = grid.best_estimator_
y_fit = model.predict(X_test)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 31.00773811340332 seconds
              precision    recall  f1-score   support

   home_loss       0.93      0.93      0.93      2344
    home_win       0.95      0.95      0.95      3218

    accuracy                           0.94      5562
   macro avg       0.94      0.94      0.94      5562
weighted avg       0.94      0.94      0.94      5562

val score: 0.9532654284002396
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 250, 'solver': 'lbfgs'}
best score: 0.9860626989875849
test score 0.9415677813736066


## Standard Scaler

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_standard, X_val_standard, X_test_standard = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

In [7]:
# Multilayer Perceptron
# Base estimator handed to the grid search. It is intentionally NOT fitted here:
# GridSearchCV clones the estimator for every candidate and fold, so a prior fit
# would be thrown away and only cost time. random_state pins the weight
# initialisation so repeated runs give the same result.
model = MLPClassifier(random_state=1)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),  # 100, 150, 200, 250
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }

# 5-fold cross-validated search scored by accuracy, using all available cores.
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [8]:
# Wall-clock timer covering the grid search and the scoring/prediction calls below.
start_time = time.time()

# Run the search on the standardised training features.
grid.fit(X_train_standard, y_train)

# Score the searched model on the standardised validation and test splits.
val_score = grid.score(X_val_standard, y_val)
preds = grid.predict(X_test_standard)
test_score = grid.score(X_test_standard, y_test)

# Display names for the two classes in the report.
target_names = ['home loss', 'home win']

elapsed = time.time() - start_time
print("Συνολικός χρόνος fit και predict: %s seconds" % elapsed)
report = classification_report(y_test, preds, target_names=target_names)
print(report)
print("val score:", val_score)

# Keep a handle on the winning estimator and its test-set predictions.
model = grid.best_estimator_
y_fit = model.predict(X_test_standard)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 41.817800521850586 seconds
              precision    recall  f1-score   support

   home loss       0.99      0.99      0.99      2344
    home win       0.99      0.99      0.99      3218

    accuracy                           0.99      5562
   macro avg       0.99      0.99      0.99      5562
weighted avg       0.99      0.99      0.99      5562

val score: 0.9898142600359496
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 150, 'solver': 'lbfgs'}
best score: 0.9896586538191577
test score 0.9902912621359223


## MinMax Scaler

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to all three splits.
scaler = MinMaxScaler().fit(X_train)
X_train_minmax, X_val_minmax, X_test_minmax = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

In [10]:
# # Split our data
# X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.4)

# # Split Data to Train and Validation
# X_train, X_val, y_train, y_val = train_test_split(X_train_minmax, y_train, test_size=0.2)


In [11]:
# Multilayer Perceptron
# Base estimator handed to the grid search. It is intentionally NOT fitted here:
# GridSearchCV clones the estimator for every candidate and fold, so a prior fit
# would be thrown away and only cost time. random_state pins the weight
# initialisation so repeated runs give the same result.
model = MLPClassifier(random_state=1)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),  # 100, 150, 200, 250
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }

# 5-fold cross-validated search scored by accuracy, using all available cores.
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [12]:
# Wall-clock timer covering the grid search and the scoring/prediction calls below.
start_time = time.time()

# Run the search on the min-max scaled training features.
grid.fit(X_train_minmax, y_train)

# Score the searched model on the min-max scaled validation and test splits.
val_score = grid.score(X_val_minmax, y_val)
preds = grid.predict(X_test_minmax)
test_score = grid.score(X_test_minmax, y_test)

# Display names for the two classes in the report.
target_names = ['home loss', 'home win']

elapsed = time.time() - start_time
print("Συνολικός χρόνος fit και predict: %s seconds" % elapsed)
report = classification_report(y_test, preds, target_names=target_names)
print(report)
print("val score:", val_score)

# Keep a handle on the winning estimator and its test-set predictions.
model = grid.best_estimator_
y_fit = model.predict(X_test_minmax)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 78.05638289451599 seconds
              precision    recall  f1-score   support

   home loss       1.00      1.00      1.00      2344
    home win       1.00      1.00      1.00      3218

    accuracy                           1.00      5562
   macro avg       1.00      1.00      1.00      5562
weighted avg       1.00      1.00      1.00      5562

val score: 0.9964050329538646
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 250, 'solver': 'lbfgs'}
best score: 0.9983512738013015
test score 0.9989212513484358


## PCA 

In [13]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

# PCA selector: keep the first 30 principal components.
# random_state makes the decomposition repeatable when a randomized SVD solver is selected.
pca = PCA(n_components=30, random_state=1)

# Apply the *SAME* transformation to the training, validation and test data:
# the principal components are computed on the training set only, so the training
# set gets fit_transform while the validation and test sets get transform only.
trainPCA = pca.fit_transform(X_train_standard)
valPCA = pca.transform(X_val_standard)
testPCA = pca.transform(X_test_standard)

# Multilayer Perceptron
# Fixed hyper-parameters (no grid search in this section). random_state pins the
# weight initialisation so repeated runs report the same scores.
estimator = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs',
                          random_state=1)

# model = make_pipeline(pca, estimator)
# model.fit(X_train, y_train)

In [14]:
# # Print the cumulative fraction of variance explained by the principal components
# evar = pca.explained_variance_ratio_
# cum_evar = np.cumsum(evar)
# print(cum_evar)
# plt.figure(1, figsize=(5, 5))
# plt.xlabel("Principal Component number")
# plt.ylabel('Cumulative Variance')
# plt.plot(cum_evar, linewidth=2)
# plt.show()

In [15]:
# # defining parameter range
# param_grid = {'pca__n_components': [5, 20],
#               'estimator__max_iter': list(range(100, 300, 50)),
#               # 'activation':['identity', 'logistic', 'tanh', 'relu'],
#               'estimator__hidden_layer_sizes': [(20, 10, 5)],
#               'estimator__solver': ['lbfgs', 'sgd', 'adam'],
#               'estimator__alpha': [1e-05],
#               }


# grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [16]:
# Wall-clock timer covering the fit and the scoring/prediction calls below.
start_time = time.time()

# Train the fixed-parameter MLP on the PCA-reduced training features.
estimator.fit(trainPCA, y_train)

# Score it on the PCA-reduced validation and test splits.
val_score = estimator.score(valPCA, y_val)
preds = estimator.predict(testPCA)
test_score = estimator.score(testPCA, y_test)

# Display names for the two classes in the report.
target_names = ['home loss', 'home win']

elapsed = time.time() - start_time
print("Συνολικός χρόνος fit και predict: %s seconds" % elapsed)
report = classification_report(y_test, preds, target_names=target_names)
print(report)
print("val score:", val_score)

# Test-set predictions of the fitted estimator.
y_fit = estimator.predict(testPCA)

print("test score", test_score)


Συνολικός χρόνος fit και predict: 3.2519948482513428 seconds
              precision    recall  f1-score   support

   home loss       0.62      0.43      0.51      2344
    home win       0.66      0.81      0.73      3218

    accuracy                           0.65      5562
   macro avg       0.64      0.62      0.62      5562
weighted avg       0.65      0.65      0.64      5562

val score: 0.6728579988016776
test score 0.6503056454512766
