In [18]:
!pip install --upgrade pip #upgrade pip package installer
!pip install scikit-learn --upgrade #upgrade scikit-learn package
!pip install numpy --upgrade #upgrade numpy package
!pip install --upgrade matplotlib # Κάνουμε update την matplotlib

[0m

## MLP with GridSearchCV

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


df = pd.read_csv('data5.csv')

In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

import time
import warnings


In [21]:
warnings.filterwarnings('ignore')

train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

X1 = df.drop(["game_date_est", "season", "game_id", "home_team", "visitor_team", "home_team_id", "visitor_team_id",
              "home_team_wins", "conference", "conference_visitor"], axis=1)
y1 = df["home_team_wins"]

# Split our data
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.4)

# Split Data to Train and Validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)


In [22]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train, y_train)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [23]:
start_time = time.time()
grid.fit(X_train, y_train)
val_score = grid.score(X_val, y_val)

preds = grid.predict(X_test)
test_score = grid.score(X_test, y_test)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_test)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 86.44088435173035 seconds
              precision    recall  f1-score   support

   home_loss       0.55      0.75      0.64      2577
    home_win       0.75      0.56      0.64      3515

    accuracy                           0.64      6092
   macro avg       0.65      0.65      0.64      6092
weighted avg       0.67      0.64      0.64      6092

val score: 0.6274617067833698
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 250, 'solver': 'adam'}
best score: 0.6866870600969484
test score 0.63887065003283


## Standard Scaler

In [64]:
from sklearn.preprocessing import StandardScaler

# Scaling features
scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train)
X_val_standard = scaler.transform(X_val)
X_test_standard = scaler.transform(X_test)

In [50]:
# # Split our data
# X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.4)

# # Split Data to Train and Validation
# X_train, X_val, y_train, y_val = train_test_split(X_train_standard, y_train, test_size=0.2)


In [65]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train_standard, y_train)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [66]:
start_time = time.time()
grid.fit(X_train_standard, y_train)
val_score = grid.score(X_val_standard, y_val)

preds = grid.predict(X_test_standard)
test_score = grid.score(X_test_standard, y_test)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_test_standard)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 155.95430159568787 seconds
              precision    recall  f1-score   support

   home loss       0.41      1.00      0.58      2485
    home win       0.00      0.00      0.00      3607

    accuracy                           0.41      6092
   macro avg       0.20      0.50      0.29      6092
weighted avg       0.17      0.41      0.24      6092

val score: 0.5924507658643327
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 150, 'solver': 'sgd'}
best score: 0.575592771849201
test score 0.40791201575837166


## MinMax Scaler

In [67]:
from sklearn.preprocessing import MinMaxScaler

# Scaling features
scaler = MinMaxScaler()
X_train_minmax = scaler.fit_transform(X_train)
X_val_minmax = scaler.transform(X_val)
X_test_minmax = scaler.transform(X_test)

In [68]:
# # Split our data
# X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.4)

# # Split Data to Train and Validation
# X_train, X_val, y_train, y_val = train_test_split(X_train_minmax, y_train, test_size=0.2)


In [69]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(X_train_minmax, y_train)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [70]:
start_time = time.time()
grid.fit(X_train_minmax, y_train)
val_score = grid.score(X_val_minmax, y_val)

preds = grid.predict(X_test_minmax)
test_score = grid.score(X_test_minmax, y_test)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(X_test_minmax)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 100.45040130615234 seconds
              precision    recall  f1-score   support

   home loss       0.34      0.65      0.45      2485
    home win       0.33      0.12      0.18      3607

    accuracy                           0.34      6092
   macro avg       0.34      0.39      0.31      6092
weighted avg       0.34      0.34      0.29      6092

val score: 0.6039387308533917
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 250, 'solver': 'sgd'}
best score: 0.5829799127520737
test score 0.33749179251477346


## PCA 

In [74]:
from sklearn.decomposition import PCA

# Ορίζουμε την PCA και τον τελικό αριθμό features - αριθμό κύριων συνιστωσών
# είναι ακόμα μια υπερπαράμετρος με την οποία μπορούμε να πειραματιστούμε
n = 25
pca = PCA(n_components=n)

# Εφαρμόζουμε στα δεδομένα εκπαίδευσης και ελέγχου τον *ΙΔΙΟ* μετασχηματισμό
# Οι κύριες συνιστώσες υπολογίζονται στο train set
# Στα train κάνουμε fit_transform στο test μόνο transform:
trainPCA = pca.fit_transform(X_train_standard)
valPCA = pca.fit_transform(X_val_standard)
testPCA = pca.transform(X_test_standard)

print(X_train_standard.shape)
print(trainPCA.shape)
print("")
print(X_test_standard.shape)
print(testPCA.shape)

# πλεόν οι διαστάση των χαρακτηριστικών είναι 25

(7309, 107)
(7309, 25)

(6092, 107)
(6092, 25)


In [75]:
# Θα τυπωσουμε το συσσωρευτικό ποσοστό διασποράς που εξηγείται από τις κύριες συνιστώσες
evar = pca.explained_variance_ratio_
cum_evar = np.cumsum(evar)
print(cum_evar)
plt.figure(1, figsize=(5, 5))
plt.xlabel("Principal Component number")
plt.ylabel('Cumulative Variance')
plt.plot(cum_evar, linewidth=2)
plt.show()

[0.13008562 0.21717014 0.29347683 0.34587658 0.39056013 0.43275594
 0.47089361 0.5074371  0.53973804 0.56531289 0.58864616 0.61151888
 0.63142756 0.65044742 0.66827824 0.68560884 0.70161125 0.71710199
 0.73185903 0.74621478 0.75968346 0.7724581  0.78487294 0.79679512
 0.80737622]


ImportError: ignored

<Figure size 360x360 with 1 Axes>

In [76]:
# Multilayer Perceptron
model = MLPClassifier()
model.fit(trainPCA, y_train)

# defining parameter range
param_grid = {'max_iter': list(range(100, 300, 50)),
              # 'activation':['identity', 'logistic', 'tanh', 'relu'],
              'hidden_layer_sizes': [(20, 10, 5)],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [1e-05],
              }


grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)


In [77]:
start_time = time.time()
grid.fit(trainPCA, y_train)
val_score = grid.score(valPCA, y_val)

preds = grid.predict(testPCA)
test_score = grid.score(testPCA, y_test)

target_names = ['home loss', 'home win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("val score:", val_score)

model = grid.best_estimator_
y_fit = model.predict(testPCA)

print(grid.best_params_)
print("best score:", grid.best_score_)
print("test score", test_score)


Συνολικός χρόνος fit και predict: 107.99400305747986 seconds
              precision    recall  f1-score   support

   home loss       0.00      0.00      0.00      2485
    home win       0.59      1.00      0.74      3607

    accuracy                           0.59      6092
   macro avg       0.30      0.50      0.37      6092
weighted avg       0.35      0.59      0.44      6092

val score: 0.5940919037199125
{'alpha': 1e-05, 'hidden_layer_sizes': (20, 10, 5), 'max_iter': 150, 'solver': 'sgd'}
best score: 0.5810645408060554
test score 0.5920879842416283
