## Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report


import warnings
# Silences every warning category for the rest of the session; this also hides
# any warnings scikit-learn emits while the grid searches below are fitting.
warnings.filterwarnings('ignore')

In [5]:
import pandas as pd
import numpy as np

# Load the engineered feature table and keep only the rows with no missing values.
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv').dropna()

In [6]:
# Split by season: 2007 through 2013 for training, seasons after 2013 and
# before 2016 for validation, 2016 onwards for testing.
train_data = df[(df.season >= 2007) & (df.season <= 2013)]
valid_data = df[(df.season > 2013) & (df.season < 2016)]
test_data = df[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data])

# Columns excluded from the feature matrices (the label home_team_wins among them).
non_feature_columns = [
    "game_date_est", "season", "game_id",
    "home_team", "visitor_team",
    "home_team_id", "visitor_team_id",
    "home_team_wins",
    "conference", "conference_visitor",
]

X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data["home_team_wins"]

X_val = valid_data.drop(columns=non_feature_columns)
y_val = valid_data["home_team_wins"]

X_test = test_data.drop(columns=non_feature_columns)
y_test = test_data["home_team_wins"]


#### Logistic Regression with GridSearchCV

In [7]:
import time

clf = LogisticRegression()

# Inverse regularisation strengths to sweep: 1e-3, 1e-2, ..., 1e3.
c_values = np.logspace(-3, 3, 7)

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# The earlier flat grid also crossed in 'elasticnet' (unsupported by all three
# solvers listed), 'l1' with newton-cg/lbfgs and 'none' with liblinear; those
# candidates can never be fitted, so half of the grid was wasted work.
params = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not swept for this sub-grid.
    # NOTE(review): recent scikit-learn releases spell this penalty=None -- confirm
    # against the installed version.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
]

# n_jobs=-1 (not 1) is the setting that uses every available CPU core.
estimator = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
estimator.fit(X_train, y_train)

# Predict once and reuse the predictions for both the accuracy and the report;
# with scoring='accuracy' this equals estimator.score(X_val, y_val).
preds = estimator.predict(X_val)
val_score = accuracy_score(y_val, preds)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)
print(estimator.best_estimator_)
print(estimator.best_params_)
print("best score:", estimator.best_score_)
# print("test score", test_score)



Συνολικός χρόνος fit και predict: 540.8597717285156 seconds
              precision    recall  f1-score   support

   home_loss       0.67      0.55      0.60      1011
    home_win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

val score: 0.6991735537190082
LogisticRegression(C=0.1, solver='liblinear')
{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
best score: 0.6904347826086956


#### Standard Scaler

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to the training, validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train_standard, X_val_standard, X_test_standard = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [9]:
import time

clf = LogisticRegression()

# Inverse regularisation strengths to sweep: 1e-3, 1e-2, ..., 1e3.
c_values = np.logspace(-3, 3, 7)

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# The earlier flat grid also crossed in 'elasticnet' (unsupported by all three
# solvers listed), 'l1' with newton-cg/lbfgs and 'none' with liblinear; those
# candidates can never be fitted, so half of the grid was wasted work.
params = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not swept for this sub-grid.
    # NOTE(review): recent scikit-learn releases spell this penalty=None -- confirm
    # against the installed version.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
]

# n_jobs=-1 (not 1) is the setting that uses every available CPU core.
estimator = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
estimator.fit(X_train_standard, y_train)

# Predict once and reuse the predictions for both the accuracy and the report;
# with scoring='accuracy' this equals estimator.score(X_val_standard, y_val).
preds = estimator.predict(X_val_standard)
val_score = accuracy_score(y_val, preds)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)
print(estimator.best_estimator_)
print(estimator.best_params_)
print("best score:", estimator.best_score_)
# print("test score", test_score)



Συνολικός χρόνος fit και predict: 105.65280151367188 seconds
              precision    recall  f1-score   support

   home_loss       0.67      0.54      0.60      1011
    home_win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

val score: 0.6983471074380165
LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
best score: 0.6921739130434783


#### MinMax Scaler

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply that
# same fitted transform to the training, validation and test splits.
scaler = MinMaxScaler().fit(X_train)
X_train_minmax, X_val_minmax, X_test_minmax = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


In [11]:
import time

clf = LogisticRegression()

# Inverse regularisation strengths to sweep: 1e-3, 1e-2, ..., 1e3.
c_values = np.logspace(-3, 3, 7)

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# The earlier flat grid also crossed in 'elasticnet' (unsupported by all three
# solvers listed), 'l1' with newton-cg/lbfgs and 'none' with liblinear; those
# candidates can never be fitted, so half of the grid was wasted work.
params = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not swept for this sub-grid.
    # NOTE(review): recent scikit-learn releases spell this penalty=None -- confirm
    # against the installed version.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
]

# n_jobs=-1 (not 1) is the setting that uses every available CPU core.
estimator = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
estimator.fit(X_train_minmax, y_train)

# Predict once and reuse the predictions for both the accuracy and the report;
# with scoring='accuracy' this equals estimator.score(X_val_minmax, y_val).
preds = estimator.predict(X_val_minmax)
val_score = accuracy_score(y_val, preds)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)
print(estimator.best_estimator_)
print(estimator.best_params_)
print("best score:", estimator.best_score_)
# print("test score", test_score)



Συνολικός χρόνος fit και predict: 109.88302087783813 seconds
              precision    recall  f1-score   support

   home_loss       0.67      0.53      0.59      1011
    home_win       0.71      0.82      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.67      0.68      2420
weighted avg       0.69      0.70      0.69      2420

val score: 0.6966942148760331
LogisticRegression(penalty='l1', solver='liblinear')
{'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
best score: 0.6915942028985507


#### PCA

In [12]:
from sklearn.decomposition import PCA

# Reduce the standard-scaled features to 30 principal components.
pca = PCA(n_components=30)

# The *same* transform is applied to every split: the principal components are
# estimated on the training split (fit_transform), and the validation and test
# splits are only projected onto them (transform).
trainPCA = pca.fit_transform(X_train_standard)
valPCA, testPCA = (
    pca.transform(split) for split in (X_val_standard, X_test_standard)
)

In [13]:
import time

clf = LogisticRegression()

# Inverse regularisation strengths to sweep: 1e-3, 1e-2, ..., 1e3.
c_values = np.logspace(-3, 3, 7)

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# The earlier flat grid also crossed in 'elasticnet' (unsupported by all three
# solvers listed), 'l1' with newton-cg/lbfgs and 'none' with liblinear; those
# candidates can never be fitted, so half of the grid was wasted work.
params = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not swept for this sub-grid.
    # NOTE(review): recent scikit-learn releases spell this penalty=None -- confirm
    # against the installed version.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
]

# n_jobs=-1 (not 1) is the setting that uses every available CPU core.
estimator = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
estimator.fit(trainPCA, y_train)

# Predict once and reuse the predictions for both the accuracy and the report;
# with scoring='accuracy' this equals estimator.score(valPCA, y_val).
preds = estimator.predict(valPCA)
val_score = accuracy_score(y_val, preds)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)
print(estimator.best_estimator_)
print(estimator.best_params_)
print("best score:", estimator.best_score_)
# print("test score", test_score)



Συνολικός χρόνος fit και predict: 5.75942850112915 seconds
              precision    recall  f1-score   support

   home_loss       0.68      0.52      0.59      1011
    home_win       0.70      0.82      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.67      0.67      2420
weighted avg       0.69      0.70      0.69      2420

val score: 0.6958677685950413
LogisticRegression(C=0.01, solver='liblinear')
{'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
best score: 0.6844927536231884


#### Univariate Feature Selection Dataset

In [14]:
# Column subset for the univariate-selection experiment, listed once and
# applied to both the training and validation frames.
univariate_features = [
    'diff_curr_win_pct',
    'diff_curr_away_record',
    'odds_home',
    'odds_away',
    'elo_diff',
]

X_train_uni, X_val_uni = train_data[univariate_features], valid_data[univariate_features]

# Labels are the same series used for the full-feature runs.
y_train_uni, y_val_uni = y_train, y_val

In [15]:
import time

clf = LogisticRegression()

# Inverse regularisation strengths to sweep: 1e-3, 1e-2, ..., 1e3.
c_values = np.logspace(-3, 3, 7)

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# The earlier flat grid also crossed in 'elasticnet' (unsupported by all three
# solvers listed), 'l1' with newton-cg/lbfgs and 'none' with liblinear; those
# candidates can never be fitted, so half of the grid was wasted work.
params = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not swept for this sub-grid.
    # NOTE(review): recent scikit-learn releases spell this penalty=None -- confirm
    # against the installed version.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
]

# n_jobs=-1 (not 1) is the setting that uses every available CPU core.
estimator = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
estimator.fit(X_train_uni, y_train_uni)

# Predict once and reuse the predictions for both the accuracy and the report;
# with scoring='accuracy' this equals estimator.score(X_val_uni, y_val_uni).
preds = estimator.predict(X_val_uni)
val_score = accuracy_score(y_val_uni, preds)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_val_uni, preds, target_names=target_names))
print("val score:", val_score)
print(estimator.best_estimator_)
print(estimator.best_params_)
print("best score:", estimator.best_score_)
# print("test score", test_score)



Συνολικός χρόνος fit και predict: 3.7479801177978516 seconds
              precision    recall  f1-score   support

   home_loss       0.67      0.54      0.60      1011
    home_win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.67      0.68      2420
weighted avg       0.69      0.70      0.69      2420

val score: 0.6962809917355371
LogisticRegression(C=0.1, solver='liblinear')
{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
best score: 0.6933333333333334


#### Extra Trees Dataset

In [16]:
# Column subset for the Extra Trees experiment, listed once and applied to
# both the training and validation frames.
extra_trees_features = [
    'odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
    'diff_home_record_last_season', 'ROAD_RECORD_home',
    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home',
    'diff_curr_home_record', 'diff_curr_win_pct',
]

X_train_extra, X_val_extra = train_data[extra_trees_features], valid_data[extra_trees_features]

# Labels are the same series used for the full-feature runs.
y_train_extra, y_val_extra = y_train, y_val

In [17]:
import time

clf = LogisticRegression()

# Inverse regularisation strengths to sweep: 1e-3, 1e-2, ..., 1e3.
c_values = np.logspace(-3, 3, 7)

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# The earlier flat grid also crossed in 'elasticnet' (unsupported by all three
# solvers listed), 'l1' with newton-cg/lbfgs and 'none' with liblinear; those
# candidates can never be fitted, so half of the grid was wasted work.
params = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not swept for this sub-grid.
    # NOTE(review): recent scikit-learn releases spell this penalty=None -- confirm
    # against the installed version.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
]

# n_jobs=-1 (not 1) is the setting that uses every available CPU core.
estimator = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
estimator.fit(X_train_extra, y_train_extra)

# Predict once and reuse the predictions for both the accuracy and the report;
# with scoring='accuracy' this equals estimator.score(X_val_extra, y_val_extra).
preds = estimator.predict(X_val_extra)
val_score = accuracy_score(y_val_extra, preds)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_val_extra, preds, target_names=target_names))
print("val score:", val_score)
print(estimator.best_estimator_)
print(estimator.best_params_)
print("best score:", estimator.best_score_)
# print("test score", test_score)



Συνολικός χρόνος fit και predict: 35.72837591171265 seconds
              precision    recall  f1-score   support

   home_loss       0.68      0.54      0.60      1011
    home_win       0.71      0.82      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

val score: 0.6991735537190082
LogisticRegression(C=100.0)
{'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
best score: 0.6936231884057971


#### RFECV Dataset

In [18]:
# Column subset for the RFECV experiment, listed once and applied to both the
# training and validation frames.
rfecv_features = ['missing_player_diff', 'odds_home', 'odds_away', 'visitor_elo']

X_train_rcv, X_val_rcv = train_data[rfecv_features], valid_data[rfecv_features]

# Labels are the same series used for the full-feature runs.
y_train_rcv, y_val_rcv = y_train, y_val

In [19]:
import time

clf = LogisticRegression()

# Inverse regularisation strengths to sweep: 1e-3, 1e-2, ..., 1e3.
c_values = np.logspace(-3, 3, 7)

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# The earlier flat grid also crossed in 'elasticnet' (unsupported by all three
# solvers listed), 'l1' with newton-cg/lbfgs and 'none' with liblinear; those
# candidates can never be fitted, so half of the grid was wasted work.
params = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not swept for this sub-grid.
    # NOTE(review): recent scikit-learn releases spell this penalty=None -- confirm
    # against the installed version.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
]

# n_jobs=-1 (not 1) is the setting that uses every available CPU core.
estimator = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
estimator.fit(X_train_rcv, y_train_rcv)

# Predict once and reuse the predictions for both the accuracy and the report;
# with scoring='accuracy' this equals estimator.score(X_val_rcv, y_val_rcv).
preds = estimator.predict(X_val_rcv)
val_score = accuracy_score(y_val_rcv, preds)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_val_rcv, preds, target_names=target_names))
print("val score:", val_score)
print(estimator.best_estimator_)
print(estimator.best_params_)
print("best score:", estimator.best_score_)
# print("test score", test_score)



Συνολικός χρόνος fit και predict: 5.47115159034729 seconds
              precision    recall  f1-score   support

   home_loss       0.68      0.55      0.61      1011
    home_win       0.72      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.70      0.68      0.68      2420
weighted avg       0.70      0.70      0.70      2420

val score: 0.7020661157024793
LogisticRegression(C=0.01, solver='newton-cg')
{'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
best score: 0.6947826086956521


#### Lasso Dataset

In [20]:
# Column subset for the Lasso experiment, listed once and applied to both the
# training and validation frames.
lasso_features = [
    'elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
    'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
    'home_elo',
]

X_train_lasso, X_val_lasso = train_data[lasso_features], valid_data[lasso_features]

# Labels are the same series used for the full-feature runs.
y_train_lasso, y_val_lasso = y_train, y_val

In [21]:
import time

clf = LogisticRegression()

# Inverse regularisation strengths to sweep: 1e-3, 1e-2, ..., 1e3.
c_values = np.logspace(-3, 3, 7)

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# The earlier flat grid also crossed in 'elasticnet' (unsupported by all three
# solvers listed), 'l1' with newton-cg/lbfgs and 'none' with liblinear; those
# candidates can never be fitted, so half of the grid was wasted work.
params = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not swept for this sub-grid.
    # NOTE(review): recent scikit-learn releases spell this penalty=None -- confirm
    # against the installed version.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
]

# n_jobs=-1 (not 1) is the setting that uses every available CPU core.
estimator = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
estimator.fit(X_train_lasso, y_train_lasso)

# Predict once and reuse the predictions for both the accuracy and the report;
# with scoring='accuracy' this equals estimator.score(X_val_lasso, y_val_lasso).
preds = estimator.predict(X_val_lasso)
val_score = accuracy_score(y_val_lasso, preds)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_val_lasso, preds, target_names=target_names))
print("val score:", val_score)
print(estimator.best_estimator_)
print(estimator.best_params_)
print("best score:", estimator.best_score_)
# print("test score", test_score)



Συνολικός χρόνος fit και predict: 12.823372840881348 seconds
              precision    recall  f1-score   support

   home_loss       0.67      0.55      0.60      1011
    home_win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

val score: 0.6987603305785124
LogisticRegression(C=0.1, solver='liblinear')
{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
best score: 0.6942028985507246


#### SFS Forward Dataset

In [22]:
# Column subset for the forward sequential-feature-selection experiment,
# listed once and applied to both the training and validation frames.
sfs_forward_features = [
    'num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
    'back2back_visitor', 'missing_players', 'missing_players_visitor',
    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
    'diff_home_record_last_season', 'diff_road_record_last_season',
    'diff_curr_win_pct',
]

X_train_for_sfs, X_val_for_sfs = train_data[sfs_forward_features], valid_data[sfs_forward_features]

# Labels are the same series used for the full-feature runs.
y_train_for_sfs, y_val_for_sfs = y_train, y_val


In [23]:
import time

clf = LogisticRegression()

# Inverse regularisation strengths to sweep: 1e-3, 1e-2, ..., 1e3.
c_values = np.logspace(-3, 3, 7)

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# The earlier flat grid also crossed in 'elasticnet' (unsupported by all three
# solvers listed), 'l1' with newton-cg/lbfgs and 'none' with liblinear; those
# candidates can never be fitted, so half of the grid was wasted work.
params = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not swept for this sub-grid.
    # NOTE(review): recent scikit-learn releases spell this penalty=None -- confirm
    # against the installed version.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
]

# n_jobs=-1 (not 1) is the setting that uses every available CPU core.
estimator = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
estimator.fit(X_train_for_sfs, y_train_for_sfs)

# Predict once and reuse the predictions for both the accuracy and the report;
# with scoring='accuracy' this equals estimator.score(X_val_for_sfs, y_val_for_sfs).
preds = estimator.predict(X_val_for_sfs)
val_score = accuracy_score(y_val_for_sfs, preds)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_val_for_sfs, preds, target_names=target_names))
print("val score:", val_score)
print(estimator.best_estimator_)
print(estimator.best_params_)
print("best score:", estimator.best_score_)
# print("test score", test_score)



Συνολικός χρόνος fit και predict: 196.57188081741333 seconds
              precision    recall  f1-score   support

   home_loss       0.67      0.55      0.61      1011
    home_win       0.71      0.80      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420

val score: 0.6979338842975207
LogisticRegression(C=1000.0, solver='newton-cg')
{'C': 1000.0, 'penalty': 'l2', 'solver': 'newton-cg'}
best score: 0.6943478260869566


#### SFS Backwards Dataset

In [24]:
# Column subset for the backward sequential-feature-selection experiment,
# listed once and applied to both the training and validation frames.
sfs_backward_features = [
    'num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
    'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
    'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
    'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
    'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
    'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
    'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
    'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
    'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
    'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
    'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
    'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
    'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
    'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
    'diff_ast_last_7_games', 'diff_reb_last_3_games',
    'diff_win_pct_3_last_games',
]

X_train_back_sfs, X_val_back_sfs = train_data[sfs_backward_features], valid_data[sfs_backward_features]

# Labels are the same series used for the full-feature runs.
y_train_back_sfs, y_val_back_sfs = y_train, y_val

In [25]:
import time

clf = LogisticRegression()

# Inverse regularisation strengths to sweep: 1e-3, 1e-2, ..., 1e3.
c_values = np.logspace(-3, 3, 7)

# Only solver/penalty pairs that LogisticRegression accepts are searched.
# The earlier flat grid also crossed in 'elasticnet' (unsupported by all three
# solvers listed), 'l1' with newton-cg/lbfgs and 'none' with liblinear; those
# candidates can never be fitted, so half of the grid was wasted work.
params = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # C has no effect without a penalty, so it is not swept for this sub-grid.
    # NOTE(review): recent scikit-learn releases spell this penalty=None -- confirm
    # against the installed version.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none']},
]

# n_jobs=-1 (not 1) is the setting that uses every available CPU core.
estimator = GridSearchCV(clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
start_time = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
estimator.fit(X_train_back_sfs, y_train_back_sfs)

# Predict once and reuse the predictions for both the accuracy and the report;
# with scoring='accuracy' this equals estimator.score(X_val_back_sfs, y_val_back_sfs).
preds = estimator.predict(X_val_back_sfs)
val_score = accuracy_score(y_val_back_sfs, preds)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_val_back_sfs, preds, target_names=target_names))
print("val score:", val_score)
print(estimator.best_estimator_)
print(estimator.best_params_)
print("best score:", estimator.best_score_)
# print("test score", test_score)



Συνολικός χρόνος fit και predict: 202.25084853172302 seconds
              precision    recall  f1-score   support

   home_loss       0.68      0.55      0.61      1011
    home_win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.70      0.68      0.68      2420
weighted avg       0.70      0.70      0.70      2420

val score: 0.7024793388429752
LogisticRegression(solver='liblinear')
{'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
best score: 0.6968115942028985
