## Initialization

In [1]:
import pandas as pd
df = pd.read_csv('../../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectKBest, SelectFromModel, mutual_info_classif
from keras.layers import Dense, Dropout, LSTM, Bidirectional, Embedding
from keras.models import Sequential


import numpy as np

import time

In [97]:
train_data = df.loc[(df.season < 2016) & (df.season >= 2007)]
test_data = df.loc[df.season == 2019]

X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins


## GNB Testing per Season

In [98]:
X_train_sfm = train_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_sfm = y_train

X_test_sfm = test_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_test_sfm = y_test

win_accuracy = {}

In [99]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# κάνουμε εκπαίδευση (fit) δηλαδή ουσιαστικά υπολογίζουμε μέση τιμή και διακύμανση για όλα τα χαρακτηριστικά και κλάσεις στο training set
model = gnb.fit(X_train_sfm, y_train_sfm)
# η GaussianNB έχει builtin μέθοδο υπολογισμό accuracy. Αποθηκεύουμε την τιμή της στον πίνακά μας με τα αποτελέσματα από τα άλλα classifiers
win_accuracy['gaussian naive bayes'] = gnb.score(X_test_sfm, y_test_sfm)
# και ξανατυπώνουμε τα sorted αποτελέσματα
print("Classification Accuracy on the NBA Games Dataset (40% test set)\n")
sorted_accuracy = [(k, win_accuracy[k]) for k in sorted(win_accuracy, key=win_accuracy.get, reverse=True)]
for k, v in sorted_accuracy:
  print(k,v)

Classification Accuracy on the NBA Games Dataset (40% test set)

gaussian naive bayes 0.6733193277310925


## kNN Testing per Season

In [100]:
X_train_extra = train_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away']]
y_train_extra = y_train

X_test_extra = test_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away']]
y_test_extra = y_test

In [101]:
from sklearn.preprocessing import StandardScaler

# Scaling features
scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train_extra)
X_test_standard = scaler.transform(X_test_extra)

In [102]:
import time

# Split Data to Train and Validation

estimator = KNeighborsClassifier(n_neighbors=72)

start_time = time.time()
estimator.fit(X_train_standard, y_train_extra)

preds = estimator.predict(X_test_standard)
test_score = estimator.score(X_test_standard, y_test_extra)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_extra, preds, target_names=target_names))
print("test score", test_score)

Συνολικός χρόνος fit και predict: 0.39203548431396484 seconds
              precision    recall  f1-score   support

   home_loss       0.67      0.55      0.60       429
    home_win       0.68      0.78      0.73       523

    accuracy                           0.68       952
   macro avg       0.68      0.66      0.67       952
weighted avg       0.68      0.68      0.67       952

test score 0.6764705882352942


## MLP Testing per Season

In [103]:
X_train_extra = train_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away']]
y_train_extra = y_train

X_test_extra = test_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away']]
y_test_extra = y_test

In [104]:
from sklearn.preprocessing import StandardScaler

# Scaling features
scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train_extra)
X_test_standard = scaler.transform(X_test_extra)

In [105]:
model = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=100, solver='sgd')
model.fit(X_train_standard, y_train_extra)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=100,
              solver='sgd')

In [106]:
start_time = time.time()

preds = model.predict(X_test_standard)
test_score = model.score(X_test_standard, y_test_extra)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_extra, preds, target_names=target_names))
print("test score", test_score)


Συνολικός χρόνος fit και predict: 0.003999471664428711 seconds
              precision    recall  f1-score   support

   home_loss       0.66      0.59      0.62       429
    home_win       0.69      0.75      0.72       523

    accuracy                           0.68       952
   macro avg       0.67      0.67      0.67       952
weighted avg       0.68      0.68      0.68       952

test score 0.6775210084033614


## SVM Testing per Season

In [107]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train

X_test_rcv = test_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_test_rcv = y_test

In [108]:
# Support vector classifier
model = SVC(C=10, gamma=0.01, kernel='rbf')
model.fit(X_train_rcv, y_train_rcv)

SVC(C=10, gamma=0.01)

In [109]:
start_time = time.time()
model.fit(X_train_rcv, y_train_rcv)

preds = model.predict(X_test_rcv)
test_score = model.score(X_test_rcv, y_test_rcv)
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_rcv, preds, target_names=target_names))
print("test score", test_score)


Συνολικός χρόνος fit και predict: 5.6123151779174805 seconds
              precision    recall  f1-score   support

   home_loss       0.69      0.50      0.58       429
    home_win       0.67      0.82      0.73       523

    accuracy                           0.67       952
   macro avg       0.68      0.66      0.66       952
weighted avg       0.68      0.67      0.66       952

test score 0.6743697478991597


## Random Forest Testing per Season

In [110]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train

X_test_rcv = test_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_test_rcv = y_test

In [111]:
estimator = RandomForestClassifier(bootstrap=True, max_depth=8, max_features='auto', 
min_samples_leaf=4, min_samples_split=8, n_estimators=100)
estimator.fit(X_train_rcv, y_train_rcv)

RandomForestClassifier(max_depth=8, min_samples_leaf=4, min_samples_split=8)

In [112]:
# Predict
preds = estimator.predict(X_test_rcv)
test_score = estimator.score(X_test_rcv, y_test_rcv)

target_names=['home loss', 'home win']

start_time = time.time()

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_rcv, preds, target_names=target_names))

print("test score", test_score)

Συνολικός χρόνος fit και predict: 0.0 seconds
              precision    recall  f1-score   support

   home loss       0.66      0.57      0.61       429
    home win       0.68      0.75      0.72       523

    accuracy                           0.67       952
   macro avg       0.67      0.66      0.66       952
weighted avg       0.67      0.67      0.67       952

test score 0.6722689075630253


## XGBoost Testing per Season

In [113]:
X_train_lasso = train_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_lasso = y_train


X_test_lasso = test_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]

y_test_lasso = y_test

In [114]:
# XG Boost classifier
model = xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, 
min_child_weight=3, n_estimators=100)
model.fit(X_train_lasso, y_train_lasso)

start_time = time.time()

preds = model.predict(X_test_lasso)
test_score = model.score(X_test_lasso, y_test_lasso)



In [115]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_lasso, preds, target_names=target_names))
print("test score", test_score)


Συνολικός χρόνος fit και predict: 1.341022253036499 seconds
              precision    recall  f1-score   support

   home_loss       0.67      0.58      0.62       429
    home_win       0.69      0.76      0.72       523

    accuracy                           0.68       952
   macro avg       0.68      0.67      0.67       952
weighted avg       0.68      0.68      0.68       952

test score 0.6817226890756303


## 2-Stage Stacking Testing per Season

In [116]:
X_train_lasso = train_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_lasso = y_train


X_test_lasso = test_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_test_lasso = y_test

In [117]:
clf = [ ('svm', SVC(C=1, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
        ('gnb', GaussianNB())]


ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')

stacking_model = StackingClassifier(estimators=clf, final_estimator=mlp, stack_method='auto', n_jobs=-1)

In [118]:
start_time = time.time()
stacking_model.fit(X_train_lasso, y_train_lasso)

preds = stacking_model.predict(X_test_lasso)
test_score = stacking_model.score(X_test_lasso, y_test_lasso)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_lasso, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 27.39257788658142 seconds
              precision    recall  f1-score   support

   home_loss       0.66      0.61      0.63       429
    home_win       0.70      0.75      0.72       523

    accuracy                           0.68       952
   macro avg       0.68      0.68      0.68       952
weighted avg       0.68      0.68      0.68       952

test score: 0.6827731092436975


## 3-Stage Stacking Testing per Season

In [119]:
X_train_lasso = train_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_lasso = y_train


X_test_lasso = test_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_test_lasso = y_test

In [120]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Scaling features
scaler = MinMaxScaler()
X_train_standard = scaler.fit_transform(X_train_lasso)
X_test_standard = scaler.transform(X_test_lasso)

In [121]:
layer_one_estimators = [ 
        ('svm', SVC(C=1, gamma=0.0001, kernel='rbf')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
        # ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100))
        ]   
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
layer_two_estimators = [
        ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
        # ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
        ('gnb', GaussianNB())]

ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')


layer_two = StackingClassifier(estimators=layer_one_estimators, final_estimator=ada)

# make predictions with the 3-stage stacking model

stacking_model = StackingClassifier(estimators=layer_two_estimators, final_estimator=layer_two) 

In [122]:
start_time = time.time()
stacking_model.fit(X_train_standard, y_train_lasso)

preds = stacking_model.predict(X_test_standard)
test_score = stacking_model.score(X_test_standard, y_test_lasso)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_lasso, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 28.241799116134644 seconds
              precision    recall  f1-score   support

   home_loss       0.63      0.69      0.66       429
    home_win       0.72      0.67      0.70       523

    accuracy                           0.68       952
   macro avg       0.68      0.68      0.68       952
weighted avg       0.68      0.68      0.68       952

test score: 0.6785714285714286


## Voting Testing per Season

In [123]:
X_train_lasso = train_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_lasso = y_train

X_test_lasso = test_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_test_lasso = y_test

In [124]:
# create a voting classifier with hard voting
second_voting_classifier_hard = VotingClassifier(
    estimators = [
            # ('svm', SVC(C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(n_neighbors=72)),
                  # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=200, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
                  # ('gnb', GaussianNB())
                  ],
    voting='hard', weights=[2,1,1])


# make predictions with the hard voting model
second_voting_classifier_hard.fit(X_train_lasso, y_train_lasso)
y_pred_vch = second_voting_classifier_hard.predict(X_test_lasso)

In [96]:
test_score = second_voting_classifier_hard.score(X_test_lasso, y_test_lasso)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_lasso, y_pred_vch, target_names=target_names))
print("test score:", test_score)

Συνολικός χρόνος fit και predict: 55.9946665763855 seconds
              precision    recall  f1-score   support

   home_loss       0.62      0.58      0.60       495
    home_win       0.72      0.76      0.74       716

    accuracy                           0.69      1211
   macro avg       0.67      0.67      0.67      1211
weighted avg       0.68      0.69      0.68      1211

test score: 0.685383980181668


## Pipelines Testing per Season

In [18]:
pipe_model = Pipeline([
    ('selector', SelectFromModel(RandomForestClassifier())),
    ('scaler', StandardScaler()),
    # ('normalizer', MinMaxScaler()),
    ('classifier', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', min_samples_leaf=4, min_samples_split=12, n_estimators=100))
])

pipe_model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 SelectFromModel(estimator=RandomForestClassifier())),
                ('scaler', StandardScaler()),
                ('classifier',
                 RandomForestClassifier(max_depth=8, max_features='sqrt',
                                        min_samples_leaf=4,
                                        min_samples_split=12))])

In [19]:
# Predict
preds = pipe_model.predict(X_test)
test_score = pipe_model.score(X_test, y_test)

target_names=['home loss', 'home win']

print(classification_report(y_test, preds, target_names=target_names))

print("test score", test_score)


              precision    recall  f1-score   support

   home loss       0.68      0.54      0.60       429
    home win       0.68      0.79      0.73       523

    accuracy                           0.68       952
   macro avg       0.68      0.67      0.67       952
weighted avg       0.68      0.68      0.67       952

test score 0.6785714285714286


## ANN_LSTM Testing per Season

In [15]:
model = Sequential()
model.add(LSTM(50, activation='tanh', recurrent_dropout=0.2, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(50, activation='relu', recurrent_dropout=0.2, return_sequences=True))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

In [16]:
arr_X_train = X_train.to_numpy()
arr_X_train = arr_X_train.reshape(arr_X_train.shape[0], arr_X_train.shape[1], 1)

arr_X_test = X_test.to_numpy()
arr_X_test = arr_X_test.reshape(arr_X_test.shape[0], arr_X_test.shape[1], 1)

In [17]:
model.fit(arr_X_train, y_train, epochs=100, batch_size=32)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x180090e9130>

In [18]:
# evaluate the keras model
scores = model.evaluate(arr_X_test, y_test, verbose=0)
print('Accuracy: %.2f' % (scores[1]*100))

Accuracy: 65.59
