# Gaussian Naive Bayes Classifier

In [1]:
import pandas as pd
# Read the CSV and keep only the rows that have no missing values.
df = pd.read_csv('../../FeatureEngineering/MetaData/data6_&_odds.csv').dropna()

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [4]:
# Split by season: 2007 through 2013 for training, seasons after 2013 and
# before 2016 for validation, and 2016 onwards for testing.
train_data = df[(df.season >= 2007) & (df.season <= 2013)]
valid_data = df[(df.season > 2013) & (df.season < 2016)]
test_data = df[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

# Columns removed from every feature matrix (this list includes the target column).
non_feature_cols = [
    "game_date_est", "season", "game_id",
    "home_team", "visitor_team",
    "home_team_id", "visitor_team_id",
    "home_team_wins",
    "conference", "conference_visitor",
]

# Feature matrix / target pairs; the target is always `home_team_wins`.
X_train, y_train = train_data.drop(non_feature_cols, axis=1), train_data["home_team_wins"]
X_val, y_val = valid_data.drop(non_feature_cols, axis=1), valid_data["home_team_wins"]
X_test, y_test = test_data.drop(non_feature_cols, axis=1), test_data["home_team_wins"]



### GNB with Low Correlation Data

In [5]:
# Disabled cell: a wider feature list for the same experiment, kept commented out.
# The active cell that follows selects a subset of these columns.
# X_train_cor = train_data[['HG_7days','AG_7days','back2back','HG_7days_VISITOR','AG_7days_VISITOR','back2back_visitor',
# 'home_elo','visitor_elo','odds_home','odds_away','elo_diff','diff_avg_pts_home','diff_avg_pts_away',
# 'diff_avg_fg3_pct_away','diff_avg_reb_home']]
# y_train_cor = y_train

# X_test_cor = test_data[['HG_7days','AG_7days','back2back','HG_7days_VISITOR','AG_7days_VISITOR','back2back_visitor',
# 'home_elo','visitor_elo','odds_home','odds_away','elo_diff','diff_avg_pts_home','diff_avg_pts_away',
# 'diff_avg_fg3_pct_away','diff_avg_reb_home']]
# y_test_cor = y_test

# win_accuracy = {}

In [14]:
# Columns used for the "low correlation" experiment; the same list is applied
# to the training and the test frame.
low_corr_features = [
    'HG_7days', 'HG_7days_VISITOR',
    'odds_home', 'odds_away',
    'diff_avg_pts_home', 'diff_avg_pts_away',
    'diff_avg_fg3_pct_away', 'diff_avg_reb_home',
]

X_train_cor, y_train_cor = train_data[low_corr_features], y_train
X_test_cor, y_test_cor = test_data[low_corr_features], y_test

# Maps a classifier name to its test accuracy; filled in by the next cell.
win_accuracy = dict()

In [15]:
from sklearn.naive_bayes import GaussianNB

# Training (fit) essentially computes the mean and variance of every feature
# for every class on the training set.
gnb = GaussianNB()
model = gnb.fit(X_train_cor, y_train_cor)

# GaussianNB has a built-in method for computing accuracy; store its value in
# the table that collects the results of the classifiers.
win_accuracy['gaussian naive bayes'] = gnb.score(X_test_cor, y_test_cor)

# Print the results again, sorted by accuracy in descending order.
# NOTE(review): the test set is selected by season (season >= 2016), so the
# "40% test set" wording in the header may not match the real split — confirm.
print("Classification Accuracy on the NBA Games Dataset (40% test set)\n")
sorted_accuracy = sorted(win_accuracy.items(), key=lambda entry: entry[1], reverse=True)
for clf_name, clf_accuracy in sorted_accuracy:
    print(clf_name, clf_accuracy)

Classification Accuracy on the NBA Games Dataset (40% test set)

gaussian naive bayes 0.6685577132882391


In [16]:
# Predict the label of every row of the test feature matrix.
y_pred = gnb.predict(X_test_cor)
target_names = ['home_loss', 'home_win']

# Compare the actual labels (y_test_cor) with the predicted ones (y_pred).
# NOTE(review): accuracy_score returns a fraction, although the label below
# says "in %" — confirm which presentation is intended.
test_accuracy = accuracy_score(y_test_cor, y_pred)
print("Gaussian Naive Bayes model accuracy(in %):", test_accuracy)
print("These are the results for the model\n")

report = classification_report(y_test_cor, y_pred, target_names=target_names)
print(report)

Gaussian Naive Bayes model accuracy(in %): 0.6685577132882391
These are the results for the model

              precision    recall  f1-score   support

   home_loss       0.63      0.53      0.58      1935
    home_win       0.69      0.77      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.65      4583
weighted avg       0.66      0.67      0.66      4583



### GNB with PCA and Standard Scaling

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [20]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to all three splits. The X_* names are rebound to the
# scaler's output, so later cells see the scaled data.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (scaler.transform(split) for split in (X_train, X_val, X_test))

In [50]:
# Number of principal components to keep.
# NOTE(review): the previous comment stated that 30 components were chosen by an
# earlier feature-selection run, but the value actually used is 15 — confirm
# which one is intended.
n_components = 15

# Fit PCA on the (scaled) training split only and project every split with it.
# random_state is pinned because PCA's default 'auto' solver may select the
# randomized SVD, which would otherwise make the projection (and the scores
# reported below) vary between runs. It has no effect with a deterministic solver.
pca = PCA(n_components=n_components, random_state=0).fit(X_train)
X_train_pca = pca.transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

In [51]:
# Start a fresh results table for this experiment.
win_accuracy = dict()

from sklearn.naive_bayes import GaussianNB

# Training (fit) essentially computes the mean and variance of every feature
# (here: every principal component) for every class on the training set.
gnb = GaussianNB()
model = gnb.fit(X_train_pca, y_train)

# GaussianNB has a built-in method for computing accuracy; store its value in
# the table that collects the results of the classifiers.
win_accuracy['gaussian naive bayes'] = gnb.score(X_test_pca, y_test)

# Print the results again, sorted by accuracy in descending order.
# NOTE(review): the test set is selected by season (season >= 2016), so the
# "40% test set" wording in the header may not match the real split — confirm.
print("Classification Accuracy on the NBA Games Dataset (40% test set)\n")
sorted_accuracy = sorted(win_accuracy.items(), key=lambda entry: entry[1], reverse=True)
for clf_name, clf_accuracy in sorted_accuracy:
    print(clf_name, clf_accuracy)

Classification Accuracy on the NBA Games Dataset (40% test set)

gaussian naive bayes 0.6646301549203578


In [52]:
# Predict the label of every row of the PCA-projected test set.
y_pred = gnb.predict(X_test_pca)
target_names = ['home_loss', 'home_win']

# Compare the actual labels (y_test) with the predicted ones (y_pred).
# NOTE(review): accuracy_score returns a fraction, although the label below
# says "in %" — confirm which presentation is intended.
test_accuracy = accuracy_score(y_test, y_pred)
print("Gaussian Naive Bayes model accuracy(in %):", test_accuracy)
print("These are the results for the model\n")

report = classification_report(y_test, y_pred, target_names=target_names)
print(report)

Gaussian Naive Bayes model accuracy(in %): 0.6646301549203578
These are the results for the model

              precision    recall  f1-score   support

   home_loss       0.66      0.43      0.52      1935
    home_win       0.67      0.83      0.74      2648

    accuracy                           0.66      4583
   macro avg       0.66      0.63      0.63      4583
weighted avg       0.66      0.66      0.65      4583



### GNB with SelectFromModel(LassoCV)

The SelectFromModel(LassoCV) feature selection was already run previously; the features it selected are listed in the next cell.

In [13]:
# Columns used for the SelectFromModel experiment; the same list is applied
# to the training and the test frame.
sfm_features = [
    'elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
    'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
    'home_elo',
]

X_train_sfm, y_train_sfm = train_data[sfm_features], y_train
X_test_sfm, y_test_sfm = test_data[sfm_features], y_test

# Maps a classifier name to its test accuracy; filled in by the next cell.
win_accuracy = dict()

In [14]:
from sklearn.naive_bayes import GaussianNB

# Training (fit) essentially computes the mean and variance of every feature
# for every class on the training set.
gnb = GaussianNB()
model = gnb.fit(X_train_sfm, y_train_sfm)

# GaussianNB has a built-in method for computing accuracy; store its value in
# the table that collects the results of the classifiers.
win_accuracy['gaussian naive bayes'] = gnb.score(X_test_sfm, y_test_sfm)

# Print the results again, sorted by accuracy in descending order.
# NOTE(review): the test set is selected by season (season >= 2016), so the
# "40% test set" wording in the header may not match the real split — confirm.
print("Classification Accuracy on the NBA Games Dataset (40% test set)\n")
sorted_accuracy = sorted(win_accuracy.items(), key=lambda entry: entry[1], reverse=True)
for clf_name, clf_accuracy in sorted_accuracy:
    print(clf_name, clf_accuracy)

Classification Accuracy on the NBA Games Dataset (40% test set)

gaussian naive bayes 0.6711760855334934


In [15]:
# Predict the label of every row of the test feature matrix.
y_pred = gnb.predict(X_test_sfm)
target_names = ['home_loss', 'home_win']

# Compare the actual labels (y_test_sfm) with the predicted ones (y_pred).
# NOTE(review): accuracy_score returns a fraction, although the label below
# says "in %" — confirm which presentation is intended.
test_accuracy = accuracy_score(y_test_sfm, y_pred)
print("Gaussian Naive Bayes model accuracy(in %):", test_accuracy)
print("These are the results for the model\n")

report = classification_report(y_test_sfm, y_pred, target_names=target_names)
print(report)

Gaussian Naive Bayes model accuracy(in %): 0.6711760855334934
These are the results for the model

              precision    recall  f1-score   support

   home_loss       0.61      0.64      0.62      1935
    home_win       0.72      0.70      0.71      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.67      0.67      4583
weighted avg       0.67      0.67      0.67      4583

