# Naive Bayes for predicting whether an NFL home team will win or lose

In [1]:
import pandas as pd

# Read spreads_and_scores_clean.csv into a DataFrame; the bare name on the
# last line makes the notebook render the frame as the cell's output.
spreads_and_scores = pd.read_csv(filepath_or_buffer='spreads_and_scores_clean.csv')
spreads_and_scores

Unnamed: 0,schedule_season,schedule_week,schedule_playoff,team_home,team_away,spread_favorite,over_under_line,stadium,weather_temperature,weather_wind_mph,weather_humidity,team_favorite_away,team_favorite_home,total_score,did_home_team_win
0,0,1,0,0,0,-3.0,35.0,0,64.0,8.0,66.0,1,0,24.0,1
1,0,1,0,1,1,-1.0,34.0,1,72.0,9.0,81.0,1,0,29.0,0
2,0,1,0,2,2,-6.0,35.0,2,65.0,5.0,77.0,1,0,18.0,1
3,0,1,0,3,3,-3.0,42.0,3,82.0,10.0,58.0,0,1,41.0,0
4,0,1,0,4,4,-9.0,38.0,4,66.0,11.0,70.0,0,1,61.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10438,41,19,1,26,19,-4.0,48.5,51,35.0,0.0,43.0,0,1,35.0,0
10439,41,19,1,3,26,-2.5,54.0,3,35.0,6.0,54.0,0,1,78.0,1
10440,41,19,1,14,14,-3.0,48.0,48,50.0,11.0,69.0,0,1,57.0,0
10441,41,20,1,3,19,-7.0,54.5,3,41.0,4.0,41.0,0,1,51.0,0


Given how Naive Bayes works, we can think of constructing this model by calculating the probability that the home team wins given whether or not it was favored. We can also factor in how heavily the favorite was favored: heavily (by more than a touchdown, that is by more than 7 points), moderately (by at least a field goal but not more than a touchdown, that is between 3 and 7 points inclusive) or very slightly (by less than a field goal, that is by fewer than 3 points). We will create a new column called favored level to represent this. **Favored level will be 0 if slightly favored, 1 if moderately favored or 2 if favored heavily.**

In [2]:
# favored level will be 0 if slightly favored, 1 if moderately favored or 2 if favored heavily
def determine_spread_favorite_lvl(spread, field_goal=3, touchdown=7):
    """Bucket a point spread into a favored level based on its magnitude.

    Parameters
    ----------
    spread : float
        Point spread of the favorite. Only the magnitude is used, so a
        spread written as -7.0 and one written as 7.0 get the same level.
    field_goal : float, default 3
        Magnitudes strictly below this value are level 0 (slightly favored).
    touchdown : float, default 7
        Magnitudes strictly above this value are level 2 (heavily favored).

    Returns
    -------
    int
        0 if ``abs(spread) < field_goal``,
        1 if ``field_goal <= abs(spread) <= touchdown``,
        2 otherwise.
    """
    # Take the magnitude here so callers do not have to remember to pass
    # abs(); a raw negative spread would otherwise always land in level 0.
    magnitude = abs(spread)
    if magnitude < field_goal:
        return 0
    if magnitude <= touchdown:
        return 1
    # NOTE(review): a NaN spread fails both comparisons above and therefore
    # also ends up here as level 2 -- confirm the input has no missing spreads.
    return 2
# Map each row's spread magnitude to its favored level. Working on the single
# 'spread_favorite' column avoids the row-wise DataFrame.apply(axis=1), which
# constructs a Series for every row just to read one value from it.
spreads_and_scores['favored_level'] = (
    spreads_and_scores['spread_favorite'].abs().map(determine_spread_favorite_lvl)
)
spreads_and_scores


Unnamed: 0,schedule_season,schedule_week,schedule_playoff,team_home,team_away,spread_favorite,over_under_line,stadium,weather_temperature,weather_wind_mph,weather_humidity,team_favorite_away,team_favorite_home,total_score,did_home_team_win,favored_level
0,0,1,0,0,0,-3.0,35.0,0,64.0,8.0,66.0,1,0,24.0,1,1
1,0,1,0,1,1,-1.0,34.0,1,72.0,9.0,81.0,1,0,29.0,0,0
2,0,1,0,2,2,-6.0,35.0,2,65.0,5.0,77.0,1,0,18.0,1,1
3,0,1,0,3,3,-3.0,42.0,3,82.0,10.0,58.0,0,1,41.0,0,1
4,0,1,0,4,4,-9.0,38.0,4,66.0,11.0,70.0,0,1,61.0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10438,41,19,1,26,19,-4.0,48.5,51,35.0,0.0,43.0,0,1,35.0,0,1
10439,41,19,1,3,26,-2.5,54.0,3,35.0,6.0,54.0,0,1,78.0,1,0
10440,41,19,1,14,14,-3.0,48.0,48,50.0,11.0,69.0,0,1,57.0,0,1
10441,41,20,1,3,19,-7.0,54.5,3,41.0,4.0,41.0,0,1,51.0,0,1


In [3]:
# Model inputs: the team_favorite_home flag plus the favored_level bucket.
features = spreads_and_scores.loc[:, ['team_favorite_home', 'favored_level']]
features.head()


Unnamed: 0,team_favorite_home,favored_level
0,0,1
1,0,0
2,0,1
3,1,1
4,1,2


In [4]:
# Prediction target: the did_home_team_win column.
labels = spreads_and_scores.loc[:, 'did_home_team_win']
labels.head()

0    1
1    0
2    1
3    0
4    0
Name: did_home_team_win, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
(train_features,
 test_features,
 train_labels,
 test_labels) = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=5,
)

In [6]:
# Import Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Fit on the training split (fit returns the estimator itself), then report
# accuracy measured on that same training split.
gnb = GaussianNB().fit(train_features, train_labels)
train_score = gnb.score(train_features, train_labels)
print(
    "Accuracy on the training data is {:0.2f}%".format(100 * train_score)
)

Accuracy on the training data is 65.49%


In [7]:
from sklearn.metrics import accuracy_score
# Compare the fitted model's predictions on the held-out split with the
# true labels and report the resulting accuracy.
test_score = accuracy_score(
    y_true=test_labels,
    y_pred=gnb.predict(test_features),
)
print(
    "Accuracy on the test data is {:0.2f}%".format(100 * test_score)
)

Accuracy on the test data is 66.92%


The model scored slightly higher on the test data (66.92%) than on the training data (65.49%). Overall the model gives around 66% accuracy, which is about a 1 percentage point increase over what we saw originally with logistic regression.

In [11]:
# Scratch check: prints the integers 3 through 10, the same range the
# cross-validation cell further down iterates over as its fold counts.
for i in range(3,11):
    print(i)

3
4
5
6
7
8
9
10


In [21]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Mean cross-validated accuracy on the training split for each fold count
# from 3 to 10, keyed by the fold count.
cross_val_scores = {}

# Use a dedicated name for the cross-validation estimator. Rebinding `gnb`
# to a fresh GaussianNB() here would replace the model fitted earlier in the
# notebook with an unfitted one (cross_val_score fits clones, not the object
# it is handed), so re-running the test-accuracy cell afterwards would fail.
cv_estimator = GaussianNB()

for n_splits in range(3, 11):
    # Shuffle with a fixed seed so the fold assignment is repeatable.
    k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    cross_val_scores[n_splits] = np.mean(
        cross_val_score(cv_estimator, train_features, train_labels, cv=k_fold)
    )
print(cross_val_scores)
# NOTE: despite its name, this holds the fold count (the dict key) whose mean
# score is highest, not the score itself; the score is looked up below.
highest_cross_val_score = max(cross_val_scores, key=cross_val_scores.get)
# Average of the per-fold-count means across all fold counts tried.
mean_cross_val_score = sum(cross_val_scores.values())/len(cross_val_scores)
print(f"Mean cross validation score: {mean_cross_val_score}")
print(f"Best cross val score with {highest_cross_val_score}-fold cross validation gives a score of {cross_val_scores[highest_cross_val_score]}")


{3: 0.6548958053887479, 4: 0.6548973573489305, 5: 0.65489487810734, 6: 0.6548976237234337, 7: 0.6548983491681053, 8: 0.6548983482740289, 9: 0.6548978941637899, 10: 0.6548984327994729}
Mean cross validation score: 0.6548973361217312
Best cross val score with 10-fold cross validation gives a score of 0.6548984327994729
