In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [6]:
columns = [
    "turnovers", "score", "possession", "home_advantage"]

target = ["winner"]

In [16]:
file_path = Path('final_nfl_data.csv')
df = pd.read_csv(file_path)

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

df.reset_index(inplace=True, drop=True)

df = df.drop(columns=['date', 'team_id', 'name'])

df.head()

Unnamed: 0,home_advantage,turnovers,possession,score,winner
0,0,1,27:32,16,1
1,1,3,32:28,13,0
2,0,1,25:36,23,0
3,0,1,30:55,34,0
4,0,1,26:09,7,0


In [17]:
# Create our features
x = pd.get_dummies(df.drop(columns='winner'))
# Create our target
y = df['winner']

In [18]:
x.describe()

Unnamed: 0,home_advantage,turnovers,score,possession_14:45,possession_14:53,possession_15:02,possession_15:38,possession_15:44,possession_15:49,possession_16:21,...,possession_44:58,possession_45:07,possession_45:15,possession_45:23,possession_45:33,possession_45:50,possession_46:04,possession_46:13,possession_46:21,possession_47:08
count,10714.0,10714.0,10714.0,10714.0,10714.0,10714.0,10714.0,10714.0,10714.0,10714.0,...,10714.0,10714.0,10714.0,10714.0,10714.0,10714.0,10714.0,10714.0,10714.0,10714.0
mean,0.5,1.564215,22.282341,9.3e-05,9.3e-05,9.3e-05,9.3e-05,9.3e-05,9.3e-05,9.3e-05,...,9.3e-05,9.3e-05,9.3e-05,9.3e-05,9.3e-05,9.3e-05,9.3e-05,9.3e-05,9.3e-05,9.3e-05
std,0.500023,1.32888,10.279592,0.009661,0.009661,0.009661,0.009661,0.009661,0.009661,0.009661,...,0.009661,0.009661,0.009661,0.009661,0.009661,0.009661,0.009661,0.009661,0.009661,0.009661
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.5,1.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,2.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,8.0,62.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Check the balance of our target values
y.value_counts()

0    5369
1    5345
Name: winner, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1, stratify=y)
X_train.shape

(8035, 1367)

In [22]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [25]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='warn', penalty='12',
   random_state=1, solver='lbfgs', warm_start=False)

LogisticRegression(multi_class='warn', penalty='12', random_state=1)

In [26]:
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [27]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
10271,1,1
6343,0,0
9144,1,1
4512,1,1
10236,1,1
...,...,...
8351,0,1
7443,1,1
8084,1,1
7226,0,1


In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.8010451661067562

In [33]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[1091,  252],
       [ 281, 1055]], dtype=int64)

In [34]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.80      0.81      0.79      0.80      0.80      0.64      1343
          1       0.81      0.79      0.81      0.80      0.80      0.64      1336

avg / total       0.80      0.80      0.80      0.80      0.80      0.64      2679

