In [49]:
# import libraries
import os
import requests
import pandas as pd
import numpy as np
from pandas import json_normalize 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# for regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# for metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score 

# for plots 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import pairplot


# Random Forest model
from sklearn.ensemble import RandomForestClassifier

# model save/load
import pickle

In [50]:
# bring in all training data from csv
# dataframe to combine all dataframes
all_data = pd.DataFrame()
for file in os.listdir("./data/custom_games_by_season"):
    df = pd.read_csv('./data/custom_games_by_season/{0}'.format(file))
    df = df.drop(columns=['Unnamed: 0'])   
    all_data = all_data.append(df, ignore_index = True) 
    
# remove null columns (no idea where they come from)
all_data = all_data.dropna(axis=1, how='all')
all_data

Unnamed: 0,HomeTeam,AwayTeam,HomeScore,AwayScore,HomeResult,AwayResult,AwayAverage,HomeAverage,AwayWins,HomeWins,AwayFirstDowns,HomeFirstDowns,AwayTime,HomeTime,AwayThirdDowns,HomeThirdDowns
0,CIN,HOU,9.0,13.0,0,1,7.0,0.0,0.0,0.0,23.0,14.0,29.55,26.0,41.7,30.8
1,BAL,CLE,24.0,10.0,1,0,18.0,20.0,0.0,1.0,20.0,17.0,31.1,34.0,25.0,42.9
2,CAR,BUF,9.0,3.0,1,0,21.0,23.0,1.0,1.0,23.0,20.0,33.4,34.21,47.1,53.8
3,IND,ARI,13.0,16.0,0,1,23.0,9.0,0.0,0.0,24.0,10.0,26.4,25.35,40.0,0.0
4,JAX,TEN,16.0,37.0,0,1,16.0,29.0,0.0,1.0,21.0,19.0,28.21,30.5,50.0,25.0
5,KC,PHI,27.0,20.0,1,0,30.0,42.0,1.0,1.0,19.0,26.0,34.16,30.14,57.1,36.4
6,NO,NE,20.0,36.0,0,1,27.0,19.0,0.0,0.0,25.0,19.0,29.46,28.44,33.3,36.4
7,PIT,MIN,26.0,9.0,1,0,29.0,21.0,1.0,1.0,23.0,16.0,31.16,28.5,64.3,38.5
8,TB,CHI,29.0,7.0,1,0,17.0,0.0,0.0,0.0,20.0,0.0,29.26,0.0,38.5,0.0
9,LAC,MIA,17.0,19.0,0,1,0.0,21.0,0.0,0.0,0.0,17.0,0.0,25.47,0.0,25.0


In [51]:
# model creation

# get desired data
selected_features = ['AwayAverage', 'AwayWins', 'AwayFirstDowns', 'AwayTime', 'AwayThirdDowns', 'HomeAverage', 'HomeWins', 'HomeFirstDowns', 'HomeTime', 'HomeThirdDowns']
target = ['HomeResult']

# create model
model = LogisticRegression()


# train model with all data available
X = all_data[selected_features].values
y = all_data[target].values

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.23, random_state=15)

# train the model with available data
model.fit(X_train, y_train)

# predict on test set
y_pred = model.predict(X_test)
y_pred

# check model metrics
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: %f" % accuracy)


precision = precision_score(y_test, y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_test, y_pred)
print("Recall Score: %f" % recall)

print(f1_score(y_test, y_pred))


# save the model
with open('nfl_predictor_lr.pkl','wb') as f:
    pickle.dump(model,f)


Accuracy Score: 0.504587
Precision Score: 0.514085
Recall Score: 0.651786
0.5748031496062992


  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
# try random forest
# model creation


# create model 
model = RandomForestClassifier()


# train the model with available data
model.fit(X_train, y_train)

# predict on test set
y_pred = model.predict(X_test)
y_pred

# check model metrics
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: %f" % accuracy)


precision = precision_score(y_test, y_pred)
print("Precision Score: %f" % precision)

recall = recall_score(y_test, y_pred)
print("Recall Score: %f" % recall)

print(f1_score(y_test, y_pred))


# save the model
with open('nfl_predictor_rf.pkl','wb') as f:
    pickle.dump(model,f)


  model.fit(X_train, y_train)


Accuracy Score: 0.555046
Precision Score: 0.553191
Recall Score: 0.696429
0.6166007905138341
