In [20]:
# import libraries
import requests
import pandas as pd
import numpy as np
from pandas import json_normalize 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# for regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# for metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score 

# for plots 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import pairplot

# model save/load
import pickle

In [116]:
# create function for creating custom data frame for past seasons to train the model
def custom_dataframe(season, current_season_games_dataframe):
    # create new data frame
    current_season_dataframe =  pd.DataFrame()
    
    
    
    # loop through data frame of games to construct new dataframe for season stats
    for index, row in current_season_games_dataframe.iterrows():
        # ---------------------------------------update each teams score per week------------------------------------------
        current_season_dataframe.loc[row.HomeTeam,row.Week] = row.HomeScore
        current_season_dataframe.loc[row.AwayTeam,row.Week] = row.AwayScore

        current_season_dataframe[row.Week] = current_season_dataframe[row.Week].fillna(0)

        # ---------------------------------------update each teams games played -------------------------------------------
        if 'GamesPlayed' not in current_season_dataframe.columns:
            current_season_dataframe.loc[row.HomeTeam,'GamesPlayed'] = 0
            current_season_dataframe.loc[row.AwayTeam,'GamesPlayed'] = 0

        # fill null values with 0 because as scores are appended the column will be NaN
        current_season_dataframe['GamesPlayed'] = current_season_dataframe['GamesPlayed'].fillna(0)
        # add score of that week to total score
        current_season_dataframe.loc[row.HomeTeam,'GamesPlayed'] += 1
        current_season_dataframe.loc[row.AwayTeam,'GamesPlayed'] += 1


        # ----------------------------------------------update score to week------------------------------------------------
        if 'TotalScore' not in current_season_dataframe.columns:
            current_season_dataframe.loc[row.HomeTeam,'TotalScore'] = 0
            current_season_dataframe.loc[row.AwayTeam,'TotalScore'] = 0

        # fill null values with 0 because as scores are appended the column will be NaN
        current_season_dataframe['TotalScore'] = current_season_dataframe['TotalScore'].fillna(0)
        # add score of that week to total score
        current_season_dataframe.loc[row.HomeTeam,'TotalScore'] += row.HomeScore 
        current_season_dataframe.loc[row.AwayTeam,'TotalScore'] += row.AwayScore

        # get avg points scored up to a certain week
        update_avg_home_score = current_season_dataframe.loc[row.HomeTeam,'TotalScore'] / current_season_dataframe.loc[row.HomeTeam,'GamesPlayed']
        update_avg_away_score = current_season_dataframe.loc[row.AwayTeam,'TotalScore'] / current_season_dataframe.loc[row.AwayTeam,'GamesPlayed']
        current_season_dataframe.loc[row.HomeTeam,'avg_up_to_week_{0}'.format(row.Week + 1)] = update_avg_home_score 
        current_season_dataframe.loc[row.AwayTeam,'avg_up_to_week_{0}'.format(row.Week + 1)] = update_avg_away_score

        current_season_dataframe['avg_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe['avg_up_to_week_{0}'.format(row.Week + 1)].fillna(current_season_dataframe['TotalScore'] / (row.Week))

        # ----------------------------------- store wins per team up to a certain week ----------------------------------
        if 'TotalWins' not in current_season_dataframe.columns:
            current_season_dataframe.loc[row.HomeTeam,'TotalWins'] = 0
            current_season_dataframe.loc[row.AwayTeam,'TotalWins'] = 0

        # fill null values with 0 because as scores are appended the column will be NaN
        current_season_dataframe['TotalWins'] = current_season_dataframe['TotalWins'].fillna(0)
        # check who won
        if row.HomeScore > row.AwayScore:
            current_season_dataframe.loc[row.HomeTeam,'TotalWins'] += 1
        else:
            current_season_dataframe.loc[row.AwayTeam,'TotalWins'] += 1


        current_season_dataframe.loc[row.HomeTeam,'wins_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe.loc[row.HomeTeam,'TotalWins']
        current_season_dataframe.loc[row.AwayTeam,'wins_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe.loc[row.AwayTeam,'TotalWins']

        if row.Week >= 4:
            current_season_dataframe['wins_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe['wins_up_to_week_{0}'.format(row.Week + 1)].fillna(0)
        # ----------------------------------- store first downs per team up to certain week ----------------------------------

        if 'TotalFirstDowns' not in current_season_dataframe.columns:
            current_season_dataframe.loc[row.HomeTeam,'TotalFirstDowns'] = 0
            current_season_dataframe.loc[row.AwayTeam,'TotalFirstDowns'] = 0

        # fill null values with 0 because as scores are appended the column will be NaN
        current_season_dataframe['TotalFirstDowns'] = current_season_dataframe['TotalFirstDowns'].fillna(0)
        # add score of that week to total score
        current_season_dataframe.loc[row.HomeTeam,'TotalFirstDowns'] += row.HomeFirstDowns 
        current_season_dataframe.loc[row.AwayTeam,'TotalFirstDowns'] += row.AwayFirstDowns

        # get avg points scored up to a certain week
        update_first_downs_home = current_season_dataframe.loc[row.HomeTeam,'TotalFirstDowns'] / current_season_dataframe.loc[row.HomeTeam,'GamesPlayed']
        update_first_downs_away = current_season_dataframe.loc[row.AwayTeam,'TotalFirstDowns'] / current_season_dataframe.loc[row.AwayTeam,'GamesPlayed']
        current_season_dataframe.loc[row.HomeTeam,'first_downs_up_to_week_{0}'.format(row.Week + 1)] = update_first_downs_home 
        current_season_dataframe.loc[row.AwayTeam,'first_downs_up_to_week_{0}'.format(row.Week + 1)] = update_first_downs_away

        current_season_dataframe['first_downs_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe['first_downs_up_to_week_{0}'.format(row.Week + 1)].fillna(current_season_dataframe['TotalFirstDowns'] / (row.Week))

        # ----------------------------------- calculate third down % per team up to certain week ----------------------------------

        if 'ThirdDownPercentage' not in current_season_dataframe.columns:
            current_season_dataframe.loc[row.HomeTeam,'ThirdDownPercentage'] = 0.0
            current_season_dataframe.loc[row.AwayTeam,'ThirdDownPercentage'] = 0.0

        # fill null values with 0 because as scores are appended the column will be NaN
        current_season_dataframe['ThirdDownPercentage'] = current_season_dataframe['ThirdDownPercentage'].fillna(0.0)
        # add score of that week to total score
        current_season_dataframe.loc[row.HomeTeam,'ThirdDownPercentage'] += row.HomeThirdDownPercentage 
        current_season_dataframe.loc[row.AwayTeam,'ThirdDownPercentage'] += row.AwayThirdDownPercentage

        # get avg points scored up to a certain week
        update_third_down_percentage_home = current_season_dataframe.loc[row.HomeTeam,'ThirdDownPercentage'] / current_season_dataframe.loc[row.HomeTeam,'GamesPlayed']
        update_third_down_percentage_away = current_season_dataframe.loc[row.AwayTeam,'ThirdDownPercentage'] / current_season_dataframe.loc[row.AwayTeam,'GamesPlayed']
        current_season_dataframe.loc[row.HomeTeam,'third_down_percentage_up_to_week_{0}'.format(row.Week + 1)] = update_third_down_percentage_home
        current_season_dataframe.loc[row.AwayTeam,'third_down_percentage_up_to_week_{0}'.format(row.Week + 1)] = update_third_down_percentage_away

        current_season_dataframe['third_down_percentage_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe['third_down_percentage_up_to_week_{0}'.format(row.Week + 1)].fillna(current_season_dataframe['ThirdDownPercentage'] / (row.Week))


        # ----------------------------------- calculate average time of possession per team ----------------------------------

        away_top = row.AwayTimeOfPossession.split(':')
        home_top = row.HomeTimeOfPossession.split(':')
        away_top = float('{0}.{1}'.format(away_top[0],away_top[1]))
        home_top = float('{0}.{1}'.format(home_top[0],home_top[1]))


        if 'TotalTimeOfPossession' not in current_season_dataframe.columns:
            current_season_dataframe.loc[row.HomeTeam,'TotalTimeOfPossession'] = 0.0
            current_season_dataframe.loc[row.AwayTeam,'TotalTimeOfPossession'] = 0.0

        # fill null values with 0 because as scores are appended the column will be NaN
        current_season_dataframe['TotalTimeOfPossession'] = current_season_dataframe['TotalTimeOfPossession'].fillna(0.0)
        # add score of that week to total score
        current_season_dataframe.loc[row.HomeTeam,'TotalTimeOfPossession'] += home_top
        current_season_dataframe.loc[row.AwayTeam,'TotalTimeOfPossession'] += away_top

        # get avg points scored up to a certain week
        update_time_of_possession_home = current_season_dataframe.loc[row.HomeTeam,'TotalTimeOfPossession'] / current_season_dataframe.loc[row.HomeTeam,'GamesPlayed']
        update_time_of_possession_away = current_season_dataframe.loc[row.AwayTeam,'TotalTimeOfPossession'] / current_season_dataframe.loc[row.AwayTeam,'GamesPlayed']
        current_season_dataframe.loc[row.HomeTeam,'time_of_possession_up_to_week_{0}'.format(row.Week + 1)] = update_time_of_possession_home
        current_season_dataframe.loc[row.AwayTeam,'time_of_possession_up_to_week_{0}'.format(row.Week + 1)] = update_time_of_possession_away

        current_season_dataframe['time_of_possession_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe['time_of_possession_up_to_week_{0}'.format(row.Week + 1)].fillna(current_season_dataframe['TotalTimeOfPossession'] / (row.Week ))



    # Sliced DataFrame:Df by column name
    current_season_games_dataframe = current_season_games_dataframe[["HomeTeam", "AwayTeam", "HomeScore", "AwayScore", "Week"]]

    # Engineered two columns
    current_season_games_dataframe["HomeResult"] = np.where(current_season_games_dataframe["HomeScore"] > current_season_games_dataframe["AwayScore"], 1, 0)
    current_season_games_dataframe["AwayResult"] = np.where(current_season_games_dataframe["AwayScore"] > current_season_games_dataframe["HomeScore"], 1, 0)
    
    # remove week 1 games because companies are greedy with their data
    current_season_games_dataframe = current_season_games_dataframe[current_season_games_dataframe.Week > 1]


    for index, row in current_season_games_dataframe.iterrows():
        current_season_games_dataframe.at[index, 'AwayAverage'] = current_season_dataframe.loc[row.AwayTeam,'avg_up_to_week_{0}'.format(row.Week)] if 'avg_up_to_week_{0}'.format(row.Week) in current_season_dataframe.columns else 0.0
        current_season_games_dataframe.at[index, 'HomeAverage'] = current_season_dataframe.loc[row.HomeTeam,'avg_up_to_week_{0}'.format(row.Week)] if 'avg_up_to_week_{0}'.format(row.Week) in current_season_dataframe.columns else 0.0

        current_season_games_dataframe.at[index, 'AwayWins'] = current_season_dataframe.loc[row.AwayTeam,'wins_up_to_week_{0}'.format(row.Week)] if 'wins_up_to_week_{0}'.format(row.Week) in current_season_dataframe.columns else 0.0
        current_season_games_dataframe.at[index, 'HomeWins'] = current_season_dataframe.loc[row.HomeTeam,'wins_up_to_week_{0}'.format(row.Week)] if 'wins_up_to_week_{0}'.format(row.Week) in current_season_dataframe.columns else 0.0

        current_season_games_dataframe.at[index, 'AwayFirstDowns'] = current_season_dataframe.loc[row.AwayTeam,'first_downs_up_to_week_{0}'.format(row.Week)] if 'first_downs_up_to_week_{0}'.format(row.Week) in current_season_dataframe.columns else 0.0
        current_season_games_dataframe.at[index, 'HomeFirstDowns'] = current_season_dataframe.loc[row.HomeTeam,'first_downs_up_to_week_{0}'.format(row.Week)] if 'first_downs_up_to_week_{0}'.format(row.Week) in current_season_dataframe.columns else 0.0

        current_season_games_dataframe.at[index, 'AwayTime'] = current_season_dataframe.loc[row.AwayTeam,'time_of_possession_up_to_week_{0}'.format(row.Week)] if 'time_of_possession_up_to_week_{0}'.format(row.Week) in current_season_dataframe.columns else 0.0
        current_season_games_dataframe.at[index, 'HomeTime'] = current_season_dataframe.loc[row.HomeTeam,'time_of_possession_up_to_week_{0}'.format(row.Week)] if 'time_of_possession_up_to_week_{0}'.format(row.Week) in current_season_dataframe.columns else 0.0

        current_season_games_dataframe.at[index, 'AwayThirdDowns'] = current_season_dataframe.loc[row.AwayTeam,'third_down_percentage_up_to_week_{0}'.format(row.Week)] if 'third_down_percentage_up_to_week_{0}'.format(row.Week) in current_season_dataframe.columns else 0.0
        current_season_games_dataframe.at[index, 'HomeThirdDowns'] = current_season_dataframe.loc[row.HomeTeam,'third_down_percentage_up_to_week_{0}'.format(row.Week)] if 'third_down_percentage_up_to_week_{0}'.format(row.Week) in current_season_dataframe.columns else 0.0

    return current_season_games_dataframe


In [130]:
# create list of seasons
seasons = [2019,2020]
season_games_dataframes = {}
# season_stats_dataframes = {}


# load dataframe from api
for season in seasons:

    # get individual games stats
    response = requests.get('https://api.sportsdata.io/v3/nfl/scores/json/GameStats/{0}REG?key=d8b5ea01537141eb9a320f95994b7109'.format(str(season)))
    season_games_dataframes[season] = pd.DataFrame.from_dict(response.json())

#     # get season stats
#     response = requests.get('https://api.sportsdata.io/v3/nfl/scores/json/TeamSeasonStats/{0}REG?key=d8b5ea01537141eb9a320f95994b7109'.format(str(season)))
#     season_stats_dataframes[season] = pd.DataFrame.from_dict(response.json())


In [131]:
# turn api dataframe into custom dataframe
for season in seasons:
    season_games_dataframes[season] = custom_dataframe(season, season_games_dataframes[season])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_season_games_dataframe["HomeResult"] = np.where(current_season_games_dataframe["HomeScore"] > current_season_games_dataframe["AwayScore"], 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_season_games_dataframe["AwayResult"] = np.where(current_season_games_dataframe["AwayScore"] > current_season_games_dataframe["HomeScore"], 1, 0)


In [135]:
# model creation

# get desired data
selected_features = ['AwayAverage', 'AwayWins', 'AwayFirstDowns', 'AwayTime', 'AwayThirdDowns', 'HomeAverage', 'HomeWins', 'HomeFirstDowns', 'HomeTime', 'HomeThirdDowns']
target = ['HomeResult']

# create model
model = LogisticRegression()

# train model with all past data available
for key, df in season_games_dataframes.items():
    X = df[selected_features].values
    y = df[target].values

    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)
    
    # train the model with available data
    model.fit(X_train, y_train)
    
    # predict on test set
    y_pred = model.predict(X_test)
    y_pred
    
    # check model metrics
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy Score: %f" % accuracy)


    precision = precision_score(y_test, y_pred)
    print("Precision Score: %f" % precision)

    recall = recall_score(y_test, y_pred)
    print("Recall Score: %f" % recall)

    print(f1_score(y_test, y_pred))
    

# save the model
with open('nfl_predictor.pkl','wb') as f:
    pickle.dump(model,f)


Accuracy Score: 0.541667
Precision Score: 0.538462
Recall Score: 0.583333
0.5599999999999999
Accuracy Score: 0.566667
Precision Score: 0.625000
Recall Score: 0.588235
0.6060606060606061


  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(**kwargs)
