In [10]:
# import libraries
import os
import requests
import pandas as pd
import numpy as np
from pandas import json_normalize 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# for regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# for metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score 

# for plots 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import pairplot

# model save/load
import pickle

In [11]:
# Read the sportsdata.io API key from the SPORTS_DATA_IO_API environment variable.
# os.environ[...] raises KeyError when the variable is not set, so a missing key
# fails here rather than at the first API request.
api_key = os.environ['SPORTS_DATA_IO_API']

In [12]:
def _possession_minutes(minutes, seconds):
    """Return a minutes/seconds pair as fractional minutes, e.g. (30, 30) -> 30.5."""
    return float(minutes) + float(seconds) / 60.0


def custom_regular_season_dataframe(season):
    """Build per-team running regular-season averages "up to" each week.

    Reads ``./data/season_team_data/<season>_data.csv`` for the list of teams
    and ``./data/season_game_data/<season>/week_<n>_data.csv`` for every week
    (17 weeks, or 12 when ``season == 2020``).  Every row of a weekly file is
    one team's game and must provide ``Team``, ``Week``, ``Score``,
    ``FirstDowns``, ``ThirdDownPercentage``, ``TimeOfPossessionMinutes`` and
    ``TimeOfPossessionSeconds``.

    If ``./data/season_team_data/<season - 1>_data.csv`` exists, its ``Score``
    and ``FirstDowns`` (divided by 16), ``ThirdDownPercentage`` and
    ``TimeOfPossession`` ("MM:SS" text) seed the ``*_up_to_week_1`` columns.

    Parameters
    ----------
    season : int
        Season year used to locate the CSV files.

    Returns
    -------
    pandas.DataFrame
        Indexed by team.  Holds the accumulators ``TotalScore``,
        ``TotalFirstDowns``, ``ThirdDownPercentage`` (sum of per-game
        percentages), ``TotalTimeOfPossession`` (minutes) and ``GamesPlayed``;
        one column per week label holding that week's score (0 when the team
        did not play); and, for every week ``w``, the columns
        ``avg_up_to_week_<w+1>``, ``first_downs_up_to_week_<w+1>``,
        ``third_down_percentage_up_to_week_<w+1>`` and
        ``time_of_possession_up_to_week_<w+1>`` with the team's own per-game
        average after week ``w`` (0 while the team has not played any game).

    Raises
    ------
    FileNotFoundError
        If the season file or one of the weekly files is missing.
    """
    # Only 12 weekly files are read for 2020; every other season has 17.
    weeks = 12 if season == 2020 else 17

    season_csv = './data/season_team_data/{0}_data.csv'
    df_current_season = pd.read_csv(season_csv.format(str(season)))

    # The previous season is optional and only seeds the "up to week 1" columns.
    # Only a missing file is tolerated; a malformed file should fail loudly
    # instead of being silently ignored.
    try:
        df_past_season = pd.read_csv(season_csv.format(str(season - 1)))
    except FileNotFoundError:
        df_past_season = None
    else:
        df_past_season = df_past_season.set_index('Team')

    # (accumulator column, prefix of the derived "up to week" column).
    # The order here fixes the order of the generated columns.
    metrics = [
        ('TotalScore', 'avg'),
        ('TotalFirstDowns', 'first_downs'),
        ('ThirdDownPercentage', 'third_down_percentage'),
        ('TotalTimeOfPossession', 'time_of_possession'),
    ]
    accumulators = [total for total, _ in metrics] + ['GamesPlayed']

    # Float zeros: fractional values are added to these columns in place, which
    # integer/object columns do not accept cleanly in recent pandas versions.
    teams = pd.Index(df_current_season['Team'], name='Team')
    current_season_dataframe = pd.DataFrame(0.0, index=teams, columns=accumulators)

    # populate week 1 averages with data from previous season
    if df_past_season is not None:
        for team in current_season_dataframe.index:
            current_season_dataframe.loc[team, 'avg_up_to_week_1'] = df_past_season.loc[team, 'Score'] / 16
            current_season_dataframe.loc[team, 'first_downs_up_to_week_1'] = df_past_season.loc[team, 'FirstDowns'] / 16
            current_season_dataframe.loc[team, 'third_down_percentage_up_to_week_1'] = df_past_season.loc[team, 'ThirdDownPercentage']

            # "MM:SS" -> fractional minutes.  Gluing the parts together as
            # "MM.SS" would treat seconds as a decimal fraction.
            clock = df_past_season.loc[team, 'TimeOfPossession'].split(':')
            current_season_dataframe.loc[team, 'time_of_possession_up_to_week_1'] = _possession_minutes(clock[0], clock[1])

    week_csv = './data/season_game_data/{0}/week_{1}_data.csv'
    for week in range(1, weeks + 1):
        df_week_data = pd.read_csv(week_csv.format(season, week))

        # Pass 1: record the score and update the accumulators of every team
        # that played.  The week label comes from the data, not the file name.
        week_labels = []
        for game in df_week_data.itertuples(index=False):
            team = game.Team
            if game.Week not in week_labels:
                week_labels.append(game.Week)

            current_season_dataframe.loc[team, game.Week] = game.Score
            current_season_dataframe.loc[team, 'GamesPlayed'] += 1
            current_season_dataframe.loc[team, 'TotalScore'] += game.Score
            current_season_dataframe.loc[team, 'TotalFirstDowns'] += game.FirstDowns
            current_season_dataframe.loc[team, 'ThirdDownPercentage'] += game.ThirdDownPercentage
            current_season_dataframe.loc[team, 'TotalTimeOfPossession'] += _possession_minutes(
                game.TimeOfPossessionMinutes, game.TimeOfPossessionSeconds
            )

        # Pass 2: derive the running averages for ALL teams at once.  Each team
        # is divided by its OWN games played, so a team on a bye keeps its
        # correct average instead of being divided by another team's count.
        games_played = current_season_dataframe['GamesPlayed']
        has_played = games_played > 0
        for week_label in week_labels:
            # teams without a game this week score 0 in the raw score column
            current_season_dataframe[week_label] = current_season_dataframe[week_label].fillna(0)
            for total, prefix in metrics:
                column = '{0}_up_to_week_{1}'.format(prefix, week_label + 1)
                running_average = current_season_dataframe[total] / games_played
                # no games yet -> 0 rather than a 0/0 NaN
                current_season_dataframe[column] = running_average.where(has_played, 0.0)

    return current_season_dataframe
    
    

In [13]:
def custom_post_season_dataframe(season):
    """Build per-team running post-season averages "up to" each playoff week.

    Reads ``./data/custom_season_team_data/<season>_data.csv`` (one row per
    team with the ``*_up_to_week_18`` columns) and, unless ``season == 2020``,
    ``./data/postseason_game_data/<season>/week_<1..4>_data.csv``.  Every row
    of a playoff file is one team's game with ``Team``, ``Week``, ``Score``,
    ``FirstDowns``, ``ThirdDownPercentage``, ``TimeOfPossessionMinutes`` and
    ``TimeOfPossessionSeconds``.

    The ``*_up_to_week_1_playoffs`` and ``*_up_to_week_2_playoffs`` columns
    are seeded from the ``*_up_to_week_18`` values, so a team without a game
    in playoff week 1 carries that value into week 2.  Teams that do play get
    their value overwritten with an average over their playoff games only
    (the accumulators start at zero).  From playoff week 2 onwards, teams
    without a game that week get 0 in the next "up to" column.

    Parameters
    ----------
    season : int
        Season year used to locate the CSV files.

    Returns
    -------
    pandas.DataFrame or None
        Indexed by team: the accumulators ``TotalScore``, ``TotalFirstDowns``,
        ``ThirdDownPercentage``, ``TotalTimeOfPossession`` (minutes) and
        ``GamesPlayed``, one score column per playoff week label, and the
        ``<metric>_up_to_week_<n>_playoffs`` columns.  ``None`` is returned for
        2020, for which no post-season files are read.
    """
    df_current_season = pd.read_csv('./data/custom_season_team_data/{0}_data.csv'.format(str(season)))

    # (accumulator column, prefix of the derived "up to week" column)
    metrics = [
        ('TotalScore', 'avg'),
        ('TotalFirstDowns', 'first_downs'),
        ('ThirdDownPercentage', 'third_down_percentage'),
        ('TotalTimeOfPossession', 'time_of_possession'),
    ]
    accumulators = [total for total, _ in metrics] + ['GamesPlayed']

    # Float zeros: fractional values are added to these columns in place.
    teams = pd.Index(df_current_season['Team'], name='Team')
    current_season_dataframe = pd.DataFrame(0.0, index=teams, columns=accumulators)

    # set index to be team name (same order as the frame built above)
    df_current_season = df_current_season.set_index('Team')

    # Seed playoff weeks 1 and 2 with the end-of-regular-season values.  Week 2
    # is seeded for teams that do not play in week 1; every team that plays
    # gets its week 2 value overwritten below.
    for playoff_week in (1, 2):
        for _, prefix in metrics:
            seeded = '{0}_up_to_week_{1}_playoffs'.format(prefix, playoff_week)
            current_season_dataframe[seeded] = df_current_season['{0}_up_to_week_18'.format(prefix)].to_numpy()

    if season == 2020:
        # No post-season games are processed for 2020; callers get None.
        return None

    week_csv = './data/postseason_game_data/{0}/week_{1}_data.csv'
    for week in range(1, 5):
        df_week_data = pd.read_csv(week_csv.format(season, week))

        for game in df_week_data.itertuples(index=False):
            team = game.Team
            week_label = game.Week

            # raw score of this playoff week; teams without a game get 0
            current_season_dataframe.loc[team, week_label] = game.Score
            current_season_dataframe[week_label] = current_season_dataframe[week_label].fillna(0.0)

            current_season_dataframe.loc[team, 'GamesPlayed'] += 1

            per_game = {
                'TotalScore': game.Score,
                'TotalFirstDowns': game.FirstDowns,
                'ThirdDownPercentage': game.ThirdDownPercentage,
                # fractional minutes; gluing "MM.SS" together would make
                # 30:05 and 30:50 both 30.5
                'TotalTimeOfPossession': float(game.TimeOfPossessionMinutes)
                + float(game.TimeOfPossessionSeconds) / 60.0,
            }

            for total, prefix in metrics:
                current_season_dataframe.loc[team, total] += per_game[total]
                running_average = (
                    current_season_dataframe.loc[team, total]
                    / current_season_dataframe.loc[team, 'GamesPlayed']
                )

                column = '{0}_up_to_week_{1}_playoffs'.format(prefix, week_label + 1)
                current_season_dataframe.loc[team, column] = running_average

                if week_label > 1:
                    # teams without a game this week get 0
                    fill_value = 0.0
                else:
                    # week 1: any still-missing value falls back to this
                    # team's previous "up to" value (the column is pre-seeded,
                    # so this normally fills nothing)
                    fill_value = current_season_dataframe.loc[
                        team, '{0}_up_to_week_{1}_playoffs'.format(prefix, week_label)
                    ]
                current_season_dataframe[column] = current_season_dataframe[column].fillna(fill_value)

    return current_season_dataframe

In [14]:
def _attach_matchup_stats(games, team_stats, column_suffix=''):
    """Add each side's "up to week N" stats to every game row of ``games`` (in place).

    ``team_stats`` is indexed by team and holds the
    ``<metric>_up_to_week_<N><column_suffix>`` columns; ``N`` is the game's
    own ``Week`` value.
    """
    # (suffix of the new Away*/Home* column, prefix of the source column);
    # the order fixes the column order of the saved CSV.
    features = [
        ('Average', 'avg'),
        ('FirstDowns', 'first_downs'),
        ('Time', 'time_of_possession'),
        ('ThirdDowns', 'third_down_percentage'),
    ]
    for index, row in games.iterrows():
        # int(): a float-typed Week column would otherwise ask for
        # "..._up_to_week_1.0", which does not exist
        week = int(row.Week)
        for feature, prefix in features:
            source = '{0}_up_to_week_{1}{2}'.format(prefix, week, column_suffix)
            games.at[index, 'Away' + feature] = team_stats.loc[row.AwayTeam, source]
            games.at[index, 'Home' + feature] = team_stats.loc[row.HomeTeam, source]


def _build_and_save_matchups(scores, team_stats, column_suffix, results_path, no_results_path):
    """Create the matchup table for ``scores`` and write it with and without results."""
    # .copy() so the new columns are added to an independent frame, not to a
    # slice of ``scores`` (avoids SettingWithCopyWarning)
    games = scores[["HomeTeam", "AwayTeam", "HomeScore", "AwayScore", "Week"]].copy()

    # 1 for the winning side; a tie leaves both results at 0
    games["HomeResult"] = np.where(games["HomeScore"] > games["AwayScore"], 1, 0)
    games["AwayResult"] = np.where(games["AwayScore"] > games["HomeScore"], 1, 0)

    _attach_matchup_stats(games, team_stats, column_suffix)

    games.to_csv(results_path, header=True, encoding='utf-8', index=False)

    # save version with no results
    games_no_results = games.drop(columns=['HomeResult', 'AwayResult', 'HomeScore', 'AwayScore'])
    games_no_results.to_csv(no_results_path, header=True, encoding='utf-8', index=False)


def custom_season_games_dataframes(season):
    """Write the per-game matchup CSVs (with and without results) for a season.

    Regular season: reads ``./data/scores_by_regular_season/<season>_data.csv``
    and ``./data/custom_season_team_data/<season>_data.csv``, then writes
    ``./data/custom_games_by_season/<season>_data.csv`` and
    ``./data/custom_games_by_season_no_results/<season>_data.csv``.

    Post season (skipped for 2020): reads
    ``./data/scores_by_post_season/<season>_data.csv`` and
    ``./data/custom_post_season_team_data/<season>_data.csv``, then writes
    ``./data/custom_postseason_games_by_season/<season>_data.csv`` and
    ``./data/custom_postseason_games_by_season_no_results/<season>_data.csv``.

    Each output row is one game: ``HomeTeam``, ``AwayTeam``, ``HomeScore``,
    ``AwayScore``, ``Week``, ``HomeResult``, ``AwayResult`` and, for both
    sides, the team's average score, first downs, time of possession and
    third-down percentage "up to" the week of the game.  The "no results"
    files omit the score and result columns.

    Parameters
    ----------
    season : int
        Season year used to locate the CSV files.

    Returns
    -------
    None
        The function only writes files; the target directories must exist.
    """
    season_tag = str(season)

    # load that whole seasons data
    df_regular_season = pd.read_csv('./data/scores_by_regular_season/{0}_data.csv'.format(season_tag))
    df_custom_regular_season = pd.read_csv(
        './data/custom_season_team_data/{0}_data.csv'.format(season_tag)
    ).set_index('Team')

    if season == 2017:
        # week 1 games are excluded for 2017
        # (presumably because no previous-season data seeds week 1 -- TODO confirm)
        df_regular_season = df_regular_season[df_regular_season.Week > 1]

    if season == 2020:
        # haven't played all 17 weeks yet
        df_regular_season = df_regular_season[df_regular_season.Week < 12]

        # get rid of covid cancelled games
        # NOTE(review): this drops rows with a missing value in ANY column of
        # the scores file, not only the score columns -- confirm that is intended.
        df_regular_season = df_regular_season.dropna(how='any', axis=0)

    _build_and_save_matchups(
        df_regular_season,
        df_custom_regular_season,
        '',
        './data/custom_games_by_season/{0}_data.csv'.format(season_tag),
        './data/custom_games_by_season_no_results/{0}_data.csv'.format(season_tag),
    )

    # in case post season doesn't exist (2020)
    if season != 2020:
        df_post_season = pd.read_csv('./data/scores_by_post_season/{0}_data.csv'.format(season_tag))
        df_custom_post_season = pd.read_csv(
            './data/custom_post_season_team_data/{0}_data.csv'.format(season_tag)
        ).set_index('Team')

        _build_and_save_matchups(
            df_post_season,
            df_custom_post_season,
            '_playoffs',
            './data/custom_postseason_games_by_season/{0}_data.csv'.format(season_tag),
            './data/custom_postseason_games_by_season_no_results/{0}_data.csv'.format(season_tag),
        )

                                                                                                  

    

In [367]:
def create_custom_season_data_pipeline(seasons=(2017, 2018, 2019, 2020)):
    """Regenerate the derived team-stat and matchup CSVs for the given seasons.

    For every season, in order:

    1. ``custom_regular_season_dataframe(season)`` is written to
       ``./data/custom_season_team_data/<season>_data.csv``;
    2. unless the season is 2020, ``custom_post_season_dataframe(season)`` is
       written to ``./data/custom_post_season_team_data/<season>_data.csv``;
    3. ``custom_season_games_dataframes(season)`` builds the matchup files.
       It reads the files written in steps 1 and 2, so the order matters.

    Parameters
    ----------
    seasons : iterable of int, optional
        Seasons to process.  Defaults to 2017 through 2020, so calling the
        function without arguments rebuilds everything; pass e.g. ``[2020]``
        to refresh a single season.

    Returns
    -------
    None
    """
    for season in seasons:
        season_tag = str(season)

        # put regular season stats in csv (the team index is kept as a column)
        regular_season_stats = custom_regular_season_dataframe(season)
        regular_season_stats.to_csv(
            './data/custom_season_team_data/{0}_data.csv'.format(season_tag),
            header=True, encoding='utf-8'
        )

        # put post season stats in csv; 2020 is skipped because no
        # post-season data is processed for it
        if season != 2020:
            post_season_stats = custom_post_season_dataframe(season)
            post_season_stats.to_csv(
                './data/custom_post_season_team_data/{0}_data.csv'.format(season_tag),
                header=True, encoding='utf-8'
            )

        custom_season_games_dataframes(season)


In [24]:
def add_this_weeks_games(season):
    """Add the current week's matchups to the season's games CSV.

    Reads ``./data/custom_games_by_season/<season>_data.csv`` and
    ``./data/custom_season_team_data/<season>_data.csv``, asks the
    sportsdata.io API for the current week and for that week's regular-season
    games, attaches both teams' "up to week N" stats to every game and writes
    the combined table back to
    ``./data/custom_games_by_season/<season>_data.csv``.

    Rows already stored for the fetched week(s) are replaced rather than
    duplicated, so the function can be re-run safely.

    The API key is taken from the module-level ``api_key``; it is sent as a
    query parameter and never embedded in the source.

    Parameters
    ----------
    season : int
        Season year; selects both the local CSV files and the ``<season>REG``
        season identifier used in the API request.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If either API request returns an error status.
    """
    season_tag = str(season)
    games_path = './data/custom_games_by_season/{0}_data.csv'.format(season_tag)

    # get seasons schedule data
    df_season = pd.read_csv(games_path)

    # get custom season data
    df_custom_regular_season = pd.read_csv(
        './data/custom_season_team_data/{0}_data.csv'.format(season_tag)
    ).set_index('Team')

    auth = {'key': api_key}

    # get the current week
    # (presumably the endpoint returns a bare week number -- TODO confirm)
    response = requests.get(
        'https://api.sportsdata.io/v3/nfl/scores/json/CurrentWeek',
        params=auth, timeout=30
    )
    response.raise_for_status()
    week = response.json()

    # get that week's games for the requested season
    response = requests.get(
        'https://api.sportsdata.io/api/nfl/odds/json/ScoresByWeek/{0}REG/{1}'.format(season, week),
        params=auth, timeout=30
    )
    response.raise_for_status()
    games_this_week = pd.DataFrame.from_dict(response.json())
    # .copy(): new columns are added below, so work on an independent frame
    games_this_week = games_this_week[['Week', 'AwayTeam', 'HomeTeam', 'AwayScore', 'HomeScore']].copy()

    # (suffix of the new Away*/Home* column, prefix of the source column)
    features = [
        ('Average', 'avg'),
        ('FirstDowns', 'first_downs'),
        ('Time', 'time_of_possession'),
        ('ThirdDowns', 'third_down_percentage'),
    ]
    for index, row in games_this_week.iterrows():
        for feature, prefix in features:
            source = '{0}_up_to_week_{1}'.format(prefix, week)
            games_this_week.at[index, 'Away' + feature] = df_custom_regular_season.loc[row.AwayTeam, source]
            games_this_week.at[index, 'Home' + feature] = df_custom_regular_season.loc[row.HomeTeam, source]

    # calculate results; games without scores yet compare as False on both
    # sides and therefore get 0 / 0
    games_this_week["HomeResult"] = np.where(games_this_week["HomeScore"] > games_this_week["AwayScore"], 1, 0)
    games_this_week["AwayResult"] = np.where(games_this_week["AwayScore"] > games_this_week["HomeScore"], 1, 0)

    # Replace any rows already stored for the fetched week(s) so re-running
    # does not duplicate games.  pd.concat replaces DataFrame.append, which
    # was removed in pandas 2.0.
    already_stored = df_season['Week'].isin(games_this_week['Week'])
    df_season = pd.concat([df_season[~already_stored], games_this_week], ignore_index=True)

    df_season.to_csv(games_path, header=True, encoding='utf-8', index=False)

In [25]:
# TODO: write the step that updates each team's "up to week N" stats

In [None]:
# Append the current week's matchups to the 2020 games CSV.
# NOTE: this performs live sportsdata.io API requests and rewrites the CSV on disk.
add_this_weeks_games(2020)