In [4]:
# import libraries
import os
import requests
import pandas as pd
import numpy as np
from pandas import json_normalize 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# for regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# for metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score 

# for plots 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import pairplot

# model save/load
import pickle

In [5]:
# get api key
api_key = os.environ['SPORTS_DATA_IO_API']
print(api_key)

4baa2aa7dbd644bc926c3818dde19936


In [6]:
def custom_regular_season_dataframe(season):
    # get custom dataframes
    if season != 2020:
        weeks = 17
    else:
        weeks = 11
        
    # load current season data    
    df_current_season = pd.read_csv('./data/season_team_data/{0}_data.csv'.format(str(season)))
    df_current_season.set_index('Team')
    # bool for loading in past season
    previous_season_exists = False
    # load in the past seasons data if it exists
    try:
        df_past_season = pd.read_csv('./data/season_team_data/{0}_data.csv'.format(str(season - 1)))
    except:
        previous_season_exists=False
    else:
        previous_season_exists=True

        
    # create new data frame without week 1 columns
    columns=['TotalScore','TotalWins', 'TotalFirstDowns', 'ThirdDownPercentage', 'TotalTimeOfPossession', 'GamesPlayed']
    current_season_dataframe =  pd.DataFrame(index=df_current_season['Team'], columns=columns)
    current_season_dataframe = current_season_dataframe.fillna(0)

    # populate week 1 averages with data from previous season
    if previous_season_exists:
        for team, row in current_season_dataframe.iterrows():
            current_season_dataframe.at[team, 'avg_up_to_week_1'] = df_past_season.loc[team,'Score'] / 16
            current_season_dataframe.at[team, 'first_downs_up_to_week_1'] = df_past_season.loc[team,'FirstDowns'] / 16
            current_season_dataframe.at[team, 'wins_up_to_week_1'] = 0.0
            current_season_dataframe.at[team, 'third_down_percentage_up_to_week_1'] = df_past_season.loc[team,'ThirdDownPercentage'] 
            current_season_dataframe.at[team, 'time_of_possession_up_to_week_1'] = df_past_season.loc[team,'TimeOfPossession']
    
    
    
    # loop through each regular seasons weeks games
    for week in range(1, weeks + 1):
        df_week_data = pd.read_csv('./data/season_game_data/{0}/week_{1}_data.csv'.format(season, week))
        for _, row in df_week_data.iterrows():
            # ---------------------------------------update each teams score per week------------------------------------------
            current_season_dataframe.loc[row.Team,row.Week] = row.Score
            current_season_dataframe[row.Week] = current_season_dataframe[row.Week].fillna(0)
            
            # ---------------------------------------update each teams games played -------------------------------------------
            current_season_dataframe.loc[row.Team,'GamesPlayed'] += 1
            
            # ----------------------------------------------update score to week------------------------------------------------
            current_season_dataframe.loc[row.Team,'TotalScore'] += row.Score 
            
            update_avg_score = current_season_dataframe.loc[row.Team,'TotalScore'] / current_season_dataframe.loc[row.Team,'GamesPlayed']
            
            current_season_dataframe.loc[row.Team,'avg_up_to_week_{0}'.format(row.Week + 1)] = update_avg_score 
            
            current_season_dataframe['avg_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe['avg_up_to_week_{0}'.format(row.Week + 1)].fillna(current_season_dataframe['TotalScore'] / current_season_dataframe.loc[row.Team,'GamesPlayed'])

            # ----------------------------------- store wins per team up to a certain week ----------------------------------
            if row.Score > row.OpponentScore:
                current_season_dataframe.loc[row.Team,'TotalWins'] += 1
            
            current_season_dataframe.loc[row.Team,'wins_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe.loc[row.Team,'TotalWins']
            
            current_season_dataframe['wins_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe['wins_up_to_week_{0}'.format(row.Week + 1)].fillna(0)
    
            # ----------------------------------- store first downs per team up to certain week ----------------------------------
            current_season_dataframe.loc[row.Team,'TotalFirstDowns'] += row.FirstDowns 
                    
            update_first_downs = current_season_dataframe.loc[row.Team,'TotalFirstDowns'] / current_season_dataframe.loc[row.Team,'GamesPlayed']
            current_season_dataframe.loc[row.Team,'first_downs_up_to_week_{0}'.format(row.Week + 1)] = update_first_downs
            
            current_season_dataframe['first_downs_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe['first_downs_up_to_week_{0}'.format(row.Week + 1)].fillna(current_season_dataframe['TotalFirstDowns'] / current_season_dataframe.loc[row.Team,'GamesPlayed'])

            # ----------------------------------- calculate third down % per team up to certain week ----------------------------------
            current_season_dataframe.loc[row.Team,'ThirdDownPercentage'] += row.ThirdDownPercentage 
    
            update_third_down_percentage = current_season_dataframe.loc[row.Team,'ThirdDownPercentage'] / current_season_dataframe.loc[row.Team,'GamesPlayed']

            current_season_dataframe.loc[row.Team,'third_down_percentage_up_to_week_{0}'.format(row.Week + 1)] = update_third_down_percentage

        
            current_season_dataframe['third_down_percentage_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe['third_down_percentage_up_to_week_{0}'.format(row.Week + 1)].fillna(current_season_dataframe['ThirdDownPercentage'] / current_season_dataframe.loc[row.Team,'GamesPlayed'])

            # ----------------------------------- calculate average time of possession per team ----------------------------------
            team_top = float('{0}.{1}'.format(row.TimeOfPossessionMinutes,row.TimeOfPossessionSeconds))
            
            current_season_dataframe.loc[row.Team,'TotalTimeOfPossession'] += team_top
            
            update_time_of_possession = current_season_dataframe.loc[row.Team,'TotalTimeOfPossession'] / current_season_dataframe.loc[row.Team,'GamesPlayed']
            
            current_season_dataframe.loc[row.Team,'time_of_possession_up_to_week_{0}'.format(row.Week + 1)] = update_time_of_possession

            current_season_dataframe['time_of_possession_up_to_week_{0}'.format(row.Week + 1)] = current_season_dataframe['time_of_possession_up_to_week_{0}'.format(row.Week + 1)].fillna(current_season_dataframe['TotalTimeOfPossession'] / current_season_dataframe.loc[row.Team,'GamesPlayed'])
        

    return current_season_dataframe
    
    

In [73]:
def custom_post_season_dataframe(season):
    
    # load that seasons end of season data
    df_current_season = pd.read_csv('./data/custom_season_team_data/{0}_data.csv'.format(str(season)))
    
    
    columns=['TotalScore', 'TotalFirstDowns', 'ThirdDownPercentage', 'TotalTimeOfPossession', 'GamesPlayed']
    current_season_dataframe =  pd.DataFrame(index=df_current_season['Team'], columns=columns)
    current_season_dataframe = current_season_dataframe.fillna(0)

    
    # set index to be team name
    df_current_season = df_current_season.set_index('Team')
    
    # populate week 1 averages with data from previous season
    for team, row in current_season_dataframe.iterrows():
        current_season_dataframe.at[team, 'avg_up_to_week_1_playoffs'] = df_current_season.loc[team,'avg_up_to_week_18']
        current_season_dataframe.at[team, 'first_downs_up_to_week_1_playoffs'] = df_current_season.loc[team,'first_downs_up_to_week_18']
        current_season_dataframe.at[team, 'wins_up_to_week_1_playoffs'] = df_current_season.loc[team,'wins_up_to_week_18'] 
        current_season_dataframe.at[team, 'third_down_percentage_up_to_week_1_playoffs'] = df_current_season.loc[team,'third_down_percentage_up_to_week_18'] 
        current_season_dataframe.at[team, 'time_of_possession_up_to_week_1_playoffs'] = df_current_season.loc[team,'time_of_possession_up_to_week_18']

    
    # loop through each post seasons weeks games
    if season != 2020:
        for week in range(1, 5):
            df_week_data = pd.read_csv('./data/postseason_game_data/{0}/week_{1}_data.csv'.format(season, week))
            
            for _, row in df_week_data.iterrows():
                # ---------------------------------------update each teams score per week------------------------------------------
                current_season_dataframe.loc[row.Team,row.Week] = row.Score
                current_season_dataframe[row.Week] = current_season_dataframe[row.Week].fillna(0.0)

                # ---------------------------------------update each teams games played -------------------------------------------
                current_season_dataframe.loc[row.Team,'GamesPlayed'] += 1

                # ----------------------------------------------update score to week------------------------------------------------
                current_season_dataframe.loc[row.Team,'TotalScore'] += row.Score 

                update_avg_score = current_season_dataframe.loc[row.Team,'TotalScore'] / current_season_dataframe.loc[row.Team,'GamesPlayed']

                current_season_dataframe.loc[row.Team,'avg_up_to_week_{0}_playoffs'.format(row.Week + 1)] = update_avg_score 

                if row.Week > 1:
                    current_season_dataframe['avg_up_to_week_{0}_playoffs'.format(row.Week + 1)] = current_season_dataframe['avg_up_to_week_{0}_playoffs'.format(row.Week + 1)].fillna(0.0)
                else:
                    current_season_dataframe['avg_up_to_week_{0}_playoffs'.format(row.Week + 1)] = current_season_dataframe['avg_up_to_week_{0}_playoffs'.format(row.Week + 1)].fillna(current_season_dataframe.loc[row.Team,'avg_up_to_week_{0}_playoffs'.format(row.Week)])

                # ----------------------------------- store wins per team up to a certain week ----------------------------------
                '''
                after week 1 we can make wins up to week in playoffs zero for every team because they will all have the same
                amount of wins at any given week otherwise they wouldn't be there
                '''
                current_season_dataframe.loc[row.Team,'wins_up_to_week_{0}_playoffs'.format(row.Week + 1)] = 0.0
                current_season_dataframe = current_season_dataframe.fillna(0.0)

                # ----------------------------------- store first downs per team up to certain week ----------------------------------
                current_season_dataframe.loc[row.Team,'TotalFirstDowns'] += row.FirstDowns 

                update_first_downs = current_season_dataframe.loc[row.Team,'TotalFirstDowns'] / current_season_dataframe.loc[row.Team,'GamesPlayed']
                current_season_dataframe.loc[row.Team,'first_downs_up_to_week_{0}_playoffs'.format(row.Week + 1)] = update_first_downs
                
                if row.Week > 1:
                    current_season_dataframe['first_downs_up_to_week_{0}_playoffs'.format(row.Week + 1)] = current_season_dataframe['first_downs_up_to_week_{0}_playoffs'.format(row.Week + 1)].fillna(0)
                else:
                    current_season_dataframe['first_downs_up_to_week_{0}_playoffs'.format(row.Week + 1)] = current_season_dataframe['first_downs_up_to_week_{0}_playoffs'.format(row.Week + 1)].fillna(current_season_dataframe.loc[row.Team,'first_downs_up_to_week_{0}_playoffs'.format(row.Week)])

                # ----------------------------------- calculate third down % per team up to certain week ----------------------------------
                current_season_dataframe.loc[row.Team,'ThirdDownPercentage'] += row.ThirdDownPercentage 

                update_third_down_percentage = current_season_dataframe.loc[row.Team,'ThirdDownPercentage'] / current_season_dataframe.loc[row.Team,'GamesPlayed']

                current_season_dataframe.loc[row.Team,'third_down_percentage_up_to_week_{0}_playoffs'.format(row.Week + 1)] = update_third_down_percentage

                if row.Week > 1:
                    current_season_dataframe['third_down_percentage_up_to_week_{0}_playoffs'.format(row.Week + 1)] = current_season_dataframe['third_down_percentage_up_to_week_{0}_playoffs'.format(row.Week + 1)].fillna(0)
                else:
                    current_season_dataframe['third_down_percentage_up_to_week_{0}_playoffs'.format(row.Week + 1)] = current_season_dataframe['third_down_percentage_up_to_week_{0}_playoffs'.format(row.Week + 1)].fillna(current_season_dataframe.loc[row.Team,'third_down_percentage_up_to_week_{0}_playoffs'.format(row.Week)])

                # ----------------------------------- calculate average time of possession per team ----------------------------------
                team_top = float('{0}.{1}'.format(row.TimeOfPossessionMinutes,row.TimeOfPossessionSeconds))

                current_season_dataframe.loc[row.Team,'TotalTimeOfPossession'] += team_top

                update_time_of_possession = current_season_dataframe.loc[row.Team,'TotalTimeOfPossession'] / current_season_dataframe.loc[row.Team,'GamesPlayed']

                current_season_dataframe.loc[row.Team,'time_of_possession_up_to_week_{0}_playoffs'.format(row.Week + 1)] = update_time_of_possession
                
                if row.Week > 1:
                    current_season_dataframe['time_of_possession_up_to_week_{0}_playoffs'.format(row.Week + 1)] = current_season_dataframe['time_of_possession_up_to_week_{0}_playoffs'.format(row.Week + 1)].fillna(0)
                else:
                    current_season_dataframe['time_of_possession_up_to_week_{0}_playoffs'.format(row.Week + 1)] = current_season_dataframe['time_of_possession_up_to_week_{0}_playoffs'.format(row.Week + 1)].fillna(current_season_dataframe.loc[row.Team,'time_of_possession_up_to_week_{0}_playoffs'.format(row.Week)])
        return current_season_dataframe

In [74]:
# create list of seasons
seasons = [2017, 2018,2019]
custom_dataframes = {}

# for season in seasons:
#     df = custom_dataframe(season)
#     df.to_csv('./data/custom_season_team_data/{0}_data.csv'.format(str(season)), header=True,  encoding='utf-8') 

for season in seasons:
    custom_dataframes[season] = custom_post_season_dataframe(season)

In [75]:
custom_dataframes[2019]

Unnamed: 0_level_0,TotalScore,TotalFirstDowns,ThirdDownPercentage,TotalTimeOfPossession,GamesPlayed,avg_up_to_week_1_playoffs,first_downs_up_to_week_1_playoffs,wins_up_to_week_1_playoffs,third_down_percentage_up_to_week_1_playoffs,time_of_possession_up_to_week_1_playoffs,1,avg_up_to_week_2_playoffs,wins_up_to_week_2_playoffs,first_downs_up_to_week_2_playoffs,third_down_percentage_up_to_week_2_playoffs,time_of_possession_up_to_week_2_playoffs,2,avg_up_to_week_3_playoffs,wins_up_to_week_3_playoffs,first_downs_up_to_week_3_playoffs,third_down_percentage_up_to_week_3_playoffs,time_of_possession_up_to_week_3_playoffs,3,avg_up_to_week_4_playoffs,wins_up_to_week_4_playoffs,first_downs_up_to_week_4_playoffs,third_down_percentage_up_to_week_4_playoffs,time_of_possession_up_to_week_4_playoffs,4,avg_up_to_week_5_playoffs,wins_up_to_week_5_playoffs,first_downs_up_to_week_5_playoffs,third_down_percentage_up_to_week_5_playoffs,time_of_possession_up_to_week_5_playoffs
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
ARI,0,0,0.0,0.0,0,22.5625,19.625,5.0,34.925,27.579375,0.0,19.625,0.0,19.625,35.84375,29.950625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ATL,0,0,0.0,0.0,0,23.8125,23.9375,7.0,42.08125,30.80625,0.0,19.625,0.0,19.625,35.84375,29.950625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BAL,12,29,61.1,32.6,1,33.1875,24.125,14.0,47.0375,34.615,0.0,19.625,0.0,19.625,35.84375,29.950625,12.0,12.0,0.0,29.0,61.1,32.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BUF,19,24,52.4,35.15,1,19.625,19.625,10.0,35.84375,29.950625,19.0,19.0,0.0,24.0,52.4,35.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CAR,0,0,0.0,0.0,0,21.25,20.9375,5.0,32.5375,28.76,0.0,19.625,0.0,19.625,35.84375,29.950625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CHI,0,0,0.0,0.0,0,17.5,18.5625,8.0,35.39375,30.1275,0.0,19.625,0.0,19.625,35.84375,29.950625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CIN,0,0,0.0,0.0,0,17.4375,19.5,2.0,35.7875,29.4325,0.0,19.625,0.0,19.625,35.84375,29.950625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CLE,0,0,0.0,0.0,0,20.9375,19.0625,6.0,35.73125,29.5675,0.0,19.625,0.0,19.625,35.84375,29.950625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DAL,0,0,0.0,0.0,0,27.125,23.6875,8.0,46.38125,29.423125,0.0,19.625,0.0,19.625,35.84375,29.950625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DEN,0,0,0.0,0.0,0,17.625,17.4375,7.0,32.1375,29.150625,0.0,19.625,0.0,19.625,35.84375,29.950625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df = pd.read_csv('./data/season_team_data/2017_data.csv')
df.head()