In [1]:
#packages
import pandas as pd
import numpy as np
import datetime
import sklearn
import pickle
import os
from matplotlib import pyplot as plt

# required machine learning packages
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.metrics import brier_score_loss, roc_auc_score, mean_squared_error, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.calibration import CalibratedClassifierCV as CCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingRegressor
import xgboost as xgb

# box plots
import seaborn as sns
# pairplot
from seaborn import pairplot
# Correlation plot
from statsmodels.graphics.correlation import plot_corr

pd.set_option('display.max_columns', None)

In [82]:
# set week of season model is being run for
week = 2
# set dates for model training data and model prediction data
training_start_date = '3/1/2003'
training_end_date = '9/19/2020'

prediction_start_date = '9/19/2020'
prediction_end_date = '12/31/2020'

In [83]:
# read weekly score and betting lines data
nfl_df = pd.read_csv('spreadspoke_scores.csv')
# read team lookup data
lookup_df = pd.read_csv('nfl_teams.csv')

In [84]:
# filter on season 2001 onward
nfl_df = nfl_df[nfl_df.schedule_season >= 2001]
# rename old NFL teams to new team names (St. Louis Rams -> LA Rams and San Diego Chargers -> LA Chargers)
nfl_df = nfl_df.replace(['Oakland Raiders','San Diego Chargers','St. Louis Rams'],['Las Vegas Raiders','Los Angeles Chargers','Los Angeles Rams'])
# filter lookup columns
lookup_df = lookup_df[['team_name','team_id']]
# filter out old team names from lookup
lookup_df = lookup_df[lookup_df['team_name'].isin(list(nfl_df['team_home'].unique()))].reset_index(drop=True)
# merge lookup to nfl to grab replace team_favorite_id with team_name
nfl_df = pd.merge(nfl_df,
                  lookup_df,
                  left_on='team_favorite_id',
                  right_on='team_id',
                  how='outer').rename(columns={'team_name':'team_favored'}).drop(columns=['team_favorite_id','team_id'])
nfl_df.loc[nfl_df.team_favored.isnull(), 'team_favored'] = 'PICK'
# replace playoff weeks with numbers
nfl_df = nfl_df.replace(['Wildcard','WildCard','Division','Conference','Superbowl','SuperBowl'],[18,18,19,20,21,21])
# conver data types
nfl_df['over_under_line'] = nfl_df['over_under_line'].astype(float)
nfl_df['schedule_week'] = nfl_df['schedule_week'].astype('int64')
nfl_df['schedule_date'] = pd.to_datetime(nfl_df['schedule_date'])

nfl_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,team_favored
0,2001-09-09,2001,1,False,Baltimore Ravens,17.0,6.0,Chicago Bears,-10.5,33.5,M&T Bank Stadium,False,72.0,6.0,79,,Baltimore Ravens
1,2001-09-23,2001,2,False,Cincinnati Bengals,21.0,10.0,Baltimore Ravens,-7.0,33.5,Paul Brown Stadium,False,66.0,6.0,73,,Baltimore Ravens
2,2001-10-07,2001,4,False,Baltimore Ravens,26.0,7.0,Tennessee Titans,-3.5,33.5,M&T Bank Stadium,False,48.0,14.0,52,,Baltimore Ravens
3,2001-10-21,2001,6,False,Cleveland Browns,24.0,14.0,Baltimore Ravens,-7.5,33.0,FirstEnergy Stadium,False,60.0,15.0,71,,Baltimore Ravens
4,2001-10-28,2001,7,False,Baltimore Ravens,18.0,17.0,Jacksonville Jaguars,-7.5,33.0,M&T Bank Stadium,False,44.0,10.0,45,,Baltimore Ravens


In [85]:
nfl_df.isna().sum()

schedule_date             0
schedule_season           0
schedule_week             0
schedule_playoff          0
team_home                 0
score_home               15
score_away               15
team_away                 0
spread_favorite           0
over_under_line           0
stadium                   0
stadium_neutral           0
weather_temperature     352
weather_wind_mph        352
weather_humidity       2666
weather_detail         3690
team_favored              0
dtype: int64

In [86]:
nfl_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5097 entries, 0 to 5096
Data columns (total 17 columns):
schedule_date          5097 non-null datetime64[ns]
schedule_season        5097 non-null int64
schedule_week          5097 non-null int64
schedule_playoff       5097 non-null bool
team_home              5097 non-null object
score_home             5082 non-null float64
score_away             5082 non-null float64
team_away              5097 non-null object
spread_favorite        5097 non-null float64
over_under_line        5097 non-null float64
stadium                5097 non-null object
stadium_neutral        5097 non-null bool
weather_temperature    4745 non-null float64
weather_wind_mph       4745 non-null float64
weather_humidity       2431 non-null object
weather_detail         1407 non-null object
team_favored           5097 non-null object
dtypes: bool(2), datetime64[ns](1), float64(6), int64(2), object(6)
memory usage: 647.1+ KB


In [87]:
nfl_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
schedule_season,5097.0,2010.076908,5.509943,2001.0,2005.0,2010.0,2015.0,2020.0
schedule_week,5097.0,9.46184,5.295781,1.0,5.0,10.0,14.0,21.0
score_home,5082.0,23.219008,10.344121,0.0,16.0,23.0,30.0,62.0
score_away,5082.0,20.80854,10.056842,0.0,13.25,20.0,27.0,59.0
spread_favorite,5097.0,-5.383657,3.408466,-26.5,-7.0,-4.5,-3.0,0.0
over_under_line,5097.0,43.369335,4.887941,30.0,40.0,43.5,46.5,63.5
weather_temperature,4745.0,60.692518,15.563017,-6.0,50.0,65.0,72.0,97.0
weather_wind_mph,4745.0,6.202107,5.451866,0.0,0.0,6.0,10.0,40.0


In [88]:
# filter columns from dataframe
nfl_df = nfl_df.drop(columns=['stadium','stadium_neutral','weather_temperature','weather_wind_mph','weather_humidity','weather_detail'])

In [89]:
nfl_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,spread_favorite,over_under_line,team_favored
0,2001-09-09,2001,1,False,Baltimore Ravens,17.0,6.0,Chicago Bears,-10.5,33.5,Baltimore Ravens
1,2001-09-23,2001,2,False,Cincinnati Bengals,21.0,10.0,Baltimore Ravens,-7.0,33.5,Baltimore Ravens
2,2001-10-07,2001,4,False,Baltimore Ravens,26.0,7.0,Tennessee Titans,-3.5,33.5,Baltimore Ravens
3,2001-10-21,2001,6,False,Cleveland Browns,24.0,14.0,Baltimore Ravens,-7.5,33.0,Baltimore Ravens
4,2001-10-28,2001,7,False,Baltimore Ravens,18.0,17.0,Jacksonville Jaguars,-7.5,33.0,Baltimore Ravens


In [90]:
# create empty dataframe to append to
appended_df = pd.DataFrame()

for team in list(nfl_df['team_home'].unique()):
    # select single team
    feature_engineer_df = nfl_df.loc[(nfl_df['team_home']==team) | (nfl_df['team_away']==team)].copy()
    feature_engineer_df['team'] = team
    # order data by season, week
    feature_engineer_df = feature_engineer_df.sort_values(['schedule_season','schedule_week'])
    # flag home games for selected team - home = 1, away = 0
    feature_engineer_df['home_or_away'] = np.where(feature_engineer_df['team_home'] == team, 1, 0)
    # create home team favored flag - home team favored = 1, home team underdog = 0
    feature_engineer_df['home_team_favored'] = np.where((feature_engineer_df['team_home']==team) & (feature_engineer_df['team_favored']==team), 1, 0)
    # team's oppponent
    feature_engineer_df['opponent'] = np.where(feature_engineer_df['team_away']==team, feature_engineer_df['team_home'], feature_engineer_df['team_away'])
    # score for selected team
    feature_engineer_df['score'] = np.where(feature_engineer_df['team_away']==team, feature_engineer_df['score_away'], feature_engineer_df['score_home'])
    # opponent's score
    feature_engineer_df['opponent_score'] = np.where(feature_engineer_df['team_away']==team, feature_engineer_df['score_home'], feature_engineer_df['score_away'])
    # home team win flag - home team win = 1, loss/tie = 0
    feature_engineer_df['home_team_win'] = np.where(feature_engineer_df['score_home']>feature_engineer_df['score_away'], 1, 0)
    # flag for the favored team covering the spread - favorite covers = 1, favorite doesn't cover = 0
    feature_engineer_df['home_team_covers'] = np.select([(feature_engineer_df['home_team_favored']==1) &
                                                           (feature_engineer_df['score_away'] - feature_engineer_df['score_home'] < feature_engineer_df['spread_favorite']),
                                                         (feature_engineer_df['home_team_favored']==1) &
                                                           (feature_engineer_df['score_away'] - feature_engineer_df['score_home'] > feature_engineer_df['spread_favorite']),
                                                         (feature_engineer_df['home_team_favored']==0) &
                                                           (feature_engineer_df['score_home'] - feature_engineer_df['score_away'] < feature_engineer_df['spread_favorite']),
                                                         (feature_engineer_df['home_team_favored']==0) &
                                                           (feature_engineer_df['score_home'] - feature_engineer_df['score_away'] > feature_engineer_df['spread_favorite']),
                                                         (feature_engineer_df['team_favored']=='PICK') & (feature_engineer_df['score_home']>feature_engineer_df['score_away']),
                                                         (feature_engineer_df['team_favored']=='PICK') & (feature_engineer_df['score_home']<feature_engineer_df['score_away']),
                                                         (feature_engineer_df['team_favored']=='PICK') & (feature_engineer_df['score_home']==feature_engineer_df['score_away'])],
                                                        [1,0,0,1,1,0,0])
                                                         
    # over/under flag - over = 1, under/push = 0
    feature_engineer_df['over_under'] = np.where(feature_engineer_df['score_home'] + feature_engineer_df['score_away'] > feature_engineer_df['over_under_line'], 1, 0)
    # create tempoary columns for computing win percentage, average score, and average allowed points
    feature_engineer_df['team_cum_sum_score'] = feature_engineer_df.groupby('schedule_season')['score'].cumsum()
    feature_engineer_df['team_cum_sum_pts_allowed'] = feature_engineer_df.groupby('schedule_season')['opponent_score'].cumsum()
    feature_engineer_df['team_game_count'] = feature_engineer_df.groupby('schedule_season')['team'].cumcount()+1
    # calculate average score. week's 1-3 are replaced with rolling 6 game score average
    feature_engineer_df['team_score_avg'] = np.where(feature_engineer_df['schedule_week'] <= 3,
                                                     feature_engineer_df.groupby('team')['score'].transform(lambda x: x.shift().rolling(window=6).mean()).round(),
                                                     (feature_engineer_df['team_cum_sum_score'] / feature_engineer_df['team_game_count']).round().shift(1)).astype('int64')
    # calculate average points allowed. week's 1-3 are replaced with rolling 6 game points allowed average
    feature_engineer_df['team_pts_allowed_avg'] = np.where(feature_engineer_df['schedule_week'] <= 3,
                                                           feature_engineer_df.groupby('team')['opponent_score'].transform(lambda x: x.shift().rolling(window=6).mean()).round(),
                                                           (feature_engineer_df['team_cum_sum_pts_allowed'] / feature_engineer_df['team_game_count']).round().shift(1)).astype('int64')
    # calculate team's win percentage
    feature_engineer_df['team_straight_up_win'] = np.where(feature_engineer_df['score'] > feature_engineer_df['opponent_score'], 1, 0)
    feature_engineer_df['team_win_pct_placeholder'] = (feature_engineer_df.groupby('schedule_season')['team_straight_up_win'].cumsum() / feature_engineer_df['team_game_count']).round(5)
    # replace week's 1-3 with last season's win percentage averaged in
    feature_engineer_df['team_win_pct'] = np.select([feature_engineer_df['schedule_week'] == 1,
                                                     feature_engineer_df['schedule_week'] == 2,
                                                     feature_engineer_df['schedule_week'] == 3,
                                                     feature_engineer_df['schedule_week'] >= 4],
                                                    [feature_engineer_df['team_win_pct_placeholder'].shift(),
                                                     feature_engineer_df.groupby('team')['team_win_pct_placeholder'].transform(lambda x: x.shift().rolling(window=2).mean()).round(5),
                                                     feature_engineer_df.groupby('team')['team_win_pct_placeholder'].transform(lambda x: x.shift().rolling(window=3).mean()).round(5),
                                                     feature_engineer_df['team_win_pct_placeholder']])
    # selected team against the spread
    feature_engineer_df['team_ats_win'] = np.where(((feature_engineer_df['team_favored']==team) &
                                                    (feature_engineer_df['opponent_score'] - feature_engineer_df['score'] < feature_engineer_df['spread_favorite'])) |
                                                   ((feature_engineer_df['team_favored']!=team) &
                                                    (feature_engineer_df['score'] - feature_engineer_df['opponent_score'] > feature_engineer_df['spread_favorite']) |
                                                   ((feature_engineer_df['team_favored']=='PICK') &
                                                    (feature_engineer_df['score']) > feature_engineer_df['opponent_score'])),
                                                  1,0)
    
    # calculate team's ATS percentage
    # df = df.rolling(4).apply(lambda x: (weights*x).sum())
    feature_engineer_df['team_ats_pct_placeholder'] = (feature_engineer_df.groupby('schedule_season')['team_ats_win'].cumsum() / feature_engineer_df['team_game_count']).round(5)
    feature_engineer_df['team_ats_pct'] = np.select([feature_engineer_df['schedule_week'] == 1,
                                                     feature_engineer_df['schedule_week'] == 2,
                                                     feature_engineer_df['schedule_week'] == 3,
                                                     feature_engineer_df['schedule_week'] >= 4],
                                                    [feature_engineer_df['team_ats_pct_placeholder'].shift(),
                                                     feature_engineer_df.groupby('team')['team_ats_pct_placeholder'].transform(lambda x: x.shift().rolling(window=2).mean()).round(5),
                                                     feature_engineer_df.groupby('team')['team_ats_pct_placeholder'].transform(lambda x: x.shift().rolling(window=3).mean()).round(5),
                                                     feature_engineer_df['team_ats_pct_placeholder']])
    # selected team over pct
    feature_engineer_df['team_over_pct_placeholder'] = (feature_engineer_df.groupby('schedule_season')['over_under'].cumsum() / feature_engineer_df['team_game_count']).round(5)
    feature_engineer_df['team_over_pct'] = np.select([feature_engineer_df['schedule_week'] == 1,
                                                      feature_engineer_df['schedule_week'] == 2,
                                                      feature_engineer_df['schedule_week'] == 3,
                                                      feature_engineer_df['schedule_week'] >= 4],
                                                     [feature_engineer_df['team_over_pct_placeholder'].shift(),
                                                      feature_engineer_df.groupby('team')['team_over_pct_placeholder'].transform(lambda x: x.shift().rolling(window=2).mean()).round(5),
                                                      feature_engineer_df.groupby('team')['team_over_pct_placeholder'].transform(lambda x: x.shift().rolling(window=3).mean()).round(5),
                                                      feature_engineer_df['team_over_pct_placeholder']])
    # selected team average +/- against spread - more positive is better
    feature_engineer_df['team_score_ats'] = np.select([(feature_engineer_df['team_favored']==team) & (feature_engineer_df['team_ats_win']==1),
                                                       (feature_engineer_df['team_favored']==team) & (feature_engineer_df['team_straight_up_win']==0),
                                                       (feature_engineer_df['team_favored']==team) & (feature_engineer_df['team_ats_win']==0),
                                                       (feature_engineer_df['team_favored']!=team) & (feature_engineer_df['team_straight_up_win']==1),
                                                       (feature_engineer_df['team_favored']!=team) & (feature_engineer_df['team_ats_win']==1),
                                                       (feature_engineer_df['team_favored']!=team) & (feature_engineer_df['team_ats_win']==0)],
                                                     [feature_engineer_df['spread_favorite'] + feature_engineer_df['score'] - feature_engineer_df['opponent_score'],
                                                      feature_engineer_df['spread_favorite'] - (feature_engineer_df['opponent_score'] - feature_engineer_df['score']),
                                                      feature_engineer_df['spread_favorite'] + feature_engineer_df['score'] - feature_engineer_df['opponent_score'],
                                                      abs(feature_engineer_df['spread_favorite']) + feature_engineer_df['score'] - feature_engineer_df['opponent_score'],
                                                      abs(feature_engineer_df['spread_favorite']) - feature_engineer_df['opponent_score'] + feature_engineer_df['score'],
                                                      abs(feature_engineer_df['spread_favorite']) - feature_engineer_df['opponent_score'] + feature_engineer_df['score']])
    feature_engineer_df['team_cum_sum_score_ats'] = feature_engineer_df.groupby('schedule_season')['team_score_ats'].cumsum()
    feature_engineer_df['team_avg_score_ats'] = np.where(feature_engineer_df['schedule_week'] <= 3,
                                                     feature_engineer_df.groupby('team')['team_score_ats'].transform(lambda x: x.shift().rolling(window=6).mean()).round(),
                                                     (feature_engineer_df['team_cum_sum_score_ats'] / feature_engineer_df['team_game_count']).round().shift(1)).astype('int64')
    # drop unneeded columns
    feature_engineer_df = feature_engineer_df.drop(columns=['team_straight_up_win','team_cum_sum_score','team_cum_sum_pts_allowed','team_game_count','team_win_pct_placeholder',
                                                           'team_ats_pct_placeholder','team_ats_win','team_over_pct_placeholder','team_score_ats','team_cum_sum_score_ats'])
    # append each team's dataframe
    appended_df = appended_df.append(feature_engineer_df)

appended_df = appended_df.reset_index(drop=True)
appended_df = appended_df.drop(columns=['opponent','score','opponent_score'])
appended_df = appended_df.loc[:, ~appended_df.columns.isin(nfl_df.columns.to_list()[1:])]

# combine engineered features to original data
home_df = pd.merge(nfl_df[['schedule_date','team_home','score_home','score_away','team_away','spread_favorite','over_under_line']],
                    appended_df.loc[appended_df.home_or_away==1],
                    left_on=['schedule_date','team_home'], right_on=['schedule_date','team'],
                    how='outer').rename(columns={'team_score_avg':'home_team_score_avg',
                                                 'team_pts_allowed_avg':'home_team_pts_allowed_avg',
                                                 'team_win_pct':'home_team_win_pct',
                                                 'team_ats_pct':'home_team_ats_pct',
                                                 'team_over_pct':'home_team_over_pct',
                                                 'team_avg_score_ats':'home_team_avg_score_ats'}).drop(columns='team')
final_df = pd.merge(home_df,
                    appended_df.loc[appended_df.home_or_away==0].filter(items=['schedule_date','team','team_score_avg','team_pts_allowed_avg','team_win_pct','team_ats_pct','team_over_pct','team_avg_score_ats']),
                    left_on=['schedule_date','team_away'], right_on=['schedule_date','team'],
                    how='outer').rename(columns={'team_score_avg':'away_team_score_avg',
                                                 'team_pts_allowed_avg':'away_team_pts_allowed_avg',
                                                 'team_win_pct':'away_team_win_pct',
                                                 'team_ats_pct':'away_team_ats_pct',
                                                 'team_over_pct':'away_team_over_pct',
                                                 'team_avg_score_ats':'away_team_avg_score_ats'}).drop(columns=['team','home_or_away'])

print(final_df.shape)
final_df.head()

(5097, 23)


Unnamed: 0,schedule_date,team_home,score_home,score_away,team_away,spread_favorite,over_under_line,home_team_favored,home_team_win,home_team_covers,over_under,home_team_score_avg,home_team_pts_allowed_avg,home_team_win_pct,home_team_ats_pct,home_team_over_pct,home_team_avg_score_ats,away_team_score_avg,away_team_pts_allowed_avg,away_team_win_pct,away_team_ats_pct,away_team_over_pct,away_team_avg_score_ats
0,2001-09-09,Baltimore Ravens,17.0,6.0,Chicago Bears,-10.5,33.5,1,1,1,0,-9223372036854775808,-9223372036854775808,,,,-9223372036854775808,-9223372036854775808,-9223372036854775808,,,,-9223372036854775808
1,2001-09-23,Cincinnati Bengals,21.0,10.0,Baltimore Ravens,-7.0,33.5,0,1,1,0,-9223372036854775808,-9223372036854775808,,,,-9223372036854775808,-9223372036854775808,-9223372036854775808,,,,-9223372036854775808
2,2001-10-07,Baltimore Ravens,26.0,7.0,Tennessee Titans,-3.5,33.5,1,1,1,0,16,13,0.75,0.75,0.0,-2,14,22,0.0,0.0,0.33333,-10
3,2001-10-21,Cleveland Browns,24.0,14.0,Baltimore Ravens,-7.5,33.0,0,1,1,1,17,15,0.66667,0.83333,0.83333,6,19,16,0.5,0.5,0.33333,0
4,2001-10-28,Baltimore Ravens,18.0,17.0,Jacksonville Jaguars,-7.5,33.0,1,1,0,1,18,17,0.57143,0.42857,0.42857,-3,15,14,0.33333,0.5,0.5,-4


In [91]:
data_df = final_df.loc[(final_df.schedule_date>training_start_date) & (final_df.schedule_date<=training_end_date)].drop(columns='schedule_date')
data_df = data_df[['team_home','team_away','score_home','score_away','home_team_win','home_team_covers','over_under','spread_favorite','over_under_line','home_team_favored',
                   'home_team_score_avg','home_team_pts_allowed_avg','home_team_win_pct','home_team_ats_pct','home_team_over_pct','home_team_avg_score_ats','away_team_score_avg','away_team_pts_allowed_avg',
                   'away_team_win_pct','away_team_ats_pct','away_team_over_pct','away_team_avg_score_ats']]
data_df.head()

Unnamed: 0,team_home,team_away,score_home,score_away,home_team_win,home_team_covers,over_under,spread_favorite,over_under_line,home_team_favored,home_team_score_avg,home_team_pts_allowed_avg,home_team_win_pct,home_team_ats_pct,home_team_over_pct,home_team_avg_score_ats,away_team_score_avg,away_team_pts_allowed_avg,away_team_win_pct,away_team_ats_pct,away_team_over_pct,away_team_avg_score_ats
16,Baltimore Ravens,Cleveland Browns,33.0,13.0,1,1,1,-2.5,39.0,1,22,27,0.21875,0.3125,0.75,-4,20,20,0.2647,0.35294,0.23529,3
17,Arizona Cardinals,Baltimore Ravens,18.0,26.0,0,0,1,-6.5,37.0,0,13,31,0.16667,0.16667,0.5,-11,20,18,0.6,0.6,0.6,4
18,Cincinnati Bengals,Baltimore Ravens,34.0,26.0,1,1,1,-1.0,36.0,0,15,21,0.33333,0.66667,0.16667,1,22,18,0.5,0.5,0.66667,3
19,Baltimore Ravens,Denver Broncos,26.0,6.0,1,1,0,-2.5,36.5,1,22,21,0.57143,0.57143,0.57143,1,25,16,0.625,0.5,0.375,5
20,Baltimore Ravens,Jacksonville Jaguars,24.0,17.0,1,0,1,-7.0,37.5,1,23,19,0.625,0.5,0.625,3,18,26,0.125,0.25,0.75,-6


In [92]:
pred_df = final_df.loc[(final_df.schedule_date>prediction_start_date) & (final_df.schedule_date<prediction_end_date)]
pred_df = pred_df[['team_home','team_away','spread_favorite','over_under_line','home_team_favored','home_team_score_avg','home_team_pts_allowed_avg','home_team_win_pct',
            'home_team_ats_pct','home_team_over_pct','home_team_avg_score_ats','away_team_score_avg','away_team_pts_allowed_avg','away_team_win_pct','away_team_ats_pct','away_team_over_pct','away_team_avg_score_ats']]
pred_df

Unnamed: 0,team_home,team_away,spread_favorite,over_under_line,home_team_favored,home_team_score_avg,home_team_pts_allowed_avg,home_team_win_pct,home_team_ats_pct,home_team_over_pct,home_team_avg_score_ats,away_team_score_avg,away_team_pts_allowed_avg,away_team_win_pct,away_team_ats_pct,away_team_over_pct,away_team_avg_score_ats
195,Houston Texans,Baltimore Ravens,-7.0,49.5,0,22,30,0.30556,0.22222,0.22222,-3,29,16,0.91177,0.79412,0.26471,5
396,Las Vegas Raiders,New Orleans Saints,-5.5,48.5,0,20,28,0.71875,0.75,0.71875,-5,36,24,0.88236,0.82353,0.76471,5
628,Seattle Seahawks,New England Patriots,-4.0,44.5,1,24,23,0.83334,0.72222,0.75,0,22,18,0.85294,0.76471,0.20588,-5
757,Tampa Bay Buccaneers,Carolina Panthers,-8.0,47.5,1,29,25,0.21875,0.15625,0.875,3,18,36,0.15625,0.1875,0.84375,-13
979,Green Bay Packers,Detroit Lions,-6.0,49.5,1,26,23,0.88889,0.80555,0.72222,2,17,26,0.09375,0.1875,0.8125,-2
1557,Indianapolis Colts,Minnesota Vikings,-3.5,49.5,1,23,29,0.21875,0.21875,0.78125,-7,23,24,0.30556,0.27778,0.75,1
2023,New York Jets,San Francisco 49ers,-7.0,41.5,0,16,21,0.21875,0.21875,0.71875,-3,27,23,0.39473,0.28948,0.23684,-1
2169,Tennessee Titans,Jacksonville Jaguars,-7.5,44.5,1,25,21,0.78948,0.28948,0.28948,6,20,26,0.6875,0.71875,0.75,0
2807,Los Angeles Chargers,Kansas City Chiefs,-8.5,47.5,0,22,23,0.65625,0.625,0.21875,-3,35,20,0.89474,0.84211,0.26316,7
3275,Los Angeles Rams,Philadelphia Eagles,-2.0,45.5,0,28,23,0.78125,0.8125,0.21875,4,23,19,0.26471,0.20588,0.73529,0


# Develop Models

## XGBoost

In [52]:
# def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
#                        model, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
#                        do_probabilities = False):
#     gs = GridSearchCV(
#         estimator=model,
#         param_grid=param_grid, 
#         cv=cv, 
#         n_jobs=-1, 
#         scoring=scoring_fit,
#         verbose=2
#     )
#     fitted_model = gs.fit(X_train_data, y_train_data)
    
#     if do_probabilities:
#       pred = fitted_model.predict_proba(X_test_data)
#     else:
#       pred = fitted_model.predict(X_test_data)
    
#     return fitted_model, pred

In [86]:
# model = xgb.XGBRegressor()
# param_grid = {
#     'n_estimators': [400, 700, 1000],
#     'colsample_bytree': [0.7, 0.8],
#     'max_depth': [15,20,25],
#     'reg_alpha': [1.1, 1.2, 1.3],
#     'reg_lambda': [1.1, 1.2, 1.3],
#     'subsample': [0.7, 0.8, 0.9]
# }

# model, pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, 
#                                  param_grid, cv=5)

# print(np.sqrt(-model.best_score_))
# print(model.best_params_)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 24.1min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 34.9min
[Parallel(n_jobs=-1)]: Done 2430 out of 2430 | elapsed: 43.9min finished


10.26420993827411
{'colsample_bytree': 0.8, 'max_depth': 25, 'n_estimators': 400, 'reg_alpha': 1.1, 'reg_lambda': 1.3, 'subsample': 0.9}


In [87]:
# xgbReg = xgb.XGBRegressor(colsample_bytree=0.8, max_depth=25, n_estimators=400, reg_alpha=1.1, reg_lambda=1.3, subsample=0.9).fit(X_train, y_train)
# xgbReg.score(X_test, y_test)

-0.07699435221940787

In [88]:
# pickle.dump(xgbReg, open('xgb_home_score_model.sav', 'wb'))

In [53]:
# model = xgb.XGBRegressor()
# param_grid = {
#     'n_estimators': [400, 700, 1000],
#     'colsample_bytree': [0.7, 0.8],
#     'max_depth': [15,20,25],
#     'reg_alpha': [1.1, 1.2, 1.3],
#     'reg_lambda': [1.1, 1.2, 1.3],
#     'subsample': [0.7, 0.8, 0.9]
# }

# model, pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, 
#                                  param_grid, cv=5)

# print(np.sqrt(-model.best_score_))
# print(model.best_params_)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 35.2min
[Parallel(n_jobs=-1)]: Done 2430 out of 2430 | elapsed: 44.0min finished


10.11278179606647
{'colsample_bytree': 0.7, 'max_depth': 15, 'n_estimators': 400, 'reg_alpha': 1.1, 'reg_lambda': 1.2, 'subsample': 0.9}


In [74]:
# xgbReg = xgb.XGBRegressor(colsample_bytree=0.7, max_depth=15, n_estimators=400, reg_alpha=1.1, reg_lambda=1.2, subsample=0.9).fit(X_train, y_train)
# xgbReg.score(X_test, y_test)

0.007383341321407166

In [82]:
# pickle.dump(xgbReg, open('xgb_away_score_model.sav', 'wb'))

## Linear Regression to Predict Scores

In [63]:
X = data_df.iloc[:,7:]
y = data_df['score_home']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

# Fit linear regression
reg = LinearRegression().fit(X_train, y_train)

# Make prediction on the testing data
pred = reg.predict(X_test)

reg.score(X_train, y_train)

0.25386578561884054

In [64]:
pd.concat([pd.DataFrame(data_df.iloc[:,7:].columns, columns=['featues']),pd.DataFrame(np.transpose(reg.coef_), columns=['coeff'])], axis = 1).sort_values(by='coeff', ascending=False)

Unnamed: 0,featues,coeff
7,home_team_over_pct,10.922088
13,away_team_over_pct,10.624385
5,home_team_win_pct,8.585758
6,home_team_ats_pct,7.022527
2,home_team_favored,1.862707
1,over_under_line,0.475122
14,away_team_avg_score_ats,0.319556
0,spread_favorite,-0.080777
10,away_team_pts_allowed_avg,-0.087948
3,home_team_score_avg,-0.088875


In [65]:
pickle.dump(reg, open("""Models/Week"""+str(week)+"""/reg_home_score_model_w"""+str(week)+"""_2020.sav""", 'wb'))

In [66]:
y = data_df['score_away']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

# Fit linear regression
reg = LinearRegression().fit(X_train, y_train)

# Make prediction on the testing data
pred = reg.predict(X_test)

reg.score(X_train, y_train)

0.23481904931455866

In [67]:
pd.concat([pd.DataFrame(data_df.iloc[:,7:].columns, columns=['featues']),pd.DataFrame(np.transpose(reg.coef_), columns=['coeff'])], axis = 1).sort_values(by='coeff', ascending=False)

Unnamed: 0,featues,coeff
12,away_team_ats_pct,9.497795
7,home_team_over_pct,9.270143
13,away_team_over_pct,9.034679
11,away_team_win_pct,4.175239
1,over_under_line,0.535568
8,home_team_avg_score_ats,0.238489
9,away_team_score_avg,0.075912
0,spread_favorite,0.06625
4,home_team_pts_allowed_avg,-0.133126
3,home_team_score_avg,-0.27543


In [68]:
pickle.dump(reg, open("""Models/Week"""+str(week)+"""/reg_away_score_model_w"""+str(week)+"""_2020.sav""", 'wb'))

## Logistic Regression

In [69]:
y = data_df['home_team_win']

scaler = StandardScaler().fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

# Fit linear regression
logreg = LogisticRegression().fit(X_train, y_train)

# Make prediction on the testing data
pred = logreg.predict(X_test)

logreg.score(X_test, y_test)

0.7682177348551361

In [70]:
pd.concat([pd.DataFrame(data_df.iloc[:,7:].columns, columns=['featues']),pd.DataFrame(np.transpose(logreg.coef_), columns=['coeff'])], axis = 1).sort_values(by='coeff', ascending=False)

Unnamed: 0,featues,coeff
5,home_team_win_pct,1.211257
14,away_team_avg_score_ats,0.397749
4,home_team_pts_allowed_avg,0.259835
6,home_team_ats_pct,0.239876
2,home_team_favored,0.120194
9,away_team_score_avg,0.09398
7,home_team_over_pct,0.082551
1,over_under_line,0.007166
13,away_team_over_pct,-0.011988
0,spread_favorite,-0.086707


In [71]:
pickle.dump(logreg, open("""Models/Week"""+str(week)+"""/home_team_win_model_w"""+str(week)+"""_2020.sav""", 'wb'))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.76      0.66      0.71       478
           1       0.78      0.85      0.81       661

    accuracy                           0.77      1139
   macro avg       0.77      0.75      0.76      1139
weighted avg       0.77      0.77      0.77      1139



In [72]:
y = data_df['home_team_covers']

# scaler = StandardScaler().fit(X)
# X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

# Fit linear regression
logreg = LogisticRegression().fit(X_train, y_train)

# Make prediction on the testing data
pred = logreg.predict(X_test)

logreg.score(X_test, y_test)

0.7093942054433714

In [73]:
pd.concat([pd.DataFrame(data_df.iloc[:,7:].columns, columns=['featues']),pd.DataFrame(np.transpose(logreg.coef_), columns=['coeff'])], axis = 1).sort_values(by='coeff', ascending=False)

Unnamed: 0,featues,coeff
6,home_team_ats_pct,0.802198
14,away_team_avg_score_ats,0.579141
5,home_team_win_pct,0.380378
4,home_team_pts_allowed_avg,0.084715
9,away_team_score_avg,0.062229
0,spread_favorite,0.042816
7,home_team_over_pct,0.033605
13,away_team_over_pct,0.025697
10,away_team_pts_allowed_avg,-0.018053
3,home_team_score_avg,-0.063017


In [74]:
pickle.dump(logreg, open("""Models/Week"""+str(week)+"""/home_team_covers_model_w"""+str(week)+"""_2020.sav""", 'wb'))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.70      0.74      0.72       579
           1       0.72      0.68      0.70       560

    accuracy                           0.71      1139
   macro avg       0.71      0.71      0.71      1139
weighted avg       0.71      0.71      0.71      1139



In [75]:
y = data_df['over_under']

# scaler = StandardScaler().fit(X)
# X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

# Fit linear regression
logreg = LogisticRegression().fit(X_train, y_train)

# Make prediction on the testing data
pred = logreg.predict(X_test)

logreg.score(X_test, y_test)

0.7014925373134329

In [76]:
pd.concat([pd.DataFrame(data_df.iloc[:,7:].columns, columns=['featues']),pd.DataFrame(np.transpose(reg.coef_), columns=['coeff'])], axis = 1).sort_values(by='coeff', ascending=False)

Unnamed: 0,featues,coeff
12,away_team_ats_pct,9.497795
7,home_team_over_pct,9.270143
13,away_team_over_pct,9.034679
11,away_team_win_pct,4.175239
1,over_under_line,0.535568
8,home_team_avg_score_ats,0.238489
9,away_team_score_avg,0.075912
0,spread_favorite,0.06625
4,home_team_pts_allowed_avg,-0.133126
3,home_team_score_avg,-0.27543


In [77]:
pickle.dump(logreg, open("""Models/Week"""+str(week)+"""/over_under_model_w"""+str(week)+"""_2020.sav""", 'wb'))
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.69      0.73      0.71       565
           1       0.72      0.67      0.69       574

    accuracy                           0.70      1139
   macro avg       0.70      0.70      0.70      1139
weighted avg       0.70      0.70      0.70      1139



# Scoring Model Test Data Performance

In [78]:
home_score_model = pickle.load(open("""Models/Week"""+str(week)+"""/reg_home_score_model_w"""+str(week)+"""_2020.sav""", 'rb'))
away_score_model = pickle.load(open("""Models/Week"""+str(week)+"""/reg_away_score_model_w"""+str(week)+"""_2020.sav""", 'rb'))
home_team_win_model = pickle.load(open("""Models/Week"""+str(week)+"""/home_team_win_model_w"""+str(week)+"""_2020.sav""", 'rb'))
home_team_covers_model = pickle.load(open("""Models/Week"""+str(week)+"""/home_team_covers_model_w"""+str(week)+"""_2020.sav""",'rb'))
over_under_model = pickle.load(open("""Models/Week"""+str(week)+"""/over_under_model_w"""+str(week)+"""_2020.sav""", 'rb'))

X = data_df.iloc[:,7:]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

pred_test_df = pd.DataFrame()
pred_test_df['pred_score_home'] = home_score_model.predict(X_test)
pred_test_df['pred_score_away'] = away_score_model.predict(X_test)

scaler = StandardScaler().fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)

pred_test_df['pred_home_team_win'] = home_team_win_model.predict(X_test)
pred_test_df['pred_home_team_covers'] = home_team_covers_model.predict(X_test)
pred_test_df['pred_over_under'] = over_under_model.predict(X_test)

pred_test_df = pred_test_df[['pred_score_home','pred_score_away','pred_home_team_win','pred_home_team_covers','pred_over_under']]

# join predictions back to actual data
performance_df = pd.merge(pred_test_df, final_df[['score_home','score_away','home_team_favored','home_team_win','over_under','home_team_covers','spread_favorite','over_under_line']], how='inner', left_index=True, right_index=True).dropna()
# flag if score models predicted the correct team to cover - correct = 1, incorrect = 0
performance_df['team_cover_correct'] = np.where(((performance_df['home_team_favored']==1) & (performance_df['home_team_covers']==1) &
                                                 (performance_df['pred_score_home'] > performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_away'] - performance_df['pred_score_home'] < performance_df['spread_favorite'])) |
                                                ((performance_df['home_team_favored']==1) & (performance_df['home_team_covers']==0) &
                                                ((performance_df['pred_score_home'] < performance_df['pred_score_away'])) |
                                                 (((performance_df['pred_score_home'] > performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_away'] - performance_df['pred_score_home'] > performance_df['spread_favorite'])))) |
                                                ((performance_df['home_team_favored']==0) & (performance_df['home_team_covers']==0) &
                                                 (performance_df['pred_score_home'] < performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_home'] - performance_df['pred_score_away'] < performance_df['spread_favorite'])) |
                                                ((performance_df['home_team_favored']==0) & (performance_df['home_team_covers']==1) &
                                                 ((performance_df['pred_score_home'] > performance_df['pred_score_away']) |
                                                 ((performance_df['pred_score_home'] < performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_home'] - performance_df['pred_score_away'] > performance_df['spread_favorite'])))) |
                                                ((performance_df['spread_favorite']==0) &
                                                 ((performance_df['pred_score_home'] > performance_df['pred_score_away']) & (performance_df['score_home'] > performance_df['score_away']) |
                                                 (performance_df['pred_score_home'] < performance_df['pred_score_away']) & (performance_df['score_home'] < performance_df['score_away']))),
                                                1,0)
# flag if score models accurately predicted the straight up winner - correct = 1, incorrect = 0
performance_df['team_win_correct'] = np.where(((performance_df['pred_score_home'] > performance_df['pred_score_away']) & (performance_df['score_home'] > performance_df['score_away'])) |
                                              ((performance_df['pred_score_home'] < performance_df['pred_score_away']) & (performance_df['score_home'] < performance_df['score_away'])),
                                              1,0)
# flag if score models accurately predicted the straight up winner - correct = 1, incorrect = 0
performance_df['over_under_correct'] = np.where(((performance_df['pred_score_home'] + performance_df['pred_score_away'] > performance_df['over_under_line']) &
                                                (performance_df['score_home'] + performance_df['score_away'] > performance_df['over_under_line'])) |
                                               ((performance_df['pred_score_home'] + performance_df['pred_score_away'] < performance_df['over_under_line']) &
                                                (performance_df['score_home'] + performance_df['score_away'] < performance_df['over_under_line'])),
                                               1,0)

print('team_cover_pct:',(performance_df['team_cover_correct'].sum() / len(performance_df)).round(2),
      'team_win_pct:',(performance_df['team_win_correct'].sum() / len(performance_df)).round(2),
      'over_under_pct:',(performance_df['over_under_correct'].sum() / len(performance_df)).round(2))

team_cover_pct: 0.63 team_win_pct: 0.51 over_under_pct: 0.49


# Predictions

In [93]:
home_score_model = pickle.load(open("""Models/Week"""+str(week)+"""/reg_home_score_model_w"""+str(week)+"""_2020.sav""", 'rb'))
away_score_model = pickle.load(open("""Models/Week"""+str(week)+"""/reg_away_score_model_w"""+str(week)+"""_2020.sav""", 'rb'))
home_team_win_model = pickle.load(open("""Models/Week"""+str(week)+"""/home_team_win_model_w"""+str(week)+"""_2020.sav""", 'rb'))
home_team_covers_model = pickle.load(open("""Models/Week"""+str(week)+"""/home_team_covers_model_w"""+str(week)+"""_2020.sav""",'rb'))
over_under_model = pickle.load(open("""Models/Week"""+str(week)+"""/over_under_model_w"""+str(week)+"""_2020.sav""", 'rb'))

scaler = StandardScaler().fit(pred_df.iloc[:,2:])
scaled_pred_df = scaler.transform(pred_df.iloc[:,2:])

pred_df['pred_score_home'] = home_score_model.predict(pred_df.iloc[:,2:])
pred_df['pred_score_away'] = away_score_model.predict(pred_df.iloc[:,2:-1])
pred_df['pred_home_team_win'] = home_team_win_model.predict(scaled_pred_df)
pred_df['pred_home_team_covers'] = home_team_covers_model.predict(scaled_pred_df)
pred_df['pred_over_under'] = over_under_model.predict(scaled_pred_df)

pred_df = pred_df[['team_home','team_away','pred_score_home','pred_score_away','pred_home_team_win','pred_home_team_covers','pred_over_under']]
pred_df = pd.merge(pred_df, final_df[['home_team_favored','spread_favorite','over_under_line']], how='inner', left_index=True, right_index=True)
pred_df

Unnamed: 0,team_home,team_away,pred_score_home,pred_score_away,pred_home_team_win,pred_home_team_covers,pred_over_under,home_team_favored,spread_favorite,over_under_line
195,Houston Texans,Baltimore Ravens,10.367147,27.18036,0,0,0,0,-7.0,49.5
396,Las Vegas Raiders,New Orleans Saints,26.392849,27.316573,1,1,1,0,-5.5,48.5
628,Seattle Seahawks,New England Patriots,22.404478,24.052821,0,0,1,1,-4.0,44.5
757,Tampa Bay Buccaneers,Carolina Panthers,27.287593,26.675766,0,0,1,1,-8.0,47.5
979,Green Bay Packers,Detroit Lions,42.39796,17.401388,1,1,1,1,-6.0,49.5
1557,Indianapolis Colts,Minnesota Vikings,31.12532,25.938469,1,1,1,1,-3.5,49.5
2023,New York Jets,San Francisco 49ers,18.799911,23.187808,0,0,1,0,-7.0,41.5
2169,Tennessee Titans,Jacksonville Jaguars,21.628811,22.417008,0,0,0,1,-7.5,44.5
2807,Los Angeles Chargers,Kansas City Chiefs,15.400116,20.229601,1,1,0,0,-8.5,47.5
3275,Los Angeles Rams,Philadelphia Eagles,28.964268,15.315147,1,1,0,0,-2.0,45.5


# Prediction Performance

In [89]:
# join predictions back to actual data
performance_df = pd.merge(pred_df, final_df[['score_home','score_away','home_team_favored','home_team_win','over_under','home_team_covers','spread_favorite','over_under_line']], how='inner', left_index=True, right_index=True).dropna()
# flag if score models predicted the correct team to cover - correct = 1, incorrect = 0
performance_df['team_cover_correct'] = np.where(((performance_df['home_team_favored']==1) & (performance_df['home_team_covers']==1) &
                                                 (performance_df['pred_score_home'] > performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_away'] - performance_df['pred_score_home'] < performance_df['spread_favorite'])) |
                                                ((performance_df['home_team_favored']==1) & (performance_df['home_team_covers']==0) &
                                                ((performance_df['pred_score_home'] < performance_df['pred_score_away'])) |
                                                 (((performance_df['pred_score_home'] > performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_away'] - performance_df['pred_score_home'] > performance_df['spread_favorite'])))) |
                                                ((performance_df['home_team_favored']==0) & (performance_df['home_team_covers']==0) &
                                                 (performance_df['pred_score_home'] < performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_home'] - performance_df['pred_score_away'] < performance_df['spread_favorite'])) |
                                                ((performance_df['home_team_favored']==0) & (performance_df['home_team_covers']==1) &
                                                 ((performance_df['pred_score_home'] > performance_df['pred_score_away']) |
                                                 ((performance_df['pred_score_home'] < performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_home'] - performance_df['pred_score_away'] > performance_df['spread_favorite'])))) |
                                                ((performance_df['spread_favorite']==0) &
                                                 ((performance_df['pred_score_home'] > performance_df['pred_score_away']) & (performance_df['score_home'] > performance_df['score_away']) |
                                                 (performance_df['pred_score_home'] < performance_df['pred_score_away']) & (performance_df['score_home'] < performance_df['score_away']))),
                                                1,0)
# flag if score models accurately predicted the straight up winner - correct = 1, incorrect = 0
performance_df['team_win_correct'] = np.where(((performance_df['pred_score_home'] > performance_df['pred_score_away']) & (performance_df['score_home'] > performance_df['score_away'])) |
                                              ((performance_df['pred_score_home'] < performance_df['pred_score_away']) & (performance_df['score_home'] < performance_df['score_away'])),
                                              1,0)
# flag if score models accurately predicted the over - correct = 1, incorrect = 0
performance_df['over_under_correct'] = np.where(((performance_df['pred_score_home'] + performance_df['pred_score_away'] > performance_df['over_under_line']) &
                                                (performance_df['score_home'] + performance_df['score_away'] > performance_df['over_under_line'])) |
                                               ((performance_df['pred_score_home'] + performance_df['pred_score_away'] < performance_df['over_under_line']) &
                                                (performance_df['score_home'] + performance_df['score_away'] < performance_df['over_under_line'])),
                                               1,0)
# flag logistic models accurate predictions - correct = 1, incorrect = 0
performance_df['log_team_covers_correct'] = np.where(performance_df['pred_home_team_covers']==performance_df['home_team_covers'],1,0)
performance_df['log_team_win_correct'] = np.where(performance_df['pred_home_team_win']==performance_df['home_team_win'],1,0)
performance_df['log_over_under_correct'] = np.where(performance_df['pred_over_under']==performance_df['over_under'],1,0)

print('team_cover_pct:',(performance_df['team_cover_correct'].sum() / len(performance_df)).round(4)*100,
      'team_win_pct:',(performance_df['team_win_correct'].sum() / len(performance_df)).round(4)*100,
      'over_under_pct:',(performance_df['over_under_correct'].sum() / len(performance_df)).round(4)*100,
      'log_team_cover_pct:',(performance_df['log_team_covers_correct'].sum() / len(performance_df)).round(4)*100,
      'log_team_win_pct:',(performance_df['log_team_win_correct'].sum() / len(performance_df)).round(4)*100,
      'log_over_under_pct:',(performance_df['log_over_under_correct'].sum() / len(performance_df)).round(4)*100)

performance_df

team_cover_pct: 81.25 team_win_pct: 81.25 over_under_pct: 56.25 log_team_cover_pct: 81.25 log_team_win_pct: 81.25 log_over_under_pct: 43.75


Unnamed: 0,team_home,team_away,pred_score_home,pred_score_away,pred_home_team_win,pred_home_team_covers,pred_over_under,score_home,score_away,home_team_favored,home_team_win,over_under,home_team_covers,spread_favorite,over_under_line,team_cover_correct,team_win_correct,over_under_correct,log_team_covers_correct,log_team_win_correct,log_over_under_correct
194,Baltimore Ravens,Cleveland Browns,29.257308,18.058529,1,1,0,38.0,6.0,1,1,0,1,-7.0,47.5,1,1,1,1,1,1
395,New Orleans Saints,Tampa Bay Buccaneers,29.416807,18.739475,1,1,1,34.0,26.0,1,1,1,1,-4.0,47.5,1,1,1,1,1,1
434,Atlanta Falcons,Seattle Seahawks,20.543422,27.853281,0,0,0,25.0,38.0,0,0,1,0,-1.0,49.5,1,1,0,1,1,0
1183,Carolina Panthers,Las Vegas Raiders,22.073058,24.379613,0,0,0,30.0,34.0,0,0,1,0,-3.0,48.5,0,1,0,1,1,0
1343,Minnesota Vikings,Green Bay Packers,22.116934,22.619419,0,0,0,34.0,43.0,1,0,1,0,-1.5,44.0,1,1,1,1,1,0
1556,Jacksonville Jaguars,Indianapolis Colts,21.431572,21.858612,1,1,0,27.0,20.0,0,1,1,1,-7.0,44.5,1,0,0,1,1,0
1704,Los Angeles Rams,Dallas Cowboys,29.014695,23.029985,1,1,1,20.0,17.0,1,1,0,1,-1.0,51.5,1,1,0,1,1,0
1882,Cincinnati Bengals,Los Angeles Chargers,18.308839,23.556904,0,0,1,13.0,16.0,0,0,0,0,-2.5,41.5,1,1,0,1,1,0
2023,San Francisco 49ers,Arizona Cardinals,30.530232,17.653929,1,1,0,20.0,24.0,1,0,0,0,-6.5,48.5,0,0,1,0,0,1
2169,Denver Broncos,Tennessee Titans,19.666499,22.37156,0,1,1,14.0,16.0,0,0,0,1,-3.0,41.5,1,1,0,1,1,0


In [126]:
# previous_pred_df = pd.read_csv('predictions.csv')

# weekly_df = pd.merge(nfl_df[['schedule_date','schedule_week']], performance_df, left_index=True, right_index=True)
# weekly_df = previous_pred_df.append(weekly_df)
# weekly_df.to_csv('predictions.csv', index=False)