In [1]:
#packages
import pandas as pd
import numpy as np
import datetime
import sklearn
import pickle
import os
from matplotlib import pyplot as plt

# required machine learning packages
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.metrics import brier_score_loss, roc_auc_score, mean_squared_error, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.calibration import CalibratedClassifierCV as CCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingRegressor
import xgboost as xgb

# box plots
import seaborn as sns
# pairplot
from seaborn import pairplot
# Correlation plot
from statsmodels.graphics.correlation import plot_corr

pd.set_option('display.max_columns', None)

In [2]:
# set dates for model training data and model prediction data
training_start_date = '3/1/2003'
training_end_date = '9/23/2020'

prediction_start_date = '9/23/2020'
prediction_end_date = '12/31/2020'

In [3]:
# read weekly score and betting lines data
nfl_df = pd.read_csv('spreadspoke_scores.csv')
# read team lookup data
lookup_df = pd.read_csv('nfl_teams.csv')

In [4]:
# filter on season 2001 onward
nfl_df = nfl_df[nfl_df.schedule_season >= 2001]
# rename old NFL teams to new team names (St. Louis Rams -> LA Rams and San Diego Chargers -> LA Chargers)
nfl_df = nfl_df.replace(['Oakland Raiders','San Diego Chargers','St. Louis Rams'],['Las Vegas Raiders','Los Angeles Chargers','Los Angeles Rams'])
# filter lookup columns
lookup_df = lookup_df[['team_name','team_id']]
# filter out old team names from lookup
lookup_df = lookup_df[lookup_df['team_name'].isin(list(nfl_df['team_home'].unique()))].reset_index(drop=True)
# merge lookup to nfl to grab replace team_favorite_id with team_name
nfl_df = pd.merge(nfl_df,
                  lookup_df,
                  left_on='team_favorite_id',
                  right_on='team_id',
                  how='outer').rename(columns={'team_name':'team_favored'}).drop(columns=['team_favorite_id','team_id'])
nfl_df.loc[nfl_df.team_favored.isnull(), 'team_favored'] = 'PICK'
# replace playoff weeks with numbers
nfl_df = nfl_df.replace(['Wildcard','WildCard','Division','Conference','Superbowl','SuperBowl'],[18,18,19,20,21,21])
# conver data types
nfl_df['over_under_line'] = nfl_df['over_under_line'].astype(float)
nfl_df['schedule_week'] = nfl_df['schedule_week'].astype('int64')
nfl_df['schedule_date'] = pd.to_datetime(nfl_df['schedule_date'])

nfl_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,team_favored
0,2001-09-09,2001,1,False,Baltimore Ravens,17.0,6.0,Chicago Bears,-10.5,33.5,M&T Bank Stadium,False,72.0,6.0,79,,Baltimore Ravens
1,2001-09-23,2001,2,False,Cincinnati Bengals,21.0,10.0,Baltimore Ravens,-7.0,33.5,Paul Brown Stadium,False,66.0,6.0,73,,Baltimore Ravens
2,2001-10-07,2001,4,False,Baltimore Ravens,26.0,7.0,Tennessee Titans,-3.5,33.5,M&T Bank Stadium,False,48.0,14.0,52,,Baltimore Ravens
3,2001-10-21,2001,6,False,Cleveland Browns,24.0,14.0,Baltimore Ravens,-7.5,33.0,FirstEnergy Stadium,False,60.0,15.0,71,,Baltimore Ravens
4,2001-10-28,2001,7,False,Baltimore Ravens,18.0,17.0,Jacksonville Jaguars,-7.5,33.0,M&T Bank Stadium,False,44.0,10.0,45,,Baltimore Ravens


In [5]:
nfl_df.isna().sum()

schedule_date             0
schedule_season           0
schedule_week             0
schedule_playoff          0
team_home                 0
score_home               16
score_away               16
team_away                 0
spread_favorite           0
over_under_line           0
stadium                   0
stadium_neutral           0
weather_temperature     368
weather_wind_mph        368
weather_humidity       2682
weather_detail         3706
team_favored              0
dtype: int64

In [6]:
nfl_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5113 entries, 0 to 5112
Data columns (total 17 columns):
schedule_date          5113 non-null datetime64[ns]
schedule_season        5113 non-null int64
schedule_week          5113 non-null int64
schedule_playoff       5113 non-null bool
team_home              5113 non-null object
score_home             5097 non-null float64
score_away             5097 non-null float64
team_away              5113 non-null object
spread_favorite        5113 non-null float64
over_under_line        5113 non-null float64
stadium                5113 non-null object
stadium_neutral        5113 non-null bool
weather_temperature    4745 non-null float64
weather_wind_mph       4745 non-null float64
weather_humidity       2431 non-null object
weather_detail         1407 non-null object
team_favored           5113 non-null object
dtypes: bool(2), datetime64[ns](1), float64(6), int64(2), object(6)
memory usage: 649.1+ KB


In [7]:
nfl_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
schedule_season,5113.0,2010.10796,5.529166,2001.0,2005.0,2010.0,2015.0,2020.0
schedule_week,5113.0,9.441619,5.299792,1.0,5.0,10.0,14.0,21.0
score_home,5097.0,23.231509,10.341868,0.0,16.0,23.0,30.0,62.0
score_away,5097.0,20.821071,10.055096,0.0,14.0,20.0,27.0,59.0
spread_favorite,5113.0,-5.381674,3.405625,-26.5,-7.0,-4.5,-3.0,0.0
over_under_line,5113.0,43.383337,4.89241,30.0,40.0,43.5,46.5,63.5
weather_temperature,4745.0,60.692518,15.563017,-6.0,50.0,65.0,72.0,97.0
weather_wind_mph,4745.0,6.202107,5.451866,0.0,0.0,6.0,10.0,40.0


In [8]:
# filter columns from dataframe
nfl_df = nfl_df.drop(columns=['stadium','stadium_neutral','weather_temperature','weather_wind_mph','weather_humidity','weather_detail'])

In [9]:
nfl_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,spread_favorite,over_under_line,team_favored
0,2001-09-09,2001,1,False,Baltimore Ravens,17.0,6.0,Chicago Bears,-10.5,33.5,Baltimore Ravens
1,2001-09-23,2001,2,False,Cincinnati Bengals,21.0,10.0,Baltimore Ravens,-7.0,33.5,Baltimore Ravens
2,2001-10-07,2001,4,False,Baltimore Ravens,26.0,7.0,Tennessee Titans,-3.5,33.5,Baltimore Ravens
3,2001-10-21,2001,6,False,Cleveland Browns,24.0,14.0,Baltimore Ravens,-7.5,33.0,Baltimore Ravens
4,2001-10-28,2001,7,False,Baltimore Ravens,18.0,17.0,Jacksonville Jaguars,-7.5,33.0,Baltimore Ravens


In [10]:
# create empty dataframe to append to
appended_df = pd.DataFrame()

for team in list(nfl_df['team_home'].unique()):
    # select single team
    feature_engineer_df = nfl_df.loc[(nfl_df['team_home']==team) | (nfl_df['team_away']==team)].copy()
    feature_engineer_df['team'] = team
    # order data by season, week
    feature_engineer_df = feature_engineer_df.sort_values(['schedule_season','schedule_week'])
    # flag home games for selected team - home = 1, away = 0
    feature_engineer_df['home_or_away'] = np.where(feature_engineer_df['team_home'] == team, 1, 0)
    # create home team favored flag - home team favored = 1, home team underdog = 0
    feature_engineer_df['home_team_favored'] = np.where((feature_engineer_df['team_home']==team) & (feature_engineer_df['team_favored']==team), 1, 0)
    # team's oppponent
    feature_engineer_df['opponent'] = np.where(feature_engineer_df['team_away']==team, feature_engineer_df['team_home'], feature_engineer_df['team_away'])
    # score for selected team
    feature_engineer_df['score'] = np.where(feature_engineer_df['team_away']==team, feature_engineer_df['score_away'], feature_engineer_df['score_home'])
    # opponent's score
    feature_engineer_df['opponent_score'] = np.where(feature_engineer_df['team_away']==team, feature_engineer_df['score_home'], feature_engineer_df['score_away'])
    # home team win flag - home team win = 1, loss/tie = 0
    feature_engineer_df['home_team_win'] = np.where(feature_engineer_df['score_home']>feature_engineer_df['score_away'], 1, 0)
    # flag for the favored team covering the spread - favorite covers = 1, favorite doesn't cover = 0
    feature_engineer_df['home_team_covers'] = np.select([(feature_engineer_df['home_team_favored']==1) &
                                                           (feature_engineer_df['score_away'] - feature_engineer_df['score_home'] < feature_engineer_df['spread_favorite']),
                                                         (feature_engineer_df['home_team_favored']==1) &
                                                           (feature_engineer_df['score_away'] - feature_engineer_df['score_home'] > feature_engineer_df['spread_favorite']),
                                                         (feature_engineer_df['home_team_favored']==0) &
                                                           (feature_engineer_df['score_home'] - feature_engineer_df['score_away'] < feature_engineer_df['spread_favorite']),
                                                         (feature_engineer_df['home_team_favored']==0) &
                                                           (feature_engineer_df['score_home'] - feature_engineer_df['score_away'] > feature_engineer_df['spread_favorite']),
                                                         (feature_engineer_df['team_favored']=='PICK') & (feature_engineer_df['score_home']>feature_engineer_df['score_away']),
                                                         (feature_engineer_df['team_favored']=='PICK') & (feature_engineer_df['score_home']<feature_engineer_df['score_away']),
                                                         (feature_engineer_df['team_favored']=='PICK') & (feature_engineer_df['score_home']==feature_engineer_df['score_away'])],
                                                        [1,0,0,1,1,0,0])
                                                         
    # over/under flag - over = 1, under/push = 0
    feature_engineer_df['over_under'] = np.where(feature_engineer_df['score_home'] + feature_engineer_df['score_away'] > feature_engineer_df['over_under_line'], 1, 0)
    # create tempoary columns for computing win percentage, average score, and average allowed points
    feature_engineer_df['team_cum_sum_score'] = feature_engineer_df.groupby('schedule_season')['score'].cumsum()
    feature_engineer_df['team_cum_sum_pts_allowed'] = feature_engineer_df.groupby('schedule_season')['opponent_score'].cumsum()
    feature_engineer_df['team_game_count'] = feature_engineer_df.groupby('schedule_season')['team'].cumcount()+1
    # calculate average score. week's 1-3 are replaced with rolling 6 game score average
    feature_engineer_df['team_score_avg'] = np.where(feature_engineer_df['schedule_week'] <= 3,
                                                     feature_engineer_df.groupby('team')['score'].transform(lambda x: x.shift().rolling(window=6).mean()).round(),
                                                     (feature_engineer_df['team_cum_sum_score'] / feature_engineer_df['team_game_count']).round().shift(1)).astype('int64')
    # calculate average points allowed. week's 1-3 are replaced with rolling 6 game points allowed average
    feature_engineer_df['team_pts_allowed_avg'] = np.where(feature_engineer_df['schedule_week'] <= 3,
                                                           feature_engineer_df.groupby('team')['opponent_score'].transform(lambda x: x.shift().rolling(window=6).mean()).round(),
                                                           (feature_engineer_df['team_cum_sum_pts_allowed'] / feature_engineer_df['team_game_count']).round().shift(1)).astype('int64')
    # calculate team's win percentage
    feature_engineer_df['team_straight_up_win'] = np.where(feature_engineer_df['score'] > feature_engineer_df['opponent_score'], 1, 0)
    feature_engineer_df['team_win_pct_placeholder'] = (feature_engineer_df.groupby('schedule_season')['team_straight_up_win'].cumsum() / feature_engineer_df['team_game_count']).round(5)
    # replace week's 1-3 with last season's win percentage averaged in
    feature_engineer_df['team_win_pct'] = np.select([feature_engineer_df['schedule_week'] == 1,
                                                     feature_engineer_df['schedule_week'] == 2,
                                                     feature_engineer_df['schedule_week'] == 3,
                                                     feature_engineer_df['schedule_week'] >= 4],
                                                    [feature_engineer_df['team_win_pct_placeholder'].shift(),
                                                     feature_engineer_df.groupby('team')['team_win_pct_placeholder'].transform(lambda x: x.shift().rolling(window=2).mean()).round(5),
                                                     feature_engineer_df.groupby('team')['team_win_pct_placeholder'].transform(lambda x: x.shift().rolling(window=3).mean()).round(5),
                                                     feature_engineer_df['team_win_pct_placeholder']])
    # selected team against the spread
    feature_engineer_df['team_ats_win'] = np.where(((feature_engineer_df['team_favored']==team) &
                                                    (feature_engineer_df['opponent_score'] - feature_engineer_df['score'] < feature_engineer_df['spread_favorite'])) |
                                                   ((feature_engineer_df['team_favored']!=team) &
                                                    (feature_engineer_df['score'] - feature_engineer_df['opponent_score'] > feature_engineer_df['spread_favorite']) |
                                                   ((feature_engineer_df['team_favored']=='PICK') &
                                                    (feature_engineer_df['score']) > feature_engineer_df['opponent_score'])),
                                                  1,0)
    
    # calculate team's ATS percentage
    # df = df.rolling(4).apply(lambda x: (weights*x).sum())
    feature_engineer_df['team_ats_pct_placeholder'] = (feature_engineer_df.groupby('schedule_season')['team_ats_win'].cumsum() / feature_engineer_df['team_game_count']).round(5)
    feature_engineer_df['team_ats_pct'] = np.select([feature_engineer_df['schedule_week'] == 1,
                                                     feature_engineer_df['schedule_week'] == 2,
                                                     feature_engineer_df['schedule_week'] == 3,
                                                     feature_engineer_df['schedule_week'] >= 4],
                                                    [feature_engineer_df['team_ats_pct_placeholder'].shift(),
                                                     feature_engineer_df.groupby('team')['team_ats_pct_placeholder'].transform(lambda x: x.shift().rolling(window=2).mean()).round(5),
                                                     feature_engineer_df.groupby('team')['team_ats_pct_placeholder'].transform(lambda x: x.shift().rolling(window=3).mean()).round(5),
                                                     feature_engineer_df['team_ats_pct_placeholder']])
    # selected team over pct
    feature_engineer_df['team_over_pct_placeholder'] = (feature_engineer_df.groupby('schedule_season')['over_under'].cumsum() / feature_engineer_df['team_game_count']).round(5)
    feature_engineer_df['team_over_pct'] = np.select([feature_engineer_df['schedule_week'] == 1,
                                                      feature_engineer_df['schedule_week'] == 2,
                                                      feature_engineer_df['schedule_week'] == 3,
                                                      feature_engineer_df['schedule_week'] >= 4],
                                                     [feature_engineer_df['team_over_pct_placeholder'].shift(),
                                                      feature_engineer_df.groupby('team')['team_over_pct_placeholder'].transform(lambda x: x.shift().rolling(window=2).mean()).round(5),
                                                      feature_engineer_df.groupby('team')['team_over_pct_placeholder'].transform(lambda x: x.shift().rolling(window=3).mean()).round(5),
                                                      feature_engineer_df['team_over_pct_placeholder']])
    # selected team average +/- against spread - more positive is better
    feature_engineer_df['team_score_ats'] = np.select([(feature_engineer_df['team_favored']==team) & (feature_engineer_df['team_ats_win']==1),
                                                       (feature_engineer_df['team_favored']==team) & (feature_engineer_df['team_straight_up_win']==0),
                                                       (feature_engineer_df['team_favored']==team) & (feature_engineer_df['team_ats_win']==0),
                                                       (feature_engineer_df['team_favored']!=team) & (feature_engineer_df['team_straight_up_win']==1),
                                                       (feature_engineer_df['team_favored']!=team) & (feature_engineer_df['team_ats_win']==1),
                                                       (feature_engineer_df['team_favored']!=team) & (feature_engineer_df['team_ats_win']==0)],
                                                     [feature_engineer_df['spread_favorite'] + feature_engineer_df['score'] - feature_engineer_df['opponent_score'],
                                                      feature_engineer_df['spread_favorite'] - (feature_engineer_df['opponent_score'] - feature_engineer_df['score']),
                                                      feature_engineer_df['spread_favorite'] + feature_engineer_df['score'] - feature_engineer_df['opponent_score'],
                                                      abs(feature_engineer_df['spread_favorite']) + feature_engineer_df['score'] - feature_engineer_df['opponent_score'],
                                                      abs(feature_engineer_df['spread_favorite']) - feature_engineer_df['opponent_score'] + feature_engineer_df['score'],
                                                      abs(feature_engineer_df['spread_favorite']) - feature_engineer_df['opponent_score'] + feature_engineer_df['score']])
    feature_engineer_df['team_cum_sum_score_ats'] = feature_engineer_df.groupby('schedule_season')['team_score_ats'].cumsum()
    feature_engineer_df['team_avg_score_ats'] = np.where(feature_engineer_df['schedule_week'] <= 3,
                                                     feature_engineer_df.groupby('team')['team_score_ats'].transform(lambda x: x.shift().rolling(window=6).mean()).round(),
                                                     (feature_engineer_df['team_cum_sum_score_ats'] / feature_engineer_df['team_game_count']).round().shift(1)).astype('int64')
    # drop unneeded columns
    feature_engineer_df = feature_engineer_df.drop(columns=['team_straight_up_win','team_cum_sum_score','team_cum_sum_pts_allowed','team_game_count','team_win_pct_placeholder',
                                                           'team_ats_pct_placeholder','team_ats_win','team_over_pct_placeholder','team_score_ats','team_cum_sum_score_ats'])
    # append each team's dataframe
    appended_df = appended_df.append(feature_engineer_df)

appended_df = appended_df.reset_index(drop=True)
appended_df = appended_df.drop(columns=['opponent','score','opponent_score'])
appended_df = appended_df.loc[:, ~appended_df.columns.isin(nfl_df.columns.to_list()[1:])]

# combine engineered features to original data
home_df = pd.merge(nfl_df[['schedule_date','team_home','score_home','score_away','team_away','spread_favorite','over_under_line']],
                    appended_df.loc[appended_df.home_or_away==1],
                    left_on=['schedule_date','team_home'], right_on=['schedule_date','team'],
                    how='outer').rename(columns={'team_score_avg':'home_team_score_avg',
                                                 'team_pts_allowed_avg':'home_team_pts_allowed_avg',
                                                 'team_win_pct':'home_team_win_pct',
                                                 'team_ats_pct':'home_team_ats_pct',
                                                 'team_over_pct':'home_team_over_pct',
                                                 'team_avg_score_ats':'home_team_avg_score_ats'}).drop(columns='team')
final_df = pd.merge(home_df,
                    appended_df.loc[appended_df.home_or_away==0].filter(items=['schedule_date','team','team_score_avg','team_pts_allowed_avg','team_win_pct','team_ats_pct','team_over_pct','team_avg_score_ats']),
                    left_on=['schedule_date','team_away'], right_on=['schedule_date','team'],
                    how='outer').rename(columns={'team_score_avg':'away_team_score_avg',
                                                 'team_pts_allowed_avg':'away_team_pts_allowed_avg',
                                                 'team_win_pct':'away_team_win_pct',
                                                 'team_ats_pct':'away_team_ats_pct',
                                                 'team_over_pct':'away_team_over_pct',
                                                 'team_avg_score_ats':'away_team_avg_score_ats'}).drop(columns=['team','home_or_away'])

print(final_df.shape)
final_df.head()

(5113, 23)


Unnamed: 0,schedule_date,team_home,score_home,score_away,team_away,spread_favorite,over_under_line,home_team_favored,home_team_win,home_team_covers,over_under,home_team_score_avg,home_team_pts_allowed_avg,home_team_win_pct,home_team_ats_pct,home_team_over_pct,home_team_avg_score_ats,away_team_score_avg,away_team_pts_allowed_avg,away_team_win_pct,away_team_ats_pct,away_team_over_pct,away_team_avg_score_ats
0,2001-09-09,Baltimore Ravens,17.0,6.0,Chicago Bears,-10.5,33.5,1,1,1,0,-9223372036854775808,-9223372036854775808,,,,-9223372036854775808,-9223372036854775808,-9223372036854775808,,,,-9223372036854775808
1,2001-09-23,Cincinnati Bengals,21.0,10.0,Baltimore Ravens,-7.0,33.5,0,1,1,0,-9223372036854775808,-9223372036854775808,,,,-9223372036854775808,-9223372036854775808,-9223372036854775808,,,,-9223372036854775808
2,2001-10-07,Baltimore Ravens,26.0,7.0,Tennessee Titans,-3.5,33.5,1,1,1,0,16,13,0.75,0.75,0.0,-2,14,22,0.0,0.0,0.33333,-10
3,2001-10-21,Cleveland Browns,24.0,14.0,Baltimore Ravens,-7.5,33.0,0,1,1,1,17,15,0.66667,0.83333,0.83333,6,19,16,0.5,0.5,0.33333,0
4,2001-10-28,Baltimore Ravens,18.0,17.0,Jacksonville Jaguars,-7.5,33.0,1,1,0,1,18,17,0.57143,0.42857,0.42857,-3,15,14,0.33333,0.5,0.5,-4


In [11]:
# create data to train the model on
data_df = final_df.loc[(final_df.schedule_date>=training_start_date) & (final_df.schedule_date<=training_end_date)].drop(columns='schedule_date')
data_df = data_df[['team_home','team_away','score_home','score_away','home_team_win','home_team_covers','over_under','spread_favorite','over_under_line','home_team_favored',
                   'home_team_score_avg','home_team_pts_allowed_avg','home_team_win_pct','home_team_ats_pct','home_team_over_pct','home_team_avg_score_ats','away_team_score_avg','away_team_pts_allowed_avg',
                   'away_team_win_pct','away_team_ats_pct','away_team_over_pct','away_team_avg_score_ats']]
data_df.head()

Unnamed: 0,team_home,team_away,score_home,score_away,home_team_win,home_team_covers,over_under,spread_favorite,over_under_line,home_team_favored,home_team_score_avg,home_team_pts_allowed_avg,home_team_win_pct,home_team_ats_pct,home_team_over_pct,home_team_avg_score_ats,away_team_score_avg,away_team_pts_allowed_avg,away_team_win_pct,away_team_ats_pct,away_team_over_pct,away_team_avg_score_ats
16,Baltimore Ravens,Cleveland Browns,33.0,13.0,1,1,1,-2.5,39.0,1,22,27,0.21875,0.3125,0.75,-4,20,20,0.2647,0.35294,0.23529,3
17,Arizona Cardinals,Baltimore Ravens,18.0,26.0,0,0,1,-6.5,37.0,0,13,31,0.16667,0.16667,0.5,-11,20,18,0.6,0.6,0.6,4
18,Cincinnati Bengals,Baltimore Ravens,34.0,26.0,1,1,1,-1.0,36.0,0,15,21,0.33333,0.66667,0.16667,1,22,18,0.5,0.5,0.66667,3
19,Baltimore Ravens,Denver Broncos,26.0,6.0,1,1,0,-2.5,36.5,1,22,21,0.57143,0.57143,0.57143,1,25,16,0.625,0.5,0.375,5
20,Baltimore Ravens,Jacksonville Jaguars,24.0,17.0,1,0,1,-7.0,37.5,1,23,19,0.625,0.5,0.625,3,18,26,0.125,0.25,0.75,-6


In [12]:
# create data to predict
pred_df = final_df.loc[(final_df.schedule_date>=prediction_start_date) & (final_df.schedule_date<=prediction_end_date)]
pred_df = pred_df[['team_home','team_away','spread_favorite','over_under_line','home_team_favored','home_team_score_avg','home_team_pts_allowed_avg','home_team_win_pct',
            'home_team_ats_pct','home_team_over_pct','home_team_avg_score_ats','away_team_score_avg','away_team_pts_allowed_avg','away_team_win_pct','away_team_ats_pct','away_team_over_pct','away_team_avg_score_ats']]
pred_df

Unnamed: 0,team_home,team_away,spread_favorite,over_under_line,home_team_favored,home_team_score_avg,home_team_pts_allowed_avg,home_team_win_pct,home_team_ats_pct,home_team_over_pct,home_team_avg_score_ats,away_team_score_avg,away_team_pts_allowed_avg,away_team_win_pct,away_team_ats_pct,away_team_over_pct,away_team_avg_score_ats
196,Baltimore Ravens,Kansas City Chiefs,-3.5,53.5,1,31,16,0.94118,0.86275,0.17647,6,34,23,0.92982,0.72807,0.17544,4
398,New Orleans Saints,Green Bay Packers,-3.0,52.5,1,32,22,0.7549,0.71569,0.84314,3,30,24,0.92593,0.87037,0.81481,4
631,Seattle Seahawks,Dallas Cowboys,-5.5,56.5,1,24,24,0.88889,0.81481,0.83333,0,30,24,0.33333,0.1875,0.375,3
761,Denver Broncos,Tampa Bay Buccaneers,-6.5,43.5,0,20,20,0.14583,0.85417,0.3125,2,29,26,0.3125,0.27083,0.91667,2
1091,Jacksonville Jaguars,Miami Dolphins,-3.0,48.5,1,23,26,0.625,0.8125,0.83333,2,24,28,0.10417,0.35417,0.35417,3
1563,Indianapolis Colts,New York Jets,-11.0,43.5,1,25,26,0.3125,0.3125,0.6875,-3,17,23,0.14583,0.14583,0.8125,-1
1888,Los Angeles Chargers,Carolina Panthers,-6.5,43.5,1,22,23,0.60417,0.75,0.14583,-1,18,36,0.10417,0.125,0.89583,-11
2031,New York Giants,San Francisco 49ers,-4.0,41.5,0,23,26,0.08333,0.3125,0.20833,1,27,20,0.42982,0.35965,0.32456,2
2178,Minnesota Vikings,Tennessee Titans,-2.5,48.5,0,18,27,0.2037,0.18519,0.66667,-6,26,20,0.85965,0.19298,0.35965,7
2546,Atlanta Falcons,Chicago Bears,-3.0,47.5,1,31,26,0.14583,0.33333,0.8125,6,19,21,0.83333,0.58333,0.625,-1


# Develop Models

## Linear Regression to Predict Scores

In [13]:
X = data_df.iloc[:,7:]
y = data_df['score_home']

# Fit linear regression
home_score_model = LinearRegression().fit(X, y)

In [14]:
# get model coefficients
pd.concat([pd.DataFrame(data_df.iloc[:,7:].columns, columns=['featues']),pd.DataFrame(np.transpose(home_score_model.coef_), columns=['coeff'])], axis = 1).sort_values(by='coeff', ascending=False)

Unnamed: 0,featues,coeff
7,home_team_over_pct,10.892924
13,away_team_over_pct,9.738771
5,home_team_win_pct,7.777172
6,home_team_ats_pct,7.248196
2,home_team_favored,1.970504
1,over_under_line,0.50768
14,away_team_avg_score_ats,0.320837
0,spread_favorite,-0.042775
3,home_team_score_avg,-0.069839
10,away_team_pts_allowed_avg,-0.07919


In [15]:
y = data_df['score_away']

# Fit linear regression
away_score_model = LinearRegression().fit(X, y)

In [16]:
# get model coefficients
pd.concat([pd.DataFrame(data_df.iloc[:,7:].columns, columns=['featues']),pd.DataFrame(np.transpose(away_score_model.coef_), columns=['coeff'])], axis = 1).sort_values(by='coeff', ascending=False)

Unnamed: 0,featues,coeff
7,home_team_over_pct,9.427817
13,away_team_over_pct,9.411858
12,away_team_ats_pct,8.630354
11,away_team_win_pct,3.934482
1,over_under_line,0.544273
8,home_team_avg_score_ats,0.275962
0,spread_favorite,0.081069
9,away_team_score_avg,0.054305
4,home_team_pts_allowed_avg,-0.138366
3,home_team_score_avg,-0.276669


## Logistic Regression

In [17]:
y = data_df['home_team_win']

scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# Fit linear regression
home_team_win_model = LogisticRegression().fit(X, y)

In [18]:
# get model coefficients
pd.concat([pd.DataFrame(data_df.iloc[:,7:].columns, columns=['featues']),pd.DataFrame(np.transpose(home_team_win_model.coef_), columns=['coeff'])], axis = 1).sort_values(by='coeff', ascending=False)

Unnamed: 0,featues,coeff
5,home_team_win_pct,1.187883
14,away_team_avg_score_ats,0.390274
6,home_team_ats_pct,0.251358
4,home_team_pts_allowed_avg,0.21701
2,home_team_favored,0.146904
9,away_team_score_avg,0.11299
7,home_team_over_pct,0.086337
1,over_under_line,0.00967
13,away_team_over_pct,-0.029381
0,spread_favorite,-0.078045


In [19]:
y = data_df['home_team_covers']

# Fit linear regression
home_team_covers_model = LogisticRegression().fit(X, y)

In [20]:
# get model coefficients
pd.concat([pd.DataFrame(data_df.iloc[:,7:].columns, columns=['featues']),pd.DataFrame(np.transpose(home_team_covers_model.coef_), columns=['coeff'])], axis = 1).sort_values(by='coeff', ascending=False)

Unnamed: 0,featues,coeff
6,home_team_ats_pct,0.858299
14,away_team_avg_score_ats,0.581097
5,home_team_win_pct,0.324477
0,spread_favorite,0.055099
4,home_team_pts_allowed_avg,0.035092
7,home_team_over_pct,0.033237
13,away_team_over_pct,0.024433
9,away_team_score_avg,0.023907
10,away_team_pts_allowed_avg,0.007412
3,home_team_score_avg,-0.033222


In [21]:
y = data_df['over_under']

# Fit linear regression
over_under_model = LogisticRegression().fit(X, y)

In [22]:
# get model coefficients
pd.concat([pd.DataFrame(data_df.iloc[:,7:].columns, columns=['featues']),pd.DataFrame(np.transpose(over_under_model.coef_), columns=['coeff'])], axis = 1).sort_values(by='coeff', ascending=False)

Unnamed: 0,featues,coeff
7,home_team_over_pct,0.785746
13,away_team_over_pct,0.759918
1,over_under_line,0.104287
0,spread_favorite,0.040252
5,home_team_win_pct,0.031811
6,home_team_ats_pct,0.015873
2,home_team_favored,-0.000235
12,away_team_ats_pct,-0.006414
8,home_team_avg_score_ats,-0.00842
14,away_team_avg_score_ats,-0.083288


# Predictions

In [23]:
# scale features for logistic models
scaler = StandardScaler().fit(pred_df.iloc[:,2:])
scaled_pred_df = scaler.transform(pred_df.iloc[:,2:])
# produce predictions
pred_df['pred_score_home'] = home_score_model.predict(pred_df.iloc[:,2:])
pred_df['pred_score_away'] = away_score_model.predict(pred_df.iloc[:,2:-1])
pred_df['pred_home_team_win'] = home_team_win_model.predict(scaled_pred_df)
pred_df['pred_home_team_covers'] = home_team_covers_model.predict(scaled_pred_df)
pred_df['pred_over_under'] = over_under_model.predict(scaled_pred_df)
# filter dataframe and merge with final_df to get line info
pred_df = pred_df[['team_home','team_away','pred_score_home','pred_score_away','pred_home_team_win','pred_home_team_covers','pred_over_under']]
pred_df = pd.merge(pred_df, final_df[['home_team_favored','spread_favorite','over_under_line']], how='inner', left_index=True, right_index=True)
# create flags to identify if the score models think the home team wins - wins = 1, loses = 0
pred_df['score_home_team_win'] = np.where(pred_df['pred_score_home'] > pred_df['pred_score_away'], 1, 0)
# create flag to identify if the score models think the home team covers - covers = 1, doesn't cover = 0
pred_df['score_home_team_covers'] = np.select([(pred_df['home_team_favored']==1) &
                                               (pred_df['pred_score_away'] - pred_df['pred_score_home'] < pred_df['spread_favorite']),
                                             (pred_df['home_team_favored']==1) &
                                               (pred_df['pred_score_away'] - pred_df['pred_score_home'] > pred_df['spread_favorite']),
                                             (pred_df['home_team_favored']==0) &
                                               (pred_df['pred_score_home'] - pred_df['pred_score_away'] < pred_df['spread_favorite']),
                                             (pred_df['home_team_favored']==0) &
                                               (pred_df['pred_score_home'] - pred_df['pred_score_away'] > pred_df['spread_favorite']),
                                             (pred_df['spread_favorite']==0) & (pred_df['pred_score_home']>pred_df['pred_score_away']),
                                             (pred_df['spread_favorite']==0) & (pred_df['pred_score_home']<pred_df['pred_score_away']),
                                             (pred_df['spread_favorite']==0) & (pred_df['pred_score_home']==pred_df['pred_score_away'])],
                                            [1,0,0,1,1,0,0])
# create flag to identify if the score models think the over hits - over = 1, under = 0
pred_df['score_over_under'] = np.where(pred_df['pred_score_home'] + pred_df['pred_score_away'] > pred_df['over_under_line'], 1, 0)
# create temporary flags to identify if the logistic models agree with the score models
pred_df['model_win_agree_flag'] = np.where(pred_df['pred_home_team_win'] == pred_df['score_home_team_win'], 1, 0)
pred_df['model_home_team_cover_flag'] = np.where(pred_df['pred_home_team_covers'] == pred_df['score_home_team_covers'], 1, 0)
pred_df['model_over_under_flag'] = np.where(pred_df['pred_over_under'] == pred_df['score_over_under'], 1, 0)
# create flag to identify if both the homte team win and home team covers logistic models agree with the score models - agrees = 1, disagrees = 0
pred_df['model_win_cover_agree_flag'] = np.where(pred_df['model_win_agree_flag'] == pred_df['model_home_team_cover_flag'], 1, 0)
# create percentage of logistic models that agree with the score models
pred_df['model_agree_pct'] = (pred_df['model_win_agree_flag'] + pred_df['model_home_team_cover_flag'] + pred_df['model_over_under_flag']) / 3
# filter dataframe
pred_df = pred_df[['team_away','team_home','pred_score_away','pred_score_home','pred_home_team_win','score_home_team_win','pred_home_team_covers','score_home_team_covers',
                   'pred_over_under','score_over_under','home_team_favored','spread_favorite','over_under_line','model_win_cover_agree_flag','model_agree_pct']]
pred_df

Unnamed: 0,team_away,team_home,pred_score_away,pred_score_home,pred_home_team_win,score_home_team_win,pred_home_team_covers,score_home_team_covers,pred_over_under,score_over_under,home_team_favored,spread_favorite,over_under_line,model_win_cover_agree_flag,model_agree_pct
196,Kansas City Chiefs,Baltimore Ravens,16.654433,21.779882,0,1,0,1,0,0,1,-3.5,53.5,1,0.333333
398,Green Bay Packers,New Orleans Saints,29.600706,31.118514,0,1,0,0,1,1,1,-3.0,52.5,0,0.666667
631,Dallas Cowboys,Seattle Seahawks,18.85244,40.281875,1,1,1,1,1,1,1,-5.5,56.5,1,1.0
761,Tampa Bay Buccaneers,Denver Broncos,21.237576,25.889428,0,1,1,1,1,1,0,-6.5,43.5,0,0.666667
1091,Miami Dolphins,Jacksonville Jaguars,15.810151,33.969105,1,1,1,1,1,1,1,-3.0,48.5,1,1.0
1563,New York Jets,Indianapolis Colts,20.699119,31.993199,1,1,0,1,1,1,1,-11.0,43.5,0,0.666667
1888,Carolina Panthers,Los Angeles Chargers,11.505994,28.564983,1,1,1,1,0,0,1,-6.5,43.5,1,1.0
2031,San Francisco 49ers,New York Giants,19.509561,11.374708,0,0,0,0,0,0,0,-4.0,41.5,1,1.0
2178,Tennessee Titans,Minnesota Vikings,25.725552,22.753099,0,0,1,0,1,0,0,-2.5,48.5,0,0.333333
2546,Chicago Bears,Atlanta Falcons,32.300352,20.955205,0,0,0,0,1,1,1,-3.0,47.5,1,1.0


In [169]:
# get probabilites from the logistic models and merge with line info and predicted scores
probs_df = pd.DataFrame(home_team_win_model.predict_proba(scaled_pred_df)[:,1], columns=['home_team_win_probs'], index=pred_df.index)
probs_df = pd.merge(probs_df, pd.DataFrame(home_team_covers_model.predict_proba(scaled_pred_df)[:,1], columns=['home_team_covers_probs'], index=pred_df.index), left_index=True, right_index=True)
probs_df = pd.merge(probs_df, pd.DataFrame(over_under_model.predict_proba(scaled_pred_df)[:,1], columns=['over_under_probs'], index=pred_df.index), left_index=True, right_index=True)
probs_df = pd.merge(pred_df[['team_away','team_home','pred_score_away','pred_score_home','home_team_favored','spread_favorite','over_under_line']], probs_df, left_index=True, right_index=True)
probs_df

Unnamed: 0,team_away,team_home,pred_score_away,pred_score_home,home_team_favored,spread_favorite,over_under_line,home_team_win_probs,home_team_covers_probs,over_under_probs
196,Kansas City Chiefs,Baltimore Ravens,16.654433,21.779882,1,-3.5,53.5,0.482741,0.397159,0.099419
398,Green Bay Packers,New Orleans Saints,29.600706,31.118514,1,-3.0,52.5,0.332601,0.291689,0.691628
631,Dallas Cowboys,Seattle Seahawks,18.85244,40.281875,1,-5.5,56.5,0.969806,0.879395,0.614319
761,Tampa Bay Buccaneers,Denver Broncos,21.237576,25.889428,0,-6.5,43.5,0.450884,0.782097,0.686739
1091,Miami Dolphins,Jacksonville Jaguars,15.810151,33.969105,1,-3.0,48.5,0.935733,0.768269,0.515963
1563,New York Jets,Indianapolis Colts,20.699119,31.993199,1,-11.0,43.5,0.733523,0.456345,0.793844
1888,Carolina Panthers,Los Angeles Chargers,11.505994,28.564983,1,-6.5,43.5,0.747799,0.517308,0.397519
2031,San Francisco 49ers,New York Giants,19.509561,11.374708,0,-4.0,41.5,0.234069,0.346093,0.188853
2178,Tennessee Titans,Minnesota Vikings,25.725552,22.753099,0,-2.5,48.5,0.336543,0.741979,0.581939
2546,Chicago Bears,Atlanta Falcons,32.300352,20.955205,1,-3.0,47.5,0.037977,0.050412,0.65224


In [180]:
# highest probability for spoilers
spoiler_threshold = .7
probs_df.loc[((probs_df.home_team_favored==1) & (probs_df.home_team_win_probs<=1-spoiler_threshold)) | ((probs_df.home_team_favored==0) & (probs_df.home_team_win_probs>=spoiler_threshold))]

Unnamed: 0,team_away,team_home,pred_score_away,pred_score_home,home_team_favored,spread_favorite,over_under_line,home_team_win_probs,home_team_covers_probs,over_under_probs
2546,Chicago Bears,Atlanta Falcons,32.300352,20.955205,1,-3.0,47.5,0.037977,0.050412,0.65224


In [187]:
# highest probabilites of home team covering/not covering
cover_threshold = .75
probs_df.loc[(probs_df.home_team_covers_probs>=cover_threshold) | (probs_df.home_team_covers_probs<=1-cover_threshold)].sort_values(by='home_team_covers_probs')

Unnamed: 0,team_away,team_home,pred_score_away,pred_score_home,home_team_favored,spread_favorite,over_under_line,home_team_win_probs,home_team_covers_probs,over_under_probs
2546,Chicago Bears,Atlanta Falcons,32.300352,20.955205,1,-3.0,47.5,0.037977,0.050412,0.65224
4778,Washington Redskins,Cleveland Browns,19.422945,17.418044,1,-7.0,44.5,0.328956,0.192002,0.164074
3286,Cincinnati Bengals,Philadelphia Eagles,22.167784,26.116325,1,-4.5,46.5,0.573917,0.238147,0.567857
1091,Miami Dolphins,Jacksonville Jaguars,15.810151,33.969105,1,-3.0,48.5,0.935733,0.768269,0.515963
761,Tampa Bay Buccaneers,Denver Broncos,21.237576,25.889428,0,-6.5,43.5,0.450884,0.782097,0.686739
631,Dallas Cowboys,Seattle Seahawks,18.85244,40.281875,1,-5.5,56.5,0.969806,0.879395,0.614319


In [184]:
# highest probabilities of over/under hitting
over_under_threshold = 0.7
probs_df.loc[(probs_df.over_under_probs>=over_under_threshold) | (probs_df.over_under_probs<=1-over_under_threshold)].sort_values(by='over_under_probs')

Unnamed: 0,team_away,team_home,pred_score_away,pred_score_home,home_team_favored,spread_favorite,over_under_line,home_team_win_probs,home_team_covers_probs,over_under_probs
196,Kansas City Chiefs,Baltimore Ravens,16.654433,21.779882,1,-3.5,53.5,0.482741,0.397159,0.099419
4778,Washington Redskins,Cleveland Browns,19.422945,17.418044,1,-7.0,44.5,0.328956,0.192002,0.164074
2031,San Francisco 49ers,New York Giants,19.509561,11.374708,0,-4.0,41.5,0.234069,0.346093,0.188853
3527,Houston Texans,Pittsburgh Steelers,6.168038,26.82478,1,-4.0,44.5,0.9213,0.746651,0.234739
1563,New York Jets,Indianapolis Colts,20.699119,31.993199,1,-11.0,43.5,0.733523,0.456345,0.793844


# Prediction Performance (only use for games already played with actual score inputs)

In [170]:
# join predictions back to actual data
performance_df = pd.merge(pred_df, final_df[['score_home','score_away','home_team_win','over_under','home_team_covers']], how='inner', left_index=True, right_index=True).dropna()
# flag if model predicts dog to straight up win
performance_df['pred_spoiler_alert_flag'] = np.where(((performance_df['home_team_favored'] == 1) & ((performance_df['pred_score_home'] < performance_df['pred_score_away']) |
                                                                                                   (performance_df['pred_home_team_win']==0))) |
                                                      ((performance_df['home_team_favored'] == 0) & ((performance_df['pred_score_home'] > performance_df['pred_score_away']) |
                                                                                                    (performance_df['pred_home_team_win']==1))),
                                                     1,0)
# flag if dog wins straight up - dog wins = 1, dog loses = 1
performance_df['dog_straight_up_win_flag'] = np.where(((performance_df['home_team_favored'] == 1) & (performance_df['score_home'] < performance_df['score_away'])) |
                                                      ((performance_df['home_team_favored'] == 0) & (performance_df['score_home'] > performance_df['score_away'])),
                                                     1,0)
# flag if score models predicted the correct team to cover - correct = 1, incorrect = 0
performance_df['team_cover_correct'] = np.where(((performance_df['home_team_favored']==1) & (performance_df['home_team_covers']==1) &
                                                 (performance_df['pred_score_home'] > performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_away'] - performance_df['pred_score_home'] < performance_df['spread_favorite'])) |
                                                ((performance_df['home_team_favored']==1) & (performance_df['home_team_covers']==0) &
                                                ((performance_df['pred_score_home'] < performance_df['pred_score_away'])) |
                                                 (((performance_df['pred_score_home'] > performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_away'] - performance_df['pred_score_home'] > performance_df['spread_favorite'])))) |
                                                ((performance_df['home_team_favored']==0) & (performance_df['home_team_covers']==0) &
                                                 (performance_df['pred_score_home'] < performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_home'] - performance_df['pred_score_away'] < performance_df['spread_favorite'])) |
                                                ((performance_df['home_team_favored']==0) & (performance_df['home_team_covers']==1) &
                                                 ((performance_df['pred_score_home'] > performance_df['pred_score_away']) |
                                                 ((performance_df['pred_score_home'] < performance_df['pred_score_away']) &
                                                 (performance_df['pred_score_home'] - performance_df['pred_score_away'] > performance_df['spread_favorite'])))) |
                                                ((performance_df['spread_favorite']==0) &
                                                 ((performance_df['pred_score_home'] > performance_df['pred_score_away']) & (performance_df['score_home'] > performance_df['score_away']) |
                                                 (performance_df['pred_score_home'] < performance_df['pred_score_away']) & (performance_df['score_home'] < performance_df['score_away']))),
                                                1,0)
# flag if score models accurately predicted the straight up winner - correct = 1, incorrect = 0
performance_df['team_win_correct'] = np.where(((performance_df['pred_score_home'] > performance_df['pred_score_away']) & (performance_df['score_home'] > performance_df['score_away'])) |
                                              ((performance_df['pred_score_home'] < performance_df['pred_score_away']) & (performance_df['score_home'] < performance_df['score_away'])),
                                              1,0)
# flag if score models accurately predicted the over - correct = 1, incorrect = 0
performance_df['over_under_correct'] = np.where(((performance_df['pred_score_home'] + performance_df['pred_score_away'] > performance_df['over_under_line']) &
                                                (performance_df['score_home'] + performance_df['score_away'] > performance_df['over_under_line'])) |
                                               ((performance_df['pred_score_home'] + performance_df['pred_score_away'] < performance_df['over_under_line']) &
                                                (performance_df['score_home'] + performance_df['score_away'] < performance_df['over_under_line'])),
                                               1,0)
# flag logistic models accurate predictions - correct = 1, incorrect = 0
performance_df['log_team_covers_correct'] = np.where(performance_df['pred_home_team_covers']==performance_df['home_team_covers'],1,0)
performance_df['log_team_win_correct'] = np.where(performance_df['pred_home_team_win']==performance_df['home_team_win'],1,0)
performance_df['log_over_under_correct'] = np.where(performance_df['pred_over_under']==performance_df['over_under'],1,0)

print('team_cover_pct:',(performance_df['team_cover_correct'].sum() / len(performance_df)).round(4)*100,
      'team_win_pct:',(performance_df['team_win_correct'].sum() / len(performance_df)).round(4)*100,
      'over_under_pct:',(performance_df['over_under_correct'].sum() / len(performance_df)).round(4)*100,
      'log_team_cover_pct:',(performance_df['log_team_covers_correct'].sum() / len(performance_df)).round(4)*100,
      'log_team_win_pct:',(performance_df['log_team_win_correct'].sum() / len(performance_df)).round(4)*100,
      'log_over_under_pct:',(performance_df['log_over_under_correct'].sum() / len(performance_df)).round(4)*100,
      'spoiler_alert_pct:',(len(performance_df.loc[(performance_df.pred_spoiler_alert_flag==1) & (performance_df.dog_straight_up_win_flag==1)]) / performance_df['dog_straight_up_win_flag'].sum()).round(4)*100)

performance_df.head()

team_cover_pct: nan team_win_pct: nan over_under_pct: nan log_team_cover_pct: nan log_team_win_pct: nan log_over_under_pct: nan spoiler_alert_pct: nan




Unnamed: 0,team_away,team_home,pred_score_away,pred_score_home,pred_home_team_win,score_home_team_win,pred_home_team_covers,score_home_team_covers,pred_over_under,score_over_under,home_team_favored,spread_favorite,over_under_line,model_win_cover_agree_flag,model_agree_pct,score_home,score_away,home_team_win,over_under,home_team_covers,pred_spoiler_alert_flag,dog_straight_up_win_flag,team_cover_correct,team_win_correct,over_under_correct,log_team_covers_correct,log_team_win_correct,log_over_under_correct


In [93]:
predictions_df = pd.merge(probs_df, performance_df, on=['team_away','team_home','pred_score_away','pred_score_home','home_team_favored','spread_favorite','over_under_line'])
# predictions_df.to_csv('predictions.csv', index=False)

In [118]:
probs_performance_df = predictions_df[['team_away','team_home','pred_score_away','pred_score_home','score_away','score_home','home_team_favored','spread_favorite','over_under_line',
                                       'home_team_win_probs','home_team_win','home_team_covers_probs','home_team_covers','over_under_probs','over_under','score_home_team_win',
                                       'score_home_team_covers','score_over_under','pred_spoiler_alert_flag']]
probs_performance_df.head()

Unnamed: 0,team_away,team_home,pred_score_away,pred_score_home,score_away,score_home,home_team_favored,spread_favorite,over_under_line,home_team_win_probs,home_team_win,home_team_covers_probs,home_team_covers,over_under_probs,over_under,score_home_team_win,score_home_team_covers,score_over_under,pred_spoiler_alert_flag
0,Cleveland Browns,Baltimore Ravens,17.080797,30.032625,6.0,38.0,1,-7.0,47.5,0.833776,1,0.521414,1,0.525328,0,1,1,0,0
1,Baltimore Ravens,Houston Texans,26.776962,10.472996,33.0,16.0,0,-7.0,49.5,0.148306,0,0.173743,0,0.112177,0,0,0,0,0
2,Tampa Bay Buccaneers,New Orleans Saints,16.43668,31.339353,26.0,34.0,1,-4.0,47.5,0.853031,1,0.752846,1,0.433164,1,1,1,1,0
3,New Orleans Saints,Las Vegas Raiders,26.702997,26.443813,24.0,34.0,0,-5.5,48.5,0.703056,1,0.794452,1,0.677613,1,0,1,1,1
4,Seattle Seahawks,Atlanta Falcons,26.426347,19.958194,38.0,25.0,0,-1.0,49.5,0.121563,0,0.204672,0,0.440003,1,0,0,0,0


In [146]:
cover_threshold = 0.75
over_under_threshold = 0.7

print(f'{cover_threshold*100}% cover win percent:',
      len(probs_performance_df.loc[((probs_performance_df.home_team_covers_probs>=cover_threshold) & (probs_performance_df.home_team_covers==1)) |
                                   ((probs_performance_df.home_team_covers_probs<=1-cover_threshold) & (probs_performance_df.home_team_covers==0))]) / \
      len(probs_performance_df.loc[(probs_performance_df.home_team_covers_probs >= cover_threshold) | (probs_performance_df.home_team_covers_probs <= 1-cover_threshold)]),
      f'\n{over_under_threshold*100}% over/under win percent:',
      len(probs_performance_df.loc[((probs_performance_df.over_under_probs>=over_under_threshold) & (probs_performance_df.over_under==1)) |
                                   ((probs_performance_df.over_under_probs<=1-over_under_threshold) & (probs_performance_df.over_under==0))]) / \
      len(probs_performance_df.loc[(probs_performance_df.over_under_probs>=over_under_threshold) | (probs_performance_df.over_under_probs<=1-over_under_threshold)]))

75.0% cover win percent: 0.6923076923076923 
70.0% over/under win percent: 0.8
