# 2024 NFL Season O/U Alg

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import statsmodels.formula.api as smf
from patsy import dmatrices
from xgboost import XGBClassifier
from sklearn import preprocessing, model_selection, metrics
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from matplotlib.legend_handler import HandlerLine2D
from collections import OrderedDict
from sklearn.preprocessing import OneHotEncoder

In [3]:
import pickle

In [4]:
df_model = pd.read_csv('model_data_nfl')

In [5]:
nfl = pd.read_csv('nfl.csv')

In [6]:
nfl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4861 entries, 0 to 4860
Data columns (total 45 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Date                     4861 non-null   object 
 1   Home Team                4861 non-null   object 
 2   Away Team                4861 non-null   object 
 3   Home Score               4861 non-null   int64  
 4   Away Score               4861 non-null   int64  
 5   Overtime?                282 non-null    object 
 6   Playoff Game?            206 non-null    object 
 7   Neutral Venue?           63 non-null     object 
 8   Home Odds Open           4861 non-null   float64
 9   Home Odds Min            2725 non-null   float64
 10  Home Odds Max            2725 non-null   float64
 11  Home Odds Close          2725 non-null   float64
 12  Away Odds Open           4861 non-null   float64
 13  Away Odds Min            2725 non-null   float64
 14  Away Odds Max           

In [7]:
nfl_list = ['Date', 'Home Team', 'Away Team', 'Home Score', 'Away Score', 'Home Line Open', 'Away Line Open', 'Total Score Open']

In [8]:
spread_df = nfl[nfl_list].copy()

In [9]:
spread_df.head()

Unnamed: 0,Date,Home Team,Away Team,Home Score,Away Score,Home Line Open,Away Line Open,Total Score Open
0,2024-02-11,Kansas City Chiefs,San Francisco 49ers,25,22,1.5,-1.5,47.5
1,2024-01-28,San Francisco 49ers,Detroit Lions,34,31,-7.0,7.0,51.5
2,2024-01-28,Baltimore Ravens,Kansas City Chiefs,10,17,-3.5,3.5,44.5
3,2024-01-21,Buffalo Bills,Kansas City Chiefs,24,27,-2.5,2.5,46.0
4,2024-01-21,Detroit Lions,Tampa Bay Buccaneers,31,23,-5.5,5.5,48.5


In [10]:
spread_df['Home Team'].value_counts()

Home Team
New England Patriots        167
Kansas City Chiefs          163
Green Bay Packers           157
Pittsburgh Steelers         156
Indianapolis Colts          156
Seattle Seahawks            155
San Francisco 49ers         155
New Orleans Saints          155
Denver Broncos              155
Philadelphia Eagles         154
Houston Texans              153
Buffalo Bills               153
Baltimore Ravens            153
Dallas Cowboys              152
Atlanta Falcons             151
Tampa Bay Buccaneers        151
Cincinnati Bengals          151
Carolina Panthers           150
Chicago Bears               150
Minnesota Vikings           150
Arizona Cardinals           149
Tennessee Titans            149
Jacksonville Jaguars        148
New York Giants             148
Miami Dolphins              147
Detroit Lions               147
Cleveland Browns            146
New York Jets               146
Washington Redskins         114
Oakland Raiders             112
San Diego Chargers           9

In [11]:
wash_list = ['Washington Redskins', 'Washington Football Team', 'Washington Commanders']

In [12]:
charg_list = ['Los Angeles Chargers', 'San Diego Chargers']

In [13]:
rams_list = ['Los Angeles Rams', 'St. Louis Rams']

In [14]:
raiders_list = ['Las Vegas Raiders', 'Oakland Raiders']

In [15]:
spread_df.loc[spread_df['Home Team'] == 'Washington Redskins', 'Home Team'] = 'Washington Commanders'

In [16]:
spread_df.loc[spread_df['Away Team'] == 'Washington Redskins', 'Away Team'] = 'Washington Commanders'

In [17]:
spread_df.loc[spread_df['Away Team'] == 'Washington Football Team', 'Away Team'] = 'Washington Commanders'

In [18]:
spread_df.loc[spread_df['Home Team'] == 'Washington Football Team', 'Home Team'] = 'Washington Commanders'

In [19]:
spread_df.loc[spread_df['Home Team'] == 'San Diego Chargers', 'Home Team'] = 'Los Angeles Chargers'

In [20]:
spread_df.loc[spread_df['Away Team'] == 'San Diego Chargers', 'Away Team'] = 'Los Angeles Chargers'

In [21]:
spread_df.loc[spread_df['Away Team'] == 'St. Louis Rams', 'Away Team'] = 'Los Angeles Rams'

In [22]:
spread_df.loc[spread_df['Home Team'] == 'St. Louis Rams', 'Home Team'] = 'Los Angeles Rams'

In [23]:
spread_df.loc[spread_df['Home Team'] == 'Oakland Raiders', 'Home Team'] = 'Las Vegas Raiders'

In [24]:
spread_df.loc[spread_df['Away Team'] == 'Oakland Raiders', 'Away Team'] = 'Las Vegas Raiders'

In [25]:
teams_dict = {'New England Patriots' : 'NE', 'Kansas City Chiefs' : 'KC', 'Green Bay Packers' : 'GB', 'Pittsburgh Steelers' : 'PIT', 'Indianapolis Colts' : 'IND', 'New Orleans Saints' : 'NO', 'Seattle Seahawks' : 'SEA', 'Denver Broncos' : 'DEN', 'San Francisco 49ers' : 'SF', 'Philadelphia Eagles' : 'PHI', 'Buffalo Bills' : 'BUF', 'Baltimore Ravens' : 'BAL', 'Houston Texans' : 'HOU', 'Dallas Cowboys' : 'DAL', 'Atlanta Falcons' : 'ATL', 'Tampa Bay Buccaneers' : 'TB', 'Cincinnati Bengals' : 'CIN', 'Minnesota Vikings' : 'MIN', 'Carolina Panthers' : 'CAR', 'Chicago Bears' : 'CHI', 'Los Angeles Chargers' : 'LAC', 'Los Angeles Rams' : 'LA', 'Tennessee Titans' : 'TEN', 'Arizona Cardinals' : 'ARI', 'New York Giants' : 'NYG', 'Washington Commanders' : 'WAS', 'Jacksonville Jaguars' : 'JAX', 'Miami Dolphins' : 'MIA', 'Detroit Lions' : 'DET', 'Las Vegas Raiders' : 'LV', 'Cleveland Browns' : 'CLE', 'New York Jets' : 'NYJ'}

In [26]:
spread_df['Home Team'] = spread_df['Home Team'].replace(teams_dict)

In [27]:
spread_df['Away Team'] = spread_df['Away Team'].replace(teams_dict)

In [28]:
spread_df.head()

Unnamed: 0,Date,Home Team,Away Team,Home Score,Away Score,Home Line Open,Away Line Open,Total Score Open
0,2024-02-11,KC,SF,25,22,1.5,-1.5,47.5
1,2024-01-28,SF,DET,34,31,-7.0,7.0,51.5
2,2024-01-28,BAL,KC,10,17,-3.5,3.5,44.5
3,2024-01-21,BUF,KC,24,27,-2.5,2.5,46.0
4,2024-01-21,DET,TB,31,23,-5.5,5.5,48.5


In [29]:
spread_df['year'] = pd.DatetimeIndex(spread_df['Date']).year

In [30]:
spread_df['month'] = pd.DatetimeIndex(spread_df['Date']).month

In [31]:
spread_df['Season'] = spread_df['year']

In [32]:
spread_df['Season'] = np.where(spread_df.month == 1 , spread_df.Season - 1, spread_df.Season)

In [33]:
spread_df.head()

Unnamed: 0,Date,Home Team,Away Team,Home Score,Away Score,Home Line Open,Away Line Open,Total Score Open,year,month,Season
0,2024-02-11,KC,SF,25,22,1.5,-1.5,47.5,2024,2,2024
1,2024-01-28,SF,DET,34,31,-7.0,7.0,51.5,2024,1,2023
2,2024-01-28,BAL,KC,10,17,-3.5,3.5,44.5,2024,1,2023
3,2024-01-21,BUF,KC,24,27,-2.5,2.5,46.0,2024,1,2023
4,2024-01-21,DET,TB,31,23,-5.5,5.5,48.5,2024,1,2023


In [34]:
week_odds = pd.read_csv('odds_w14_spread.csv')

In [35]:
week_odds

Unnamed: 0,away_team,home_team,Total Score Open,Date,Season,Home Spread,Home_Implied_Total,Away_Implied_Total
0,GB,DET,53.0,12/5/2024,2024,-3.0,28.0,25.0
1,NYJ,MIA,45.0,12/8/2024,2024,-6.0,25.5,19.5
2,ATL,MIN,46.5,12/8/2024,2024,-6.0,26.25,20.25
3,NO,NYG,41.0,12/8/2024,2024,5.5,17.75,23.25
4,CAR,PHI,45.0,12/8/2024,2024,-14.0,29.5,15.5
5,CLE,PIT,43.5,12/8/2024,2024,-6.0,24.75,18.75
6,LV,TB,47.0,12/8/2024,2024,-6.5,26.75,20.25
7,JAX,TEN,40.0,12/8/2024,2024,-3.0,21.5,18.5
8,SEA,ARI,44.5,12/8/2024,2024,-2.5,23.5,21.0
9,BUF,LAR,50.0,12/8/2024,2024,3.5,23.25,26.75


In [36]:
spread_df['Date'] = pd.to_datetime(spread_df['Date'])

In [37]:
spread_df['Date'] = spread_df['Date'].astype('object')

In [38]:
spread_df['Season'] = spread_df['Season'].astype('string')

In [39]:
spread_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4861 entries, 0 to 4860
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date              4861 non-null   object 
 1   Home Team         4861 non-null   object 
 2   Away Team         4861 non-null   object 
 3   Home Score        4861 non-null   int64  
 4   Away Score        4861 non-null   int64  
 5   Home Line Open    4861 non-null   float64
 6   Away Line Open    2725 non-null   float64
 7   Total Score Open  4861 non-null   float64
 8   year              4861 non-null   int32  
 9   month             4861 non-null   int32  
 10  Season            4861 non-null   string 
dtypes: float64(3), int32(2), int64(2), object(3), string(1)
memory usage: 379.9+ KB


In [40]:
spread_df['Season'] = spread_df['Season'].astype('object')

In [41]:
rename_dict = {'home_team' : 'Home Team', 'away_team' : 'Away Team'}

In [42]:

def calculate_implied_totals(df, over_under_col, spread_col, home_team_col, away_team_col):
    """
    Calculate implied team totals (home and away) based on the over/under and spread.
    
    Args:
    df (pd.DataFrame): DataFrame containing the NFL game data.
    over_under_col (str): Column name for the over/under values.
    spread_col (str): Column name for the spread values.
    home_team_col (str): Column name for the home team names.
    away_team_col (str): Column name for the away team names.
    
    Returns:
    pd.DataFrame: DataFrame with added columns for home and away implied totals.
    """
    
    # Calculate implied away team points
    df['Away_Implied_Total'] = (df[over_under_col] - df[spread_col]) / 2
    
    # Calculate implied home team points
    df['Home_Implied_Total'] = (df[over_under_col] + df[spread_col]) / 2
    
    return df

In [43]:
calculate_implied_totals(spread_df, 'Total Score Open', 'Away Line Open', 'Home Team', 'Away Team')

Unnamed: 0,Date,Home Team,Away Team,Home Score,Away Score,Home Line Open,Away Line Open,Total Score Open,year,month,Season,Away_Implied_Total,Home_Implied_Total
0,2024-02-11 00:00:00,KC,SF,25,22,1.5,-1.5,47.5,2024,2,2024,24.50,23.00
1,2024-01-28 00:00:00,SF,DET,34,31,-7.0,7.0,51.5,2024,1,2023,22.25,29.25
2,2024-01-28 00:00:00,BAL,KC,10,17,-3.5,3.5,44.5,2024,1,2023,20.50,24.00
3,2024-01-21 00:00:00,BUF,KC,24,27,-2.5,2.5,46.0,2024,1,2023,21.75,24.25
4,2024-01-21 00:00:00,DET,TB,31,23,-5.5,5.5,48.5,2024,1,2023,21.50,27.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4856,2006-09-10 00:00:00,DET,SEA,6,9,6.0,,44.0,2006,9,2006,,
4857,2006-09-10 00:00:00,CLE,NO,14,19,-3.0,,36.5,2006,9,2006,,
4858,2006-09-10 00:00:00,CAR,ATL,6,20,-4.5,,38.5,2006,9,2006,,
4859,2006-09-10 00:00:00,ARI,SF,34,27,-9.5,,43.5,2006,9,2006,,


In [44]:
week_odds.rename(rename_dict, axis = 1, inplace = True)

In [45]:
col_spread_keep = ['Date', 'Season', 'Home Team', 'Away Team', 'Home_Implied_Total', 'Away_Implied_Total']

In [46]:
week_odds['Season'] = week_odds['Season'].astype('string')

In [47]:
week_odds['Season'] = week_odds['Season'].astype('object')

In [48]:
week_odds = week_odds[col_spread_keep].copy()

In [49]:
spread_df_2 = spread_df[col_spread_keep].copy()

In [50]:
spread_df_2 = pd.concat([spread_df_2, week_odds], join = 'outer', axis = 0)

In [51]:
spread_df_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4874 entries, 0 to 12
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                4874 non-null   object 
 1   Season              4874 non-null   object 
 2   Home Team           4874 non-null   object 
 3   Away Team           4874 non-null   object 
 4   Home_Implied_Total  2738 non-null   float64
 5   Away_Implied_Total  2738 non-null   float64
dtypes: float64(2), object(4)
memory usage: 266.5+ KB


In [52]:
spread_df_2 = spread_df_2[col_spread_keep].copy()

In [53]:
spread_df_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4874 entries, 0 to 12
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                4874 non-null   object 
 1   Season              4874 non-null   object 
 2   Home Team           4874 non-null   object 
 3   Away Team           4874 non-null   object 
 4   Home_Implied_Total  2738 non-null   float64
 5   Away_Implied_Total  2738 non-null   float64
dtypes: float64(2), object(4)
memory usage: 266.5+ KB


In [54]:
spread_df_2.dropna(inplace = True)

In [55]:
spread_df_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2738 entries, 0 to 12
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                2738 non-null   object 
 1   Season              2738 non-null   object 
 2   Home Team           2738 non-null   object 
 3   Away Team           2738 non-null   object 
 4   Home_Implied_Total  2738 non-null   float64
 5   Away_Implied_Total  2738 non-null   float64
dtypes: float64(2), object(4)
memory usage: 149.7+ KB


In [56]:
o_stats = pd.read_csv('sportsref_offense_w14.csv')

In [57]:
d_stats = pd.read_csv('sportsref_defense_w14.csv')

In [58]:
new_stats = pd.merge(o_stats, d_stats, how = 'inner', on = ['Season', 'Team'])

In [59]:
new_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 22 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Season   192 non-null    float64
 1   Team     192 non-null    object 
 2   W-L%     192 non-null    float64
 3   Cmp%     192 non-null    float64
 4   TD%      192 non-null    float64
 5   Int%     192 non-null    float64
 6   Rate     192 non-null    float64
 7   Sk%      192 non-null    float64
 8   ANY/A    192 non-null    float64
 9   Y/P      192 non-null    float64
 10  3D%      192 non-null    float64
 11  4D%      192 non-null    float64
 12  DADOT    192 non-null    float64
 13  oCmp%    192 non-null    float64
 14  oRate    192 non-null    float64
 15  DY/P     192 non-null    float64
 16  Opp3D%   192 non-null    float64
 17  Opp4D%   192 non-null    float64
 18  Drop%    192 non-null    float64
 19  Prss%    192 non-null    float64
 20  Bad%     192 non-null    float64
 21  PktTime  192 non

In [60]:
new_stats = new_stats.loc[:,~new_stats.columns.duplicated()]

In [61]:
new_stats['Season'] = new_stats['Season'].astype('string')

In [62]:
new_stats.isnull()

Unnamed: 0,Season,Team,W-L%,Cmp%,TD%,Int%,Rate,Sk%,ANY/A,Y/P,...,DADOT,oCmp%,oRate,DY/P,Opp3D%,Opp4D%,Drop%,Prss%,Bad%,PktTime
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
192,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
193,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
194,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [63]:
new_stats.dropna(inplace = True)

In [64]:
new_stats.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192 entries, 0 to 191
Data columns (total 22 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Season   192 non-null    string 
 1   Team     192 non-null    object 
 2   W-L%     192 non-null    float64
 3   Cmp%     192 non-null    float64
 4   TD%      192 non-null    float64
 5   Int%     192 non-null    float64
 6   Rate     192 non-null    float64
 7   Sk%      192 non-null    float64
 8   ANY/A    192 non-null    float64
 9   Y/P      192 non-null    float64
 10  3D%      192 non-null    float64
 11  4D%      192 non-null    float64
 12  DADOT    192 non-null    float64
 13  oCmp%    192 non-null    float64
 14  oRate    192 non-null    float64
 15  DY/P     192 non-null    float64
 16  Opp3D%   192 non-null    float64
 17  Opp4D%   192 non-null    float64
 18  Drop%    192 non-null    float64
 19  Prss%    192 non-null    float64
 20  Bad%     192 non-null    float64
 21  PktTime  192 non-null

In [65]:
new_stats['Season'] = new_stats['Season'].astype('object')

In [66]:
new_stats.nunique()

Season       6
Team        33
W-L%        45
Cmp%        99
TD%         56
Int%        37
Rate       155
Sk%        168
ANY/A      159
Y/P        118
3D%        121
4D%        102
DADOT       37
oCmp%       99
oRate      138
DY/P       110
Opp3D%     108
Opp4D%      96
Drop%       53
Prss%      108
Bad%        89
PktTime      7
dtype: int64

In [67]:
new_stats['Season'] = new_stats['Season'].map(lambda x : x.rstrip('.0'))

In [68]:
team_rename = {'GNB' : 'GB' , 'KAN' : 'KC', 'NOR' : 'NO', 'NWE' : 'NE', 'LVR' : 'LV', 'SFO' : 'SF', 'TAM' : 'TB', 'OAK' : 'LV'}

In [69]:
new_stats.Team.replace(team_rename, inplace = True)

In [70]:
new_stats.Season.replace({'202' : '2020'}, inplace = True)

In [71]:
new_stats.head(32)

Unnamed: 0,Season,Team,W-L%,Cmp%,TD%,Int%,Rate,Sk%,ANY/A,Y/P,...,DADOT,oCmp%,oRate,DY/P,Opp3D%,Opp4D%,Drop%,Prss%,Bad%,PktTime
0,2024,ATL,0.5,67.2,4.2,3.2,90.5,5.81,6.21,5.72,...,6.5,71.6,100.4,5.49,44.7,42.1,3.1,21.8,12.1,2.4
1,2024,BUF,0.833,64.7,5.6,1.4,100.0,3.49,7.54,5.81,...,6.5,68.5,86.6,5.38,40.0,41.7,5.3,16.1,17.5,2.3
2,2024,CAR,0.25,61.9,3.3,3.0,76.9,5.04,4.71,5.03,...,7.7,68.8,102.5,5.79,46.8,54.5,5.3,21.3,19.8,2.2
3,2024,CHI,0.333,61.4,3.4,1.2,85.9,10.63,5.07,4.63,...,8.2,62.7,85.7,5.75,34.2,50.0,3.4,23.0,23.4,2.4
4,2024,CIN,0.333,67.7,6.7,1.1,107.4,6.3,7.45,5.77,...,7.9,65.2,98.0,5.66,44.4,68.4,4.7,21.9,12.8,2.3
5,2024,CLE,0.25,61.4,3.4,2.5,80.4,9.6,4.8,4.74,...,9.9,61.3,96.7,5.8,35.3,61.5,7.2,27.5,16.6,2.4
6,2024,IND,0.462,56.0,4.1,3.6,76.6,5.85,5.35,5.35,...,7.3,70.4,97.1,5.65,45.3,57.9,5.1,22.6,22.4,2.3
7,2024,ARI,0.5,68.7,3.6,1.7,94.8,5.54,6.49,5.88,...,7.1,68.6,95.4,5.41,46.2,55.6,4.0,16.5,13.8,2.5
8,2024,DAL,0.417,63.0,3.3,2.3,82.4,5.89,5.24,4.94,...,8.2,66.9,96.4,5.79,34.2,70.0,3.2,21.9,17.3,2.4
9,2024,DEN,0.615,64.1,4.1,1.8,89.0,4.17,5.96,5.21,...,7.0,64.5,85.1,4.87,37.4,60.0,4.8,18.1,17.4,2.5


In [72]:
home_spread = spread_df_2.drop(axis = 1, labels = 'Away Team')

In [73]:
away_spread = spread_df_2.drop(axis = 1, labels = 'Home Team')

In [74]:
home_spread['Team'] = home_spread['Home Team']

In [75]:
away_spread['Team'] = away_spread['Away Team']

In [76]:
home_spread_for_test = pd.merge(home_spread, new_stats, how = 'left', on = ['Team' , 'Season'])

In [77]:
away_spread_for_test = pd.merge(away_spread, new_stats, how = 'left', on = ['Team' , 'Season'])

In [78]:
home_spread_for_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2738 entries, 0 to 2737
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                2738 non-null   object 
 1   Season              2738 non-null   object 
 2   Home Team           2738 non-null   object 
 3   Home_Implied_Total  2738 non-null   float64
 4   Away_Implied_Total  2738 non-null   float64
 5   Team                2738 non-null   object 
 6   W-L%                1360 non-null   float64
 7   Cmp%                1360 non-null   float64
 8   TD%                 1360 non-null   float64
 9   Int%                1360 non-null   float64
 10  Rate                1360 non-null   float64
 11  Sk%                 1360 non-null   float64
 12  ANY/A               1360 non-null   float64
 13  Y/P                 1360 non-null   float64
 14  3D%                 1360 non-null   float64
 15  4D%                 1360 non-null   float64
 16  DADOT 

In [79]:
home_spread_for_test.dropna(inplace = True)

In [80]:
away_spread_for_test.dropna(inplace = True)

In [81]:
away_spread_for_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1357 entries, 0 to 2737
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                1357 non-null   object 
 1   Season              1357 non-null   object 
 2   Away Team           1357 non-null   object 
 3   Home_Implied_Total  1357 non-null   float64
 4   Away_Implied_Total  1357 non-null   float64
 5   Team                1357 non-null   object 
 6   W-L%                1357 non-null   float64
 7   Cmp%                1357 non-null   float64
 8   TD%                 1357 non-null   float64
 9   Int%                1357 non-null   float64
 10  Rate                1357 non-null   float64
 11  Sk%                 1357 non-null   float64
 12  ANY/A               1357 non-null   float64
 13  Y/P                 1357 non-null   float64
 14  3D%                 1357 non-null   float64
 15  4D%                 1357 non-null   float64
 16  DADOT      

In [82]:
trim_nfl = pd.read_csv('trim_nfl')

In [83]:
trim_nfl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4655 entries, 0 to 4654
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        4655 non-null   int64  
 1   Date              4655 non-null   object 
 2   Home Team         4655 non-null   object 
 3   Away Team         4655 non-null   object 
 4   Home Score        4655 non-null   int64  
 5   Away Score        4655 non-null   int64  
 6   Total Score Open  4655 non-null   float64
 7   Game_Total        4655 non-null   int64  
 8   Overtime          4655 non-null   object 
 9   year              4655 non-null   int64  
 10  month             4655 non-null   int64  
 11  season            4655 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 436.5+ KB


In [84]:
home_spread_for_test.head()

Unnamed: 0,Date,Season,Home Team,Home_Implied_Total,Away_Implied_Total,Team,W-L%,Cmp%,TD%,Int%,...,DADOT,oCmp%,oRate,DY/P,Opp3D%,Opp4D%,Drop%,Prss%,Bad%,PktTime
0,2024-02-11 00:00:00,2024,KC,23.0,24.5,KC,0.917,68.4,4.4,2.6,...,7.5,65.5,94.0,5.3,39.5,50.0,4.2,23.6,15.2,2.3
1,2024-01-28 00:00:00,2023,SF,29.25,22.25,SF,0.706,68.4,6.7,2.4,...,7.2,66.0,79.6,4.98,40.9,50.0,1.9,19.9,17.3,2.7
2,2024-01-28 00:00:00,2023,BAL,24.0,20.5,BAL,0.765,66.4,5.5,1.4,...,7.6,60.6,74.6,4.62,36.4,40.6,5.5,15.3,16.8,2.7
3,2024-01-21 00:00:00,2023,BUF,24.25,21.75,BUF,0.647,66.5,5.0,3.1,...,6.9,65.8,81.9,5.14,38.6,52.6,5.6,19.3,14.2,2.3
4,2024-01-21 00:00:00,2023,DET,27.0,21.5,DET,0.706,67.3,5.0,2.0,...,9.2,63.1,91.5,5.51,37.1,57.7,6.1,22.0,15.1,2.3


In [85]:
new_stats.Team.unique()

array(['ATL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'IND', 'ARI', 'DAL',
       'DEN', 'DET', 'GB', 'HOU', 'JAX', 'KC', 'MIA', 'MIN', 'NO', 'NE',
       'NYG', 'NYJ', 'TEN', 'PHI', 'PIT', 'LV', 'LAR', 'BAL', 'LAC',
       'SEA', 'SF', 'TB', 'WAS'], dtype=object)

In [86]:
new_stats.Team.unique()

array(['ATL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'IND', 'ARI', 'DAL',
       'DEN', 'DET', 'GB', 'HOU', 'JAX', 'KC', 'MIA', 'MIN', 'NO', 'NE',
       'NYG', 'NYJ', 'TEN', 'PHI', 'PIT', 'LV', 'LAR', 'BAL', 'LAC',
       'SEA', 'SF', 'TB', 'WAS'], dtype=object)

In [87]:
new_stats.Season.value_counts()

Season
2024    32
2023    32
2022    32
2021    32
2020    32
2019    32
Name: count, dtype: int64

In [88]:
new_stats.Season.value_counts()

Season
2024    32
2023    32
2022    32
2021    32
2020    32
2019    32
Name: count, dtype: int64

In [89]:
seasons_stats = new_stats.Season.unique()

In [90]:
seasons_stats

array(['2024', '2023', '2022', '2021', '2020', '2019'], dtype=object)

In [91]:
home_rename = {'W-L%' : 'w_l_x', 'Sk%' : 'sack_x', 'Cmp%' : 'cmp_pct_x', 'ANY/A' : 'adj_net_yd_att_x', 
               'DY/P' : 'opp_yds_all_x', '3D%' : 'third_dwn_pct_x', 'DADOT' : 'dadot_x', '4D%' : 'four_dwn_pct_x', 'oRate' : 'opp_rate_x',
               'Y/P' : 'yds_ply_x', 'TD%' : 'td_pct_x', 'Int%' : 'int_pct_x', 'Rate' : 'rate_x', 'Opp3D%' : 'opp_3d_x', 'Opp4D%' : 'opp_4d_x', 
               'Prss%' : 'press_pct_x', 'Drop%' : 'drop_pct_x', 'oCmp%' : 'opp_cmp_pct_x', 'Bad%' : 'bad_pct_x', 'PktTime': 'time_x'}

In [92]:
away_rename = {'W-L%' : 'w_l_y', 'Sk%' : 'sack_y', 'Cmp%' : 'cmp_pct_y', 'ANY/A' : 'adj_net_yd_att_y', 
               'DY/P' : 'opp_yds_all_y', '3D%' : 'third_dwn_pct_y', 'DADOT' : 'dadot_y', '4D%' : 'four_dwn_pct_y', 'oRate' : 'opp_rate_y',
               'Y/P' : 'yds_ply_y', 'TD%' : 'td_pct_y', 'Int%' : 'int_pct_y', 'Rate' : 'rate_y', 'Opp3D%' : 'opp_3d_y', 'Opp4D%' : 'opp_4d_y', 
               'Prss%' : 'press_pct_y', 'Drop%' : 'drop_pct_y', 'oCmp%' : 'opp_cmp_pct_y', 'Bad%' : 'bad_pct_y', 'PktTime' : 'time_y'}

In [93]:
home_spread_for_test.rename(columns = home_rename, inplace = True)

In [94]:
away_spread_for_test.rename(columns = away_rename, inplace = True)

In [95]:
full_df = pd.concat([home_spread_for_test, away_spread_for_test], join = 'outer', axis = 1).reset_index(drop = True)

In [96]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1404 entries, 0 to 1403
Data columns (total 52 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                1360 non-null   object 
 1   Season              1360 non-null   object 
 2   Home Team           1360 non-null   object 
 3   Home_Implied_Total  1360 non-null   float64
 4   Away_Implied_Total  1360 non-null   float64
 5   Team                1360 non-null   object 
 6   w_l_x               1360 non-null   float64
 7   cmp_pct_x           1360 non-null   float64
 8   td_pct_x            1360 non-null   float64
 9   int_pct_x           1360 non-null   float64
 10  rate_x              1360 non-null   float64
 11  sack_x              1360 non-null   float64
 12  adj_net_yd_att_x    1360 non-null   float64
 13  yds_ply_x           1360 non-null   float64
 14  third_dwn_pct_x     1360 non-null   float64
 15  four_dwn_pct_x      1360 non-null   float64
 16  dadot_

In [97]:
full_df = full_df.loc[:,~full_df.columns.duplicated()]

In [98]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1404 entries, 0 to 1403
Data columns (total 47 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                1360 non-null   object 
 1   Season              1360 non-null   object 
 2   Home Team           1360 non-null   object 
 3   Home_Implied_Total  1360 non-null   float64
 4   Away_Implied_Total  1360 non-null   float64
 5   Team                1360 non-null   object 
 6   w_l_x               1360 non-null   float64
 7   cmp_pct_x           1360 non-null   float64
 8   td_pct_x            1360 non-null   float64
 9   int_pct_x           1360 non-null   float64
 10  rate_x              1360 non-null   float64
 11  sack_x              1360 non-null   float64
 12  adj_net_yd_att_x    1360 non-null   float64
 13  yds_ply_x           1360 non-null   float64
 14  third_dwn_pct_x     1360 non-null   float64
 15  four_dwn_pct_x      1360 non-null   float64
 16  dadot_

In [99]:
full_df.head()

Unnamed: 0,Date,Season,Home Team,Home_Implied_Total,Away_Implied_Total,Team,w_l_x,cmp_pct_x,td_pct_x,int_pct_x,...,dadot_y,opp_cmp_pct_y,opp_rate_y,opp_yds_all_y,opp_3d_y,opp_4d_y,drop_pct_y,press_pct_y,bad_pct_y,time_y
0,2024-02-11 00:00:00,2024,KC,23.0,24.5,KC,0.917,68.4,4.4,2.6,...,7.9,63.8,84.2,5.19,44.2,60.0,5.3,20.6,15.2,2.6
1,2024-01-28 00:00:00,2023,SF,29.25,22.25,SF,0.706,68.4,6.7,2.4,...,9.2,63.1,91.5,5.51,37.1,57.7,6.1,22.0,15.1,2.3
2,2024-01-28 00:00:00,2023,BAL,24.0,20.5,BAL,0.765,66.4,5.5,1.4,...,7.5,61.2,83.6,4.71,37.1,46.4,7.3,22.9,13.6,2.5
3,2024-01-21 00:00:00,2023,BUF,24.25,21.75,BUF,0.647,66.5,5.0,3.1,...,7.5,61.2,83.6,4.71,37.1,46.4,7.3,22.9,13.6,2.5
4,2024-01-21 00:00:00,2023,DET,27.0,21.5,DET,0.706,67.3,5.0,2.0,...,7.9,66.1,92.0,5.41,40.9,48.0,4.5,17.6,14.7,2.4


In [100]:
full_df.isna().sum()

Date                  44
Season                44
Home Team             44
Home_Implied_Total    44
Away_Implied_Total    44
Team                  44
w_l_x                 44
cmp_pct_x             44
td_pct_x              44
int_pct_x             44
rate_x                44
sack_x                44
adj_net_yd_att_x      44
yds_ply_x             44
third_dwn_pct_x       44
four_dwn_pct_x        44
dadot_x               44
opp_cmp_pct_x         44
opp_rate_x            44
opp_yds_all_x         44
opp_3d_x              44
opp_4d_x              44
drop_pct_x            44
press_pct_x           44
bad_pct_x             44
time_x                44
Away Team             47
w_l_y                 47
cmp_pct_y             47
td_pct_y              47
int_pct_y             47
rate_y                47
sack_y                47
adj_net_yd_att_y      47
yds_ply_y             47
third_dwn_pct_y       47
four_dwn_pct_y        47
dadot_y               47
opp_cmp_pct_y         47
opp_rate_y            47


In [101]:
full_df.dropna(inplace = True)

In [102]:
full_df.head()

Unnamed: 0,Date,Season,Home Team,Home_Implied_Total,Away_Implied_Total,Team,w_l_x,cmp_pct_x,td_pct_x,int_pct_x,...,dadot_y,opp_cmp_pct_y,opp_rate_y,opp_yds_all_y,opp_3d_y,opp_4d_y,drop_pct_y,press_pct_y,bad_pct_y,time_y
0,2024-02-11 00:00:00,2024,KC,23.0,24.5,KC,0.917,68.4,4.4,2.6,...,7.9,63.8,84.2,5.19,44.2,60.0,5.3,20.6,15.2,2.6
1,2024-01-28 00:00:00,2023,SF,29.25,22.25,SF,0.706,68.4,6.7,2.4,...,9.2,63.1,91.5,5.51,37.1,57.7,6.1,22.0,15.1,2.3
2,2024-01-28 00:00:00,2023,BAL,24.0,20.5,BAL,0.765,66.4,5.5,1.4,...,7.5,61.2,83.6,4.71,37.1,46.4,7.3,22.9,13.6,2.5
3,2024-01-21 00:00:00,2023,BUF,24.25,21.75,BUF,0.647,66.5,5.0,3.1,...,7.5,61.2,83.6,4.71,37.1,46.4,7.3,22.9,13.6,2.5
4,2024-01-21 00:00:00,2023,DET,27.0,21.5,DET,0.706,67.3,5.0,2.0,...,7.9,66.1,92.0,5.41,40.9,48.0,4.5,17.6,14.7,2.4


In [103]:

home_train = full_df.loc[full_df['Season'] != '2024'].copy()
home_test = full_df.loc[full_df['Season'] == '2024'].copy()
home_X_train = home_train.drop('Home_Implied_Total', axis = 1)
home_y_train= home_train['Home_Implied_Total'].copy()
home_X_test = home_test.drop('Home_Implied_Total', axis = 1)
home_y_test = home_test['Home_Implied_Total'].copy()

In [104]:
home_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1299 entries, 1 to 1346
Data columns (total 47 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                1299 non-null   object 
 1   Season              1299 non-null   object 
 2   Home Team           1299 non-null   object 
 3   Home_Implied_Total  1299 non-null   float64
 4   Away_Implied_Total  1299 non-null   float64
 5   Team                1299 non-null   object 
 6   w_l_x               1299 non-null   float64
 7   cmp_pct_x           1299 non-null   float64
 8   td_pct_x            1299 non-null   float64
 9   int_pct_x           1299 non-null   float64
 10  rate_x              1299 non-null   float64
 11  sack_x              1299 non-null   float64
 12  adj_net_yd_att_x    1299 non-null   float64
 13  yds_ply_x           1299 non-null   float64
 14  third_dwn_pct_x     1299 non-null   float64
 15  four_dwn_pct_x      1299 non-null   float64
 16  dadot_x    

In [105]:
rf = RandomForestRegressor(n_estimators = 50, min_samples_split =10, random_state = 151)

In [106]:
predictors_1 = [ 'w_l_x', 'sack_x', 'cmp_pct_x', 'adj_net_yd_att_x', 
                'opp_yds_all_x', 'third_dwn_pct_x', 'dadot_x',  'four_dwn_pct_x', 'opp_rate_x',
                'yds_ply_x',  'td_pct_x',  'int_pct_x',  'rate_x',  'opp_3d_x',  'opp_4d_x', 
                'press_pct_x',  'drop_pct_x',  'opp_cmp_pct_x', 'bad_pct_x','time_x','w_l_y', 'sack_y', 'cmp_pct_y', 'adj_net_yd_att_y', 
             'opp_yds_all_y', 'third_dwn_pct_y',  'dadot_y', 'four_dwn_pct_y',  'opp_rate_y',
             'yds_ply_y', 'td_pct_y', 'int_pct_y', 'rate_y', 'opp_3d_y', 'opp_4d_y', 
             'press_pct_y', 'drop_pct_y', 'opp_cmp_pct_y', 'bad_pct_y', 'time_y']

In [107]:
rf.fit(home_X_train[predictors_1], home_y_train)

In [108]:
preds1 = rf.predict(home_X_test[predictors_1])

In [109]:
from sklearn import metrics


def metrics_test(test, preds):
    print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test, preds))
    print('Mean Squared Error (MSE):', metrics.mean_squared_error(test, preds))
    print('Root Mean Squared Error (RMSE):', metrics.mean_squared_error(test, preds, squared=False))
    print('Mean Absolute Percentage Error (MAPE):', metrics.mean_absolute_percentage_error(test, preds))
    print('Explained Variance Score:', metrics.explained_variance_score(test, preds))
    print('Max Error:', metrics.max_error(test, preds))
    print('Mean Squared Log Error:', metrics.mean_squared_log_error(test, preds))
    print('Median Absolute Error:', metrics.median_absolute_error(test, preds))
    print('R^2:', metrics.r2_score(test, preds))
    print('Mean Poisson Deviance:', metrics.mean_poisson_deviance(test, preds))
    print('Mean Gamma Deviance:', metrics.mean_gamma_deviance(test, preds))

In [110]:
away_train = full_df.loc[full_df['Season'] != '2024'].copy()
away_test = full_df.loc[full_df['Season'] == '2024'].copy()
away_X_train = away_train.drop('Away_Implied_Total', axis = 1)
away_y_train= away_train['Away_Implied_Total'].copy()
away_X_test = away_test.drop('Away_Implied_Total', axis = 1)
away_y_test = away_test['Away_Implied_Total'].copy()

In [111]:
rf.fit(away_X_train[predictors_1], away_y_train)

In [112]:
preds2 = rf.predict(away_X_test[predictors_1])

In [113]:
results_df = home_X_test[['Home Team', 'Away Team']]

In [114]:
results_df['Home Score'] = home_y_test.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['Home Score'] = home_y_test.copy()


In [115]:
results_df['Away Score'] = away_y_test.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['Away Score'] = away_y_test.copy()


In [116]:
results_df['pred_home'] = preds1.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['pred_home'] = preds1.copy()


In [117]:
results_df['pred_away'] = preds2.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['pred_away'] = preds2.copy()


In [118]:
compare_odds_df = results_df[['Home Team', 'Away Team','Home Score','Away Score', 'pred_home', 'pred_away']].copy()

In [119]:
compare_odds_df['pred_home_spread'] = compare_odds_df['pred_away'] - compare_odds_df['pred_home']

In [120]:
compare_odds_df['Vegas_Home_Spread'] = compare_odds_df['Away Score'] - compare_odds_df['Home Score']

In [121]:
compare_odds_df.drop(0, inplace = True)

In [122]:
compare_odds_df['Vegas_Over_Under'] = compare_odds_df['Away Score'] + compare_odds_df['Home Score']

In [123]:
compare_odds_df['pred_over_under'] = compare_odds_df['pred_away'] + compare_odds_df['pred_home']

In [124]:
compare_odds_df.head(16)

Unnamed: 0,Home Team,Away Team,Home Score,Away Score,pred_home,pred_away,pred_home_spread,Vegas_Home_Spread,Vegas_Over_Under,pred_over_under
1347,DET,GB,28.0,25.0,26.45106,22.195412,-4.255648,-3.0,53.0,48.646472
1348,MIA,NYJ,25.5,19.5,23.607359,19.952208,-3.655151,-6.0,45.0,43.559567
1349,MIN,ATL,26.25,20.25,28.062037,19.85433,-8.207707,-6.0,46.5,47.916368
1350,NYG,NO,17.75,23.25,21.32882,23.60028,2.27146,5.5,41.0,44.9291
1351,PHI,CAR,29.5,15.5,27.939614,17.715752,-10.223863,-14.0,45.0,45.655366
1352,PIT,CLE,24.75,18.75,25.660277,18.215542,-7.444735,-6.0,43.5,43.875818
1353,TB,LV,26.75,20.25,25.100443,19.84824,-5.252202,-6.5,47.0,44.948683
1354,TEN,JAX,21.5,18.5,22.960949,19.206121,-3.754828,-3.0,40.0,42.16707
1355,ARI,SEA,23.5,21.0,23.142528,22.622857,-0.519671,-2.5,44.5,45.765385
1356,LAR,BUF,23.25,26.75,23.658684,26.218063,2.559379,3.5,50.0,49.876748


## Algorithm Projected Line