# 2024 NFL Season O/U Alg

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import statsmodels.formula.api as smf
from patsy import dmatrices
from xgboost import XGBClassifier
from sklearn import preprocessing, model_selection, metrics
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from matplotlib.legend_handler import HandlerLine2D
from collections import OrderedDict
from sklearn.preprocessing import OneHotEncoder

In [2]:
import pickle

In [3]:
df_model = pd.read_csv('model_data_nfl')

In [4]:
week_1 = pd.read_csv('odds_w1.csv')

In [5]:
week_1

Unnamed: 0,home_team,away_team,Total Score Open,Date
0,KC,BAL,46.5,9/5/2024
1,PHI,GB,48.5,9/6/2024
2,IND,HOU,49.0,9/8/2024
3,NO,CAR,41.5,9/8/2024
4,BUF,ARI,48.0,9/8/2024
5,NYG,MIN,41.5,9/8/2024
6,CIN,NE,40.5,9/8/2024
7,ATL,PIT,42.0,9/8/2024
8,MIA,JAX,49.0,9/8/2024
9,CHI,TEN,44.5,9/8/2024


In [6]:
week_1['Date'] = pd.to_datetime(week_1['Date'])

In [7]:
week_1['Date'] = week_1['Date'].astype('object')

In [8]:
week_1['Season'] = '2023'

In [9]:
week_1['Season'] = week_1['Season'].astype('object')

In [10]:
df_model.head()

Unnamed: 0.1,Unnamed: 0,Date,Home Team,Home Score,Total Score Open,Game_Total,Overtime,season,Away Team,Away Score,...,defense_completion_percentage_y,defense_ave_yards_gained_pass_y,defense_ave_yards_gained_run_y,defense_ave_air_yards_y,defense_ave_yac_y,defense_ave_epa_pass_y,defense_ave_epa_run_y,defense_success_rate_pass_y,defense_success_rate_run_y,score_differential_y
0,0,2023-01-08,GB,16,48.5,36,N,2022,DET,20,...,0.584437,6.925497,5.400862,9.511628,4.705382,0.046447,0.118396,0.463576,0.476293,26.0
1,1,2023-01-08,ATL,30,40.0,47,N,2022,TB,17,...,0.592834,5.643322,4.604911,8.118584,4.796703,-0.013615,-0.040508,0.429967,0.410714,-45.0
2,2,2023-01-08,BUF,35,43.0,58,N,2022,NE,23,...,0.558282,5.648773,4.241784,8.84396,4.752747,-0.105847,-0.063886,0.417178,0.415493,17.0
3,3,2023-01-08,CHI,13,45.5,42,N,2022,MIN,29,...,0.621951,6.88872,4.592998,7.939542,5.482843,0.031256,-0.015597,0.460366,0.442013,-3.0
4,4,2023-01-08,CIN,27,43.0,43,N,2022,BAL,16,...,0.613707,6.15109,3.967254,6.976351,4.786802,0.015447,-0.062208,0.450156,0.420655,35.0


In [11]:
df_model.nunique()

Unnamed: 0                         4383
Date                                812
Home Team                            32
Home Score                           58
Total Score Open                     58
Game_Total                           92
Overtime                              2
season                               17
Away Team                            32
Away Score                           54
offense_completion_percentage_x     533
offense_ave_yards_gained_pass_x     542
offense_ave_yards_gained_run_x      543
offense_ave_air_yards_x             544
offense_ave_yac_x                   543
offense_ave_epa_pass_x              544
offense_ave_epa_run_x               544
offense_success_rate_pass_x         531
offense_success_rate_run_x          526
defense_completion_percentage_x     535
defense_ave_yards_gained_pass_x     543
defense_ave_yards_gained_run_x      542
defense_ave_air_yards_x             544
defense_ave_yac_x                   542
defense_ave_epa_pass_x              544


In [12]:
df_model.drop(columns = 'Unnamed: 0', inplace = True)

In [13]:
rename_dict = {'home_team' : 'Home Team', 'away_team' : 'Away Team'}

In [14]:
week_1.rename(columns = rename_dict, inplace = True)

In [15]:
week_1

Unnamed: 0,Home Team,Away Team,Total Score Open,Date,Season
0,KC,BAL,46.5,2024-09-05 00:00:00,2023
1,PHI,GB,48.5,2024-09-06 00:00:00,2023
2,IND,HOU,49.0,2024-09-08 00:00:00,2023
3,NO,CAR,41.5,2024-09-08 00:00:00,2023
4,BUF,ARI,48.0,2024-09-08 00:00:00,2023
5,NYG,MIN,41.5,2024-09-08 00:00:00,2023
6,CIN,NE,40.5,2024-09-08 00:00:00,2023
7,ATL,PIT,42.0,2024-09-08 00:00:00,2023
8,MIA,JAX,49.0,2024-09-08 00:00:00,2023
9,CHI,TEN,44.5,2024-09-08 00:00:00,2023


In [16]:
week_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Home Team         16 non-null     object 
 1   Away Team         16 non-null     object 
 2   Total Score Open  16 non-null     float64
 3   Date              16 non-null     object 
 4   Season            16 non-null     object 
dtypes: float64(1), object(4)
memory usage: 772.0+ bytes


In [17]:
new_stats = pd.read_csv('football.csv')

In [18]:
new_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 18 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Season   160 non-null    float64
 1   Team     160 non-null    object 
 2   Pts      160 non-null    float64
 3   PtsO     160 non-null    float64
 4   Cmp%     160 non-null    float64
 5   AY/A     160 non-null    float64
 6   Y/P      160 non-null    float64
 7   OnTgt%   160 non-null    float64
 8   DADOT    160 non-null    float64
 9   IAY/PA   160 non-null    float64
 10  Drop%    160 non-null    float64
 11  ADOT     160 non-null    float64
 12  DADOT.1  160 non-null    float64
 13  TD%      160 non-null    float64
 14  Int%     160 non-null    float64
 15  Rate     160 non-null    float64
 16  TO       160 non-null    float64
 17  CAY/PA   160 non-null    float64
dtypes: float64(17), object(1)
memory usage: 22.8+ KB


In [19]:
new_stats.head()

Unnamed: 0,Season,Team,Pts,PtsO,Cmp%,AY/A,Y/P,OnTgt%,DADOT,IAY/PA,Drop%,ADOT,DADOT.1,TD%,Int%,Rate,TO,CAY/PA
0,,,,,,,,,,,,,,,,,,
1,2023.0,ATL,321.0,373.0,61.7,5.85,5.2,73.9,8.6,8.2,4.9,7.9,8.6,3.2,3.2,80.5,28.0,3.8
2,2023.0,BUF,451.0,311.0,66.5,6.78,5.71,77.8,6.9,8.7,5.6,8.1,6.9,5.0,3.1,92.2,28.0,4.0
3,2023.0,CAR,236.0,416.0,59.7,4.35,4.11,72.9,7.3,7.6,5.2,6.9,7.3,2.2,1.7,75.2,20.0,3.1
4,2023.0,CHI,360.0,379.0,62.6,5.46,5.01,74.1,7.3,7.4,3.9,6.8,7.3,3.7,2.9,82.2,25.0,3.3


In [20]:
new_stats = new_stats.iloc[1:161].copy()

In [21]:
new_stats.head()

Unnamed: 0,Season,Team,Pts,PtsO,Cmp%,AY/A,Y/P,OnTgt%,DADOT,IAY/PA,Drop%,ADOT,DADOT.1,TD%,Int%,Rate,TO,CAY/PA
1,2023.0,ATL,321.0,373.0,61.7,5.85,5.2,73.9,8.6,8.2,4.9,7.9,8.6,3.2,3.2,80.5,28.0,3.8
2,2023.0,BUF,451.0,311.0,66.5,6.78,5.71,77.8,6.9,8.7,5.6,8.1,6.9,5.0,3.1,92.2,28.0,4.0
3,2023.0,CAR,236.0,416.0,59.7,4.35,4.11,72.9,7.3,7.6,5.2,6.9,7.3,2.2,1.7,75.2,20.0,3.1
4,2023.0,CHI,360.0,379.0,62.6,5.46,5.01,74.1,7.3,7.4,3.9,6.8,7.3,3.7,2.9,82.2,25.0,3.3
5,2023.0,CIN,366.0,384.0,68.3,6.19,5.17,77.0,9.4,6.3,5.0,6.0,9.4,4.4,2.3,93.0,16.0,3.1


In [22]:
new_stats['Season'] = new_stats['Season'].astype('string')

In [23]:
new_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 1 to 160
Data columns (total 18 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Season   160 non-null    string 
 1   Team     160 non-null    object 
 2   Pts      160 non-null    float64
 3   PtsO     160 non-null    float64
 4   Cmp%     160 non-null    float64
 5   AY/A     160 non-null    float64
 6   Y/P      160 non-null    float64
 7   OnTgt%   160 non-null    float64
 8   DADOT    160 non-null    float64
 9   IAY/PA   160 non-null    float64
 10  Drop%    160 non-null    float64
 11  ADOT     160 non-null    float64
 12  DADOT.1  160 non-null    float64
 13  TD%      160 non-null    float64
 14  Int%     160 non-null    float64
 15  Rate     160 non-null    float64
 16  TO       160 non-null    float64
 17  CAY/PA   160 non-null    float64
dtypes: float64(16), object(1), string(1)
memory usage: 22.6+ KB


In [24]:
new_stats['Season'] = new_stats['Season'].map(lambda x : x.rstrip('.0'))

In [25]:
new_stats['Season'] = new_stats['Season'].astype('object')

In [26]:
team_rename = {'GNB' : 'GB' , 'KAN' : 'KC', 'NOR' : 'NO', 'NWE' : 'NE', 'LVR' : 'LV', 'SFO' : 'SF', 'TAM' : 'TB', 'OAK' : 'LV'}

In [27]:
new_stats.Team.replace(team_rename, inplace = True)

In [28]:
new_stats.Season.replace({'202' : '2020'}, inplace = True)

In [29]:
new_stats.head()

Unnamed: 0,Season,Team,Pts,PtsO,Cmp%,AY/A,Y/P,OnTgt%,DADOT,IAY/PA,Drop%,ADOT,DADOT.1,TD%,Int%,Rate,TO,CAY/PA
1,2023,ATL,321.0,373.0,61.7,5.85,5.2,73.9,8.6,8.2,4.9,7.9,8.6,3.2,3.2,80.5,28.0,3.8
2,2023,BUF,451.0,311.0,66.5,6.78,5.71,77.8,6.9,8.7,5.6,8.1,6.9,5.0,3.1,92.2,28.0,4.0
3,2023,CAR,236.0,416.0,59.7,4.35,4.11,72.9,7.3,7.6,5.2,6.9,7.3,2.2,1.7,75.2,20.0,3.1
4,2023,CHI,360.0,379.0,62.6,5.46,5.01,74.1,7.3,7.4,3.9,6.8,7.3,3.7,2.9,82.2,25.0,3.3
5,2023,CIN,366.0,384.0,68.3,6.19,5.17,77.0,9.4,6.3,5.0,6.0,9.4,4.4,2.3,93.0,16.0,3.1


In [30]:
home_w1 = week_1.drop(axis = 1, labels = 'Away Team')

In [31]:
away_w1 = week_1.drop(axis = 1, labels = 'Home Team')

In [32]:
home_w1['Team'] = home_w1['Home Team']

In [33]:
away_w1['Team'] = away_w1['Away Team']

In [34]:
home_w1_for_test = pd.merge(home_w1, new_stats, how = 'left', on = ['Team' , 'Season'])

In [35]:
away_w1_for_test = pd.merge(away_w1, new_stats, how = 'left', on = ['Team' , 'Season'])

In [36]:
home_w1_for_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 0 to 15
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Home Team         16 non-null     object 
 1   Total Score Open  16 non-null     float64
 2   Date              16 non-null     object 
 3   Season            16 non-null     object 
 4   Team              16 non-null     object 
 5   Pts               16 non-null     float64
 6   PtsO              16 non-null     float64
 7   Cmp%              16 non-null     float64
 8   AY/A              16 non-null     float64
 9   Y/P               16 non-null     float64
 10  OnTgt%            16 non-null     float64
 11  DADOT             16 non-null     float64
 12  IAY/PA            16 non-null     float64
 13  Drop%             16 non-null     float64
 14  ADOT              16 non-null     float64
 15  DADOT.1           16 non-null     float64
 16  TD%               16 non-null     float64
 17 

In [37]:
away_w1_for_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 0 to 15
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Away Team         16 non-null     object 
 1   Total Score Open  16 non-null     float64
 2   Date              16 non-null     object 
 3   Season            16 non-null     object 
 4   Team              16 non-null     object 
 5   Pts               16 non-null     float64
 6   PtsO              16 non-null     float64
 7   Cmp%              16 non-null     float64
 8   AY/A              16 non-null     float64
 9   Y/P               16 non-null     float64
 10  OnTgt%            16 non-null     float64
 11  DADOT             16 non-null     float64
 12  IAY/PA            16 non-null     float64
 13  Drop%             16 non-null     float64
 14  ADOT              16 non-null     float64
 15  DADOT.1           16 non-null     float64
 16  TD%               16 non-null     float64
 17 

In [38]:
trim_nfl = pd.read_csv('trim_nfl')

In [39]:
trim_nfl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4655 entries, 0 to 4654
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        4655 non-null   int64  
 1   Date              4655 non-null   object 
 2   Home Team         4655 non-null   object 
 3   Away Team         4655 non-null   object 
 4   Home Score        4655 non-null   int64  
 5   Away Score        4655 non-null   int64  
 6   Total Score Open  4655 non-null   float64
 7   Game_Total        4655 non-null   int64  
 8   Overtime          4655 non-null   object 
 9   year              4655 non-null   int64  
 10  month             4655 non-null   int64  
 11  season            4655 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 436.5+ KB


In [40]:
trim_nfl.drop(columns = 'Unnamed: 0', inplace = True)

In [41]:
home_df = trim_nfl.drop( axis = 1, labels = ['Away Team', 'Away Score', 'Home Score', 'Overtime', 'Game_Total', 'year', 'month']).copy()

In [42]:
home_df['Team'] = home_df['Home Team']
home_df.drop(axis = 1, labels = 'Home Team')

Unnamed: 0,Date,Total Score Open,season,Team
0,2024-01-07,50.0,2023,MIA
1,2024-01-07,47.5,2023,ARI
2,2024-01-07,37.0,2023,CAR
3,2024-01-07,40.0,2023,CIN
4,2024-01-07,44.0,2023,DET
...,...,...,...,...
4650,2006-09-10,44.0,2006,DET
4651,2006-09-10,36.5,2006,CLE
4652,2006-09-10,38.5,2006,CAR
4653,2006-09-10,43.5,2006,ARI


In [43]:
away_df = trim_nfl.drop( axis = 1, labels = ['Home Team', 'Home Score', 'Away Score','Overtime', 'Game_Total', 'year', 'month']).copy()

In [44]:
away_df['Team'] = away_df['Away Team']
away_df.drop(axis = 1, labels = 'Away Team')

Unnamed: 0,Date,Total Score Open,season,Team
0,2024-01-07,50.0,2023,BUF
1,2024-01-07,47.5,2023,SEA
2,2024-01-07,37.0,2023,TB
3,2024-01-07,40.0,2023,CLE
4,2024-01-07,44.0,2023,MIN
...,...,...,...,...
4650,2006-09-10,44.0,2006,SEA
4651,2006-09-10,36.5,2006,NO
4652,2006-09-10,38.5,2006,ATL
4653,2006-09-10,43.5,2006,SF


In [45]:
home_df['Season'] = home_df['season']
home_df.drop(columns = 'season', inplace = True)

In [46]:
away_df['Season'] = away_df['season']
away_df.drop(columns = 'season', inplace = True)

In [47]:
home_df.head()

Unnamed: 0,Date,Home Team,Total Score Open,Team,Season
0,2024-01-07,MIA,50.0,MIA,2023
1,2024-01-07,ARI,47.5,ARI,2023
2,2024-01-07,CAR,37.0,CAR,2023
3,2024-01-07,CIN,40.0,CIN,2023
4,2024-01-07,DET,44.0,DET,2023


In [48]:
new_stats.Team.unique()

array(['ATL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'IND', 'ARI', 'DAL',
       'DEN', 'DET', 'GB', 'HOU', 'JAX', 'KC', 'MIA', 'MIN', 'NO', 'NE',
       'NYG', 'NYJ', 'TEN', 'PHI', 'PIT', 'LV', 'LAR', 'BAL', 'LAC',
       'SEA', 'SF', 'TB', 'WAS'], dtype=object)

In [49]:
new_stats.Team.unique()

array(['ATL', 'BUF', 'CAR', 'CHI', 'CIN', 'CLE', 'IND', 'ARI', 'DAL',
       'DEN', 'DET', 'GB', 'HOU', 'JAX', 'KC', 'MIA', 'MIN', 'NO', 'NE',
       'NYG', 'NYJ', 'TEN', 'PHI', 'PIT', 'LV', 'LAR', 'BAL', 'LAC',
       'SEA', 'SF', 'TB', 'WAS'], dtype=object)

In [50]:
new_stats.Season.value_counts()

2023    32
2022    32
2021    32
2020    32
2019    32
Name: Season, dtype: int64

In [51]:
new_stats.Season.value_counts()

2023    32
2022    32
2021    32
2020    32
2019    32
Name: Season, dtype: int64

In [52]:
seasons_stats = new_stats.Season.unique()

In [53]:
seasons_stats

array(['2023', '2022', '2021', '2020', '2019'], dtype=object)

In [54]:
stat_year_list = seasons_stats.tolist()

In [55]:
home_df.Season.unique()


array([2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013,
       2012, 2011, 2010, 2009, 2008, 2007, 2006], dtype=int64)

In [56]:
home_df['Season'] = home_df['Season'].astype('string')

In [57]:
home_df['Season'] = home_df['Season'].astype('object')

In [58]:
rams = {'LA' : 'LAR'}

In [59]:
home_df.Team.replace(rams, inplace = True)

In [60]:
away_df.Team.replace(rams, inplace = True)

In [61]:
home_df.Team.unique()

array(['MIA', 'ARI', 'CAR', 'CIN', 'DET', 'GB', 'IND', 'LAC', 'LV', 'NE',
       'NO', 'NYG', 'SF', 'TEN', 'WAS', 'BAL', 'MIN', 'DEN', 'SEA', 'KC',
       'BUF', 'CHI', 'HOU', 'JAX', 'PHI', 'TB', 'DAL', 'CLE', 'ATL',
       'NYJ', 'PIT', 'LAR'], dtype=object)

In [62]:
home_df_stats = pd.merge(home_df, new_stats, how = 'left', on = ['Team', 'Season'])

In [63]:
home_df_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4655 entries, 0 to 4654
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date              4655 non-null   object 
 1   Home Team         4655 non-null   object 
 2   Total Score Open  4655 non-null   float64
 3   Team              4655 non-null   object 
 4   Season            4655 non-null   object 
 5   Pts               1327 non-null   float64
 6   PtsO              1327 non-null   float64
 7   Cmp%              1327 non-null   float64
 8   AY/A              1327 non-null   float64
 9   Y/P               1327 non-null   float64
 10  OnTgt%            1327 non-null   float64
 11  DADOT             1327 non-null   float64
 12  IAY/PA            1327 non-null   float64
 13  Drop%             1327 non-null   float64
 14  ADOT              1327 non-null   float64
 15  DADOT.1           1327 non-null   float64
 16  TD%               1327 non-null   float64


In [64]:
home_df_stats.dropna(inplace=True)

In [65]:
home_df_stats.head()

Unnamed: 0,Date,Home Team,Total Score Open,Team,Season,Pts,PtsO,Cmp%,AY/A,Y/P,...,DADOT,IAY/PA,Drop%,ADOT,DADOT.1,TD%,Int%,Rate,TO,CAY/PA
0,2024-01-07,MIA,50.0,MIA,2023,496.0,391.0,69.4,7.84,6.48,...,8.2,7.6,4.4,7.6,8.2,5.3,2.7,101.2,25.0,4.4
1,2024-01-07,ARI,47.5,ARI,2023,330.0,455.0,64.0,5.34,5.16,...,8.2,7.6,3.5,7.1,8.2,3.2,2.2,82.9,18.0,3.2
2,2024-01-07,CAR,37.0,CAR,2023,236.0,416.0,59.7,4.35,4.11,...,7.3,7.6,5.2,6.9,7.3,2.2,1.7,75.2,20.0,3.1
3,2024-01-07,CIN,40.0,CIN,2023,366.0,384.0,68.3,6.19,5.17,...,9.4,6.3,5.0,6.0,9.4,4.4,2.3,93.0,16.0,3.1
4,2024-01-07,DET,44.0,DET,2023,461.0,395.0,67.3,7.36,5.9,...,9.2,6.7,6.1,6.7,9.2,5.0,2.0,98.1,23.0,4.0


In [66]:
away_df['Season'] = away_df['Season'].astype('string')

In [67]:
away_df['Season'] = away_df['Season'].astype('object')

In [68]:
away_df_stats = pd.merge(away_df, new_stats, how = 'left', on = ['Team', 'Season'])

In [69]:
away_df_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4655 entries, 0 to 4654
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date              4655 non-null   object 
 1   Away Team         4655 non-null   object 
 2   Total Score Open  4655 non-null   float64
 3   Team              4655 non-null   object 
 4   Season            4655 non-null   object 
 5   Pts               1327 non-null   float64
 6   PtsO              1327 non-null   float64
 7   Cmp%              1327 non-null   float64
 8   AY/A              1327 non-null   float64
 9   Y/P               1327 non-null   float64
 10  OnTgt%            1327 non-null   float64
 11  DADOT             1327 non-null   float64
 12  IAY/PA            1327 non-null   float64
 13  Drop%             1327 non-null   float64
 14  ADOT              1327 non-null   float64
 15  DADOT.1           1327 non-null   float64
 16  TD%               1327 non-null   float64


In [70]:
away_df_stats.dropna(inplace=True)

In [71]:
away_df_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1327 entries, 0 to 1326
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date              1327 non-null   object 
 1   Away Team         1327 non-null   object 
 2   Total Score Open  1327 non-null   float64
 3   Team              1327 non-null   object 
 4   Season            1327 non-null   object 
 5   Pts               1327 non-null   float64
 6   PtsO              1327 non-null   float64
 7   Cmp%              1327 non-null   float64
 8   AY/A              1327 non-null   float64
 9   Y/P               1327 non-null   float64
 10  OnTgt%            1327 non-null   float64
 11  DADOT             1327 non-null   float64
 12  IAY/PA            1327 non-null   float64
 13  Drop%             1327 non-null   float64
 14  ADOT              1327 non-null   float64
 15  DADOT.1           1327 non-null   float64
 16  TD%               1327 non-null   float64


In [72]:
full_home = pd.concat([home_w1_for_test, home_df_stats], axis = 0)

In [73]:
full_home.head()

Unnamed: 0,Home Team,Total Score Open,Date,Season,Team,Pts,PtsO,Cmp%,AY/A,Y/P,...,DADOT,IAY/PA,Drop%,ADOT,DADOT.1,TD%,Int%,Rate,TO,CAY/PA
0,KC,46.5,2024-09-05 00:00:00,2023,KC,371.0,294.0,66.3,6.27,5.53,...,7.5,6.4,7.3,6.2,7.5,4.4,2.7,89.6,28.0,2.7
1,PHI,48.5,2024-09-06 00:00:00,2023,PHI,433.0,428.0,65.5,6.38,5.42,...,8.2,8.6,3.6,7.9,8.2,4.3,2.8,89.2,28.0,4.1
2,IND,49.0,2024-09-08 00:00:00,2023,IND,396.0,415.0,61.8,6.23,5.23,...,7.7,7.4,5.9,7.1,7.7,3.1,1.7,85.0,22.0,3.5
3,NO,41.5,2024-09-08 00:00:00,2023,NO,402.0,327.0,67.0,6.69,5.11,...,8.3,8.3,4.0,8.0,8.3,4.6,1.8,94.8,18.0,4.0
4,BUF,48.0,2024-09-08 00:00:00,2023,BUF,451.0,311.0,66.5,6.78,5.71,...,6.9,8.7,5.6,8.1,6.9,5.0,3.1,92.2,28.0,4.0


In [74]:
full_away = pd.concat([away_w1_for_test, away_df_stats], axis = 0)

In [75]:
full_away.head()

Unnamed: 0,Away Team,Total Score Open,Date,Season,Team,Pts,PtsO,Cmp%,AY/A,Y/P,...,DADOT,IAY/PA,Drop%,ADOT,DADOT.1,TD%,Int%,Rate,TO,CAY/PA
0,BAL,46.5,2024-09-05 00:00:00,2023,BAL,483.0,280.0,66.4,7.81,5.85,...,7.6,8.3,5.5,8.2,7.6,5.5,1.4,102.5,19.0,4.2
1,GB,48.5,2024-09-06 00:00:00,2023,GB,383.0,350.0,64.4,7.08,5.58,...,7.5,8.5,5.2,8.3,7.5,5.5,1.9,96.4,18.0,3.9
2,HOU,49.0,2024-09-08 00:00:00,2023,HOU,377.0,353.0,62.8,7.35,5.37,...,8.4,8.8,4.2,8.5,8.4,4.6,1.4,96.2,14.0,4.4
3,CAR,41.5,2024-09-08 00:00:00,2023,CAR,236.0,416.0,59.7,4.35,4.11,...,7.3,7.6,5.2,6.9,7.3,2.2,1.7,75.2,20.0,3.1
4,ARI,48.0,2024-09-08 00:00:00,2023,ARI,330.0,455.0,64.0,5.34,5.16,...,8.2,7.6,3.5,7.1,8.2,3.2,2.2,82.9,18.0,3.2


In [76]:
home_rename = {'Pts' : 'pts_x', 'PtsO' : 'pts_allowed_x', 'Cmp%' : 'cmp_pct_x', 'AY/A' : 'avgyd_att_x', 
               'Y/P' : 'yds_play_x', 'OnTgt%' : 'on_tgt_pct_x', 'DADOT' : 'dadot_x', 'IAY/PA' : 'intd_air_att_x', 'Drop%' : 'drop_pct_x',
               'ADOT' : 'adot_x', 'TD%' : 'td_pct_x', 'Int%' : 'int_pct_x', 'Rate' : 'rate_x', 'TO' : 'to_x', 'CAY/PA' : 'cmp_air_att_x'}

In [77]:
more_drops = ['DADOT.1']

In [78]:
away_rename =  {'Pts' : 'pts_y', 'PtsO' : 'pts_allowed_y', 'Cmp%' : 'cmp_pct_y', 'AY/A' : 'avgyd_att_y', 
               'Y/P' : 'yds_play_y', 'OnTgt%' : 'on_tgt_pct_y', 'DADOT' : 'dadot_y', 'IAY/PA' : 'intd_air_att_y', 'Drop%' : 'drop_pct_y',
               'ADOT' : 'adot_y', 'TD%' : 'td_pct_y', 'Int%' : 'int_pct_y', 'Rate' : 'rate_y', 'TO' : 'to_y', 'CAY/PA' : 'cmp_air_att_y'}

In [79]:
full_home.rename(columns = home_rename, inplace = True)

In [80]:
full_away.rename(columns = away_rename, inplace = True)

In [81]:
full_home.drop(columns = more_drops, inplace = True)

In [82]:
full_away.drop(columns = more_drops, inplace = True)

In [83]:
full_df = pd.concat([full_home, full_away], join = 'outer', axis = 1).reset_index(drop = True)

In [84]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1343 entries, 0 to 1342
Data columns (total 40 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Home Team         1343 non-null   object 
 1   Total Score Open  1343 non-null   float64
 2   Date              1343 non-null   object 
 3   Season            1343 non-null   object 
 4   Team              1343 non-null   object 
 5   pts_x             1343 non-null   float64
 6   pts_allowed_x     1343 non-null   float64
 7   cmp_pct_x         1343 non-null   float64
 8   avgyd_att_x       1343 non-null   float64
 9   yds_play_x        1343 non-null   float64
 10  on_tgt_pct_x      1343 non-null   float64
 11  dadot_x           1343 non-null   float64
 12  intd_air_att_x    1343 non-null   float64
 13  drop_pct_x        1343 non-null   float64
 14  adot_x            1343 non-null   float64
 15  td_pct_x          1343 non-null   float64
 16  int_pct_x         1343 non-null   float64


In [85]:
full_df = full_df.loc[:,~full_df.columns.duplicated()]

In [86]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1343 entries, 0 to 1342
Data columns (total 36 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Home Team         1343 non-null   object 
 1   Total Score Open  1343 non-null   float64
 2   Date              1343 non-null   object 
 3   Season            1343 non-null   object 
 4   Team              1343 non-null   object 
 5   pts_x             1343 non-null   float64
 6   pts_allowed_x     1343 non-null   float64
 7   cmp_pct_x         1343 non-null   float64
 8   avgyd_att_x       1343 non-null   float64
 9   yds_play_x        1343 non-null   float64
 10  on_tgt_pct_x      1343 non-null   float64
 11  dadot_x           1343 non-null   float64
 12  intd_air_att_x    1343 non-null   float64
 13  drop_pct_x        1343 non-null   float64
 14  adot_x            1343 non-null   float64
 15  td_pct_x          1343 non-null   float64
 16  int_pct_x         1343 non-null   float64


In [87]:
full_df.head()

Unnamed: 0,Home Team,Total Score Open,Date,Season,Team,pts_x,pts_allowed_x,cmp_pct_x,avgyd_att_x,yds_play_x,...,on_tgt_pct_y,dadot_y,intd_air_att_y,drop_pct_y,adot_y,td_pct_y,int_pct_y,rate_y,to_y,cmp_air_att_y
0,KC,46.5,2024-09-05 00:00:00,2023,KC,371.0,294.0,66.3,6.27,5.53,...,72.9,7.6,8.3,5.5,8.2,5.5,1.4,102.5,19.0,4.2
1,PHI,48.5,2024-09-06 00:00:00,2023,PHI,433.0,428.0,65.5,6.38,5.42,...,76.6,7.5,8.5,5.2,8.3,5.5,1.9,96.4,18.0,3.9
2,IND,49.0,2024-09-08 00:00:00,2023,IND,396.0,415.0,61.8,6.23,5.23,...,73.9,8.4,8.8,4.2,8.5,4.6,1.4,96.2,14.0,4.4
3,NO,41.5,2024-09-08 00:00:00,2023,NO,402.0,327.0,67.0,6.69,5.11,...,72.9,7.3,7.6,5.2,6.9,2.2,1.7,75.2,20.0,3.1
4,BUF,48.0,2024-09-08 00:00:00,2023,BUF,451.0,311.0,66.5,6.78,5.71,...,70.9,8.2,7.6,3.5,7.1,3.2,2.2,82.9,18.0,3.2


In [99]:

train = full_df.loc[full_df['Season'] != '2023'].copy()
test = full_df.loc[full_df['Season'] == '2023'].copy()
X_train = train.drop('Total Score Open', axis = 1)
y_train= train['Total Score Open'].copy()
X_test = test.drop('Total Score Open', axis = 1)
y_test = test['Total Score Open'].copy()

In [100]:
rf = RandomForestRegressor(n_estimators = 50, min_samples_split =10, random_state = 151)

In [101]:
predictors_1 = ['pts_x', 'pts_allowed_x', 'cmp_pct_x', 'avgyd_att_x', 'yds_play_x', 'on_tgt_pct_x', 'dadot_x', 'intd_air_att_x',
                'drop_pct_x', 'adot_x', 'td_pct_x',
                'int_pct_x', 'rate_x', 'to_x', 'cmp_air_att_x',  'pts_y', 'pts_allowed_y',
                'cmp_pct_y',  'avgyd_att_y',  'yds_play_y', 'on_tgt_pct_y', 'dadot_y', 'intd_air_att_y', 'drop_pct_y', 'adot_y', 
                'td_pct_y',  'int_pct_y' ,'rate_y', 'to_y']

In [102]:
rf.fit(X_train[predictors_1], y_train)

In [103]:
preds1 = rf.predict(X_test[predictors_1])

In [104]:
from sklearn import metrics


def metrics_test(test, preds):
    print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test, preds))
    print('Mean Squared Error (MSE):', metrics.mean_squared_error(test, preds))
    print('Root Mean Squared Error (RMSE):', metrics.mean_squared_error(test, preds, squared=False))
    print('Mean Absolute Percentage Error (MAPE):', metrics.mean_absolute_percentage_error(test, preds))
    print('Explained Variance Score:', metrics.explained_variance_score(test, preds))
    print('Max Error:', metrics.max_error(test, preds))
    print('Mean Squared Log Error:', metrics.mean_squared_log_error(test, preds))
    print('Median Absolute Error:', metrics.median_absolute_error(test, preds))
    print('R^2:', metrics.r2_score(test, preds))
    print('Mean Poisson Deviance:', metrics.mean_poisson_deviance(test, preds))
    print('Mean Gamma Deviance:', metrics.mean_gamma_deviance(test, preds))

In [105]:
metrics_test(y_test, preds1)

Mean Absolute Error (MAE): 3.1068601956663207
Mean Squared Error (MSE): 14.733337414091729
Root Mean Squared Error (RMSE): 3.83840297703247
Mean Absolute Percentage Error (MAPE): 0.07414282213144305
Explained Variance Score: 0.20389548225375342
Max Error: 10.86174418065594
Mean Squared Log Error: 0.007520299768805535
Median Absolute Error: 2.694437342134126
R^2: -0.06059406719042837
Mean Poisson Deviance: 0.33453971888577866
Mean Gamma Deviance: 0.007626213860533793


In [106]:
results_df = X_test[['Home Team', 'Away Team']]

In [109]:
results_df['Total Score Open'] = y_test.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['Total Score Open'] = y_test.copy()


In [110]:
results_df['pred_score_1'] = preds1.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['pred_score_1'] = preds1.copy()


In [111]:
compare_odds_df = results_df[['Total Score Open', 'pred_score_1', 'Home Team', 'Away Team']].copy()

In [116]:
compare_odds_df.head(16)

Unnamed: 0,Total Score Open,pred_score_1,Home Team,Away Team
0,46.5,46.944407,KC,BAL
1,48.5,50.068908,PHI,GB
2,49.0,48.593681,IND,HOU
3,41.5,44.630631,NO,CAR
4,48.0,45.554117,BUF,ARI
5,41.5,43.046444,NYG,MIN
6,40.5,41.984348,CIN,NE
7,42.0,43.815106,ATL,PIT
8,49.0,48.671738,MIA,JAX
9,44.5,42.637633,CHI,TEN


In [115]:
compare_odds_df.iloc[16:33]

Unnamed: 0,Total Score Open,pred_score_1,Home Team,Away Team
16,50.0,49.833129,MIA,BUF
17,47.5,44.757838,ARI,SEA
18,37.0,42.201495,CAR,TB
19,40.0,46.039866,CIN,CLE
20,44.0,48.189359,DET,MIN
21,44.0,45.135261,GB,CHI
22,46.5,48.593681,IND,HOU
23,35.5,44.799136,LAC,KC
24,38.0,43.939772,LV,DEN
25,37.5,41.111195,NE,NYJ
