### NFL Predictive Model

In [113]:
# %matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import seaborn as sns

import statsmodels as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline 
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.metrics import precision_recall_curve, confusion_matrix, roc_auc_score

#### Import NFL box scores data
- 57 columns in data set
- spans 17 seasons: 2001-2017
- 2017 looks incomplete, need to acquire it from another source
- source: [Kaggle: NFL Box Scores](https://www.kaggle.com/grayengineering425/nfl-box-scores)

In [114]:
#load the raw box scores CSV; the trailing expression displays (rows, columns)
box_scores_path = 'data/nfl_box_scores.csv'
df_box = pd.read_csv(box_scores_path)
df_box.shape

(4328, 57)

In [115]:
#summarize the raw box score columns: dtypes and non-null counts
df_box.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4328 entries, 0 to 4327
Data columns (total 57 columns):
date                           4328 non-null object
visitor                        4328 non-null object
home                           4328 non-null object
visitor_score                  4328 non-null int64
home_score                     4328 non-null int64
visitor_first_downs            4328 non-null int64
visitor_rushing_first_downs    4328 non-null int64
visitor_passing_first_downs    4328 non-null int64
visitor_penalties              4328 non-null int64
visitor_net_yards              4328 non-null int64
visitor_net_yards_rushing      4328 non-null int64
visitor_rushing_plays          4328 non-null int64
visitor_avg_rush               4328 non-null float64
visitor_net_yards_passing      4328 non-null int64
visitor_passing_splits         4328 non-null object
visitor_sack_splits            4328 non-null object
visitor_gross_passing          4328 non-null int64
visitor_yards_per_p

In [116]:
#preview the first rows of the raw box score data
df_box.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_rushing_first_downs,visitor_passing_first_downs,visitor_penalties,visitor_net_yards,...,home_kick_return_splits,home_int_return_splits,home_penalty_splits,home_fumble_splits,home_field_goals,home_third_down_splits,home_fourth_down_splits,home_total_plays,home_avg_gain,home_time_of_possession
0,"September 7, 2014",Cleveland,Pittsburgh,27,30,23,9,11,3,389,...,2-29,0-0,11-96,1-0,3-3,4-12-33%,1-1-100%,67,7.5,32:27
1,"September 7, 2014",Jacksonville,Philadelphia,17,34,18,2,14,2,306,...,1-24,0-0,6-50,3-2,2-2,8-19-42%,1-1-100%,82,5.1,30:46
2,"September 4, 2014",Green Bay,Seattle,16,36,19,4,13,2,255,...,3-60,1-21,4-69,2-1,2-2,4-11-36%,1-1-100%,66,6.0,33:20
3,"September 7, 2014",Minnesota,St. Louis,34,6,18,6,10,2,355,...,1-26,0-0,13-121,4-0,2-3,4-14-28%,0-0-0%,63,5.0,31:43
4,"September 7, 2014",Cincinnati,Baltimore,23,16,16,4,11,1,380,...,4-109,0-0,3-29,2-1,1-2,8-17-47%,1-2-50%,85,5.0,29:30


#### Clean box score data
- convert date to datetime
- sort by date
- add a column identifying the season according to the year it begins
- example: the 2016 season can run from September 2016 to February 2017
- drop the 2017 season, it's incomplete
- add columns for home and away team IDs
- add a column identifying the game by its game_id
- extract the numbers from the "splits" columns

In [117]:
#parse the date strings (e.g. "September 7, 2014") into datetime values
df_box['date'] = pd.to_datetime(df_box.date)

In [118]:
#order the games chronologically (oldest first), then preview the earliest rows
df_box = df_box.sort_values('date')
df_box.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_rushing_first_downs,visitor_passing_first_downs,visitor_penalties,visitor_net_yards,...,home_kick_return_splits,home_int_return_splits,home_penalty_splits,home_fumble_splits,home_field_goals,home_third_down_splits,home_fourth_down_splits,home_total_plays,home_avg_gain,home_time_of_possession
1429,2001-09-09,New Orleans,Buffalo,24,6,17,7,9,1,301,...,5-77,0-0,4-35,0-0,2-2,5-15-33%,0-3-0%,61,4.1,32:27
1200,2001-09-09,Seattle,Cleveland,9,6,16,7,7,2,251,...,3-28,2-9,4-43,0-0,2-3,8-17-47%,0-0-0%,61,3.9,29:52
1430,2001-09-09,Chicago,Baltimore,6,17,16,4,10,2,189,...,3-70,2-36,6-55,3-2,1-2,4-10-40%,0-1-0%,60,5.3,25:18
1431,2001-09-09,Miami,Tennessee,31,23,11,4,7,0,307,...,3-59,0-0,12-77,0-0,1-1,4-14-28%,0-0-0%,67,4.7,28:53
1434,2001-09-09,St. Louis,Philadelphia,20,17,22,4,18,0,364,...,4-114,2-0,2-20,2-2,1-1,5-15-33%,3-3-100%,71,4.7,32:44


In [119]:
#inspect the most recent games: the 2017 season is incomplete
#--> the last rows shown are from early October 2017
df_box.tail()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_rushing_first_downs,visitor_passing_first_downs,visitor_penalties,visitor_net_yards,...,home_kick_return_splits,home_int_return_splits,home_penalty_splits,home_fumble_splits,home_field_goals,home_third_down_splits,home_fourth_down_splits,home_total_plays,home_avg_gain,home_time_of_possession
875,2017-10-01,LA Rams,Dallas,35,30,24,9,14,1,412,...,3-78,0-0,4-45,1-1,1-1,7-14-50%,1-2-50%,65,6.8,29:28
876,2017-10-01,Cincinnati,Cleveland,31,7,25,8,15,2,350,...,5-116,0-0,4-28,1-0,0-1,5-16-31%,1-2-50%,61,3.5,24:58
873,2017-10-01,Carolina,New England,33,30,28,7,18,3,444,...,1-23,1--2,7-55,0-0,3-3,7-14-50%,2-2-100%,67,5.6,31:07
885,2017-10-02,Washington,Kansas City,20,29,15,4,10,1,331,...,2-43,0-0,3-15,2-0,3-4,8-13-61%,0-0-0%,72,6.0,37:09
884,2017-10-05,New England,Tampa Bay,19,14,23,5,15,3,402,...,1-16,1-2,9-70,2-0,0-3,4-13-30%,0-0-0%,68,6.0,29:10


In [120]:
#label each game with its season, i.e. the calendar year in which that season began
#--> games played in January/February belong to the season that started the previous year
game_dates = df_box['date'].dt
played_after_february = game_dates.month > 2
df_box['season'] = np.where(played_after_february, game_dates.year, game_dates.year - 1)

In [121]:
#drop the 2017 season data (keep seasons up to and including 2016)
#--> .copy() makes the filtered frame independent of the original, so the columns
#    added to df_box in later cells are written to a real frame rather than to a
#    slice (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity)
df_box = df_box[df_box.season <= 2016].copy()
df_box.tail()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_rushing_first_downs,visitor_passing_first_downs,visitor_penalties,visitor_net_yards,...,home_int_return_splits,home_penalty_splits,home_fumble_splits,home_field_goals,home_third_down_splits,home_fourth_down_splits,home_total_plays,home_avg_gain,home_time_of_possession,season
3691,2017-01-15,Green Bay,Dallas,34,31,27,6,18,3,413,...,1-27,6-50,0-0,3-3,6-11-54%,0-0-0%,64,6.7,30:14,2016
3694,2017-01-15,Pittsburgh,Kansas City,18,16,20,10,10,0,389,...,1-0,6-60,1-1,1-1,2-9-22%,2-2-100%,49,4.6,25:47,2016
3695,2017-01-22,Green Bay,Atlanta,21,44,24,5,18,1,367,...,1-0,4-31,2-0,1-1,10-13-76%,0-0-0%,68,7.2,33:39,2016
3698,2017-01-22,Pittsburgh,New England,17,36,22,3,18,1,368,...,1-37,2-10,0-0,3-3,11-17-64%,0-0-0%,71,6.1,31:26,2016
3696,2017-02-05,New England,Atlanta,34,28,37,7,26,4,546,...,1-82,9-65,1-1,0-0,1-8-12%,0-0-0%,46,7.5,23:27,2016


In [122]:
#create dictionary to map short team names to team IDs
#--> use the team IDs from the NFL scrapeR data, after fixing JAC/JAX
#--> the box score data labels the Rams as 'LA Rams' (visible in the 2017 rows);
#    without that key, Series.map() returns NaN for those games and the derived
#    game_id becomes NaN as well, so the key is included alongside 'Los Angeles'
dict_team_ids1 = {'Arizona': 'ARI', 'Atlanta': 'ATL', 'Baltimore': 'BAL', 'Buffalo': 'BUF', 'Carolina': 'CAR',
                  'Chicago': 'CHI', 'Cincinnati': 'CIN', 'Cleveland': 'CLE', 'Dallas': 'DAL', 'Denver': 'DEN',
                  'Detroit': 'DET', 'Green Bay': 'GB', 'Houston': 'HOU', 'Indianapolis': 'IND',
                  'Jacksonville': 'JAX', 'Kansas City': 'KC', 'LA Rams': 'LA', 'Los Angeles': 'LA',
                  'Los Angeles Chargers': 'LAC',
                  'Miami': 'MIA', 'Minnesota': 'MIN', 'NY Giants': 'NYG', 'NY Jets': 'NYJ', 'New England': 'NE',
                  'New Orleans': 'NO', 'Oakland': 'OAK', 'Philadelphia': 'PHI', 'Pittsburgh': 'PIT',
                  'San Diego': 'SD', 'San Francisco': 'SF', 'Seattle': 'SEA', 'St. Louis': 'STL',
                  'Tampa Bay': 'TB', 'Tennessee': 'TEN', 'Washington': 'WAS'}

In [123]:
#translate the short team names into team IDs for both sides of each game
#--> creates home_team_id first, then visitor_team_id
for side in ('home', 'visitor'):
    df_box[side + '_team_id'] = df_box[side].map(dict_team_ids1)

In [124]:
#build a game_id for every game so it can be matched against other datasets
# --> layout: yyyymmdd{away team id}{home team id}
# --> note: NOT the same scheme as the game IDs in the nfl scrapeR dataset
game_day = df_box['date'].dt.strftime('%Y%m%d')
matchup = df_box['visitor_team_id'] + df_box['home_team_id']
df_box['game_id'] = game_day + matchup

In [146]:
#use game_id as the row index (the game_id column moves into the index)
df_box = df_box.set_index('game_id')

In [147]:
#preview the cleaned box scores, now indexed by game_id
df_box.head()

Unnamed: 0_level_0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_rushing_first_downs,visitor_passing_first_downs,visitor_penalties,visitor_net_yards,...,home_fumble_splits,home_field_goals,home_third_down_splits,home_fourth_down_splits,home_total_plays,home_avg_gain,home_time_of_possession,season,home_team_id,visitor_team_id
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20010909NOBUF,2001-09-09,New Orleans,Buffalo,24,6,17,7,9,1,301,...,0-0,2-2,5-15-33%,0-3-0%,61,4.1,32:27,2001,BUF,NO
20010909SEACLE,2001-09-09,Seattle,Cleveland,9,6,16,7,7,2,251,...,0-0,2-3,8-17-47%,0-0-0%,61,3.9,29:52,2001,CLE,SEA
20010909CHIBAL,2001-09-09,Chicago,Baltimore,6,17,16,4,10,2,189,...,3-2,1-2,4-10-40%,0-1-0%,60,5.3,25:18,2001,BAL,CHI
20010909MIATEN,2001-09-09,Miami,Tennessee,31,23,11,4,7,0,307,...,0-0,1-1,4-14-28%,0-0-0%,67,4.7,28:53,2001,TEN,MIA
20010909STLPHI,2001-09-09,St. Louis,Philadelphia,20,17,22,4,18,0,364,...,2-2,1-1,5-15-33%,3-3-100%,71,4.7,32:44,2001,PHI,STL


In [126]:
#re-check dtypes and non-null counts after the cleaning steps
df_box.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4264 entries, 1429 to 3696
Data columns (total 61 columns):
date                           4264 non-null datetime64[ns]
visitor                        4264 non-null object
home                           4264 non-null object
visitor_score                  4264 non-null int64
home_score                     4264 non-null int64
visitor_first_downs            4264 non-null int64
visitor_rushing_first_downs    4264 non-null int64
visitor_passing_first_downs    4264 non-null int64
visitor_penalties              4264 non-null int64
visitor_net_yards              4264 non-null int64
visitor_net_yards_rushing      4264 non-null int64
visitor_rushing_plays          4264 non-null int64
visitor_avg_rush               4264 non-null float64
visitor_net_yards_passing      4264 non-null int64
visitor_passing_splits         4264 non-null object
visitor_sack_splits            4264 non-null object
visitor_gross_passing          4264 non-null int64
visitor_

#### Import the dataset containing historical point spreads and over-under lines
- source: [Kaggle: NFL scores and betting data](https://www.kaggle.com/tobycrabtree/nfl-scores-and-betting-data)

In [127]:
#load the historical spreads / over-under CSV; the trailing expression displays (rows, columns)
lines_path = 'data/nfl_spreadspoke_scores.csv'
df_lines = pd.read_csv(lines_path)
df_lines.shape

(12400, 18)

In [128]:
#summarize the raw betting-lines columns: dtypes and non-null counts
df_lines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12400 entries, 0 to 12399
Data columns (total 18 columns):
schedule_date          12400 non-null object
schedule_season        12400 non-null int64
schedule_week          12400 non-null object
team_home              12400 non-null object
team_away              12400 non-null object
stadium                12400 non-null object
team_favorite_id       9665 non-null object
spread_favorite        9665 non-null float64
over_under_line        9655 non-null object
weather_detail         4425 non-null object
weather_temperature    11534 non-null float64
weather_wind_mph       11534 non-null float64
weather_humidity       8366 non-null object
score_home             12144 non-null float64
score_away             12144 non-null float64
stadium_neutral        12400 non-null bool
schedule_playoff       12400 non-null bool
game_id                11475 non-null object
dtypes: bool(2), float64(5), int64(1), object(10)
memory usage: 1.5+ MB


In [129]:
#preview the first rows of the raw betting-lines data
df_lines.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,team_away,stadium,team_favorite_id,spread_favorite,over_under_line,weather_detail,weather_temperature,weather_wind_mph,weather_humidity,score_home,score_away,stadium_neutral,schedule_playoff,game_id
0,09/02/1966,1966,1,Miami Dolphins,Oakland Raiders,Orange Bowl,,,,,83.0,6.0,71,14.0,23.0,False,False,19660902OAKMIA
1,09/03/1966,1966,1,Houston Oilers,Denver Broncos,Rice Stadium,,,,,81.0,7.0,70,45.0,7.0,False,False,19660903DENTEN
2,09/04/1966,1966,1,San Diego Chargers,Buffalo Bills,Balboa Stadium,,,,,70.0,7.0,82,27.0,7.0,False,False,19660904BUFLAC
3,09/09/1966,1966,2,Miami Dolphins,New York Jets,Orange Bowl,,,,,82.0,11.0,78,14.0,19.0,False,False,19660909NYJMIA
4,09/10/1966,1966,2,San Diego Chargers,New England Patriots,Balboa Stadium,,,,,69.0,9.0,81,24.0,0.0,False,False,19660910NELAC


#### Clean the point spreads data
- drop all seasons before 2001 to align with the box score data
- convert date to datetime, sort by date
- add columns for home and away team IDs
- clean up the game_id and favorite_team_id columns

In [130]:
#keep only the 2001-2017 seasons
#--> .copy() makes the filtered frame independent of the full dataset, so the
#    column assignments in the following cells modify a real frame rather than
#    a slice (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity)
df_lines = df_lines[(df_lines.schedule_season >= 2001) & (df_lines.schedule_season <= 2017)].copy()

In [131]:
#parse the schedule_date strings (e.g. 09/02/1966) into datetime values
df_lines['schedule_date'] = pd.to_datetime(df_lines.schedule_date)

In [132]:
#check dtypes and non-null counts after filtering seasons and parsing dates
df_lines.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4531 entries, 7613 to 12143
Data columns (total 18 columns):
schedule_date          4531 non-null datetime64[ns]
schedule_season        4531 non-null int64
schedule_week          4531 non-null object
team_home              4531 non-null object
team_away              4531 non-null object
stadium                4531 non-null object
team_favorite_id       4531 non-null object
spread_favorite        4531 non-null float64
over_under_line        4531 non-null object
weather_detail         1173 non-null object
weather_temperature    4395 non-null float64
weather_wind_mph       4395 non-null float64
weather_humidity       2430 non-null object
score_home             4531 non-null float64
score_away             4531 non-null float64
stadium_neutral        4531 non-null bool
schedule_playoff       4531 non-null bool
game_id                4088 non-null object
dtypes: bool(2), datetime64[ns](1), float64(5), int64(1), object(9)
memory usage: 610.6+ 

In [133]:
#order the games chronologically (oldest first), then preview the earliest rows
df_lines = df_lines.sort_values('schedule_date')
df_lines.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,team_away,stadium,team_favorite_id,spread_favorite,over_under_line,weather_detail,weather_temperature,weather_wind_mph,weather_humidity,score_home,score_away,stadium_neutral,schedule_playoff,game_id
7613,2001-09-09,2001,1,Kansas City Chiefs,Oakland Raiders,Arrowhead Stadium,OAK,-3.0,43.0,,64.0,10.0,78,24.0,27.0,False,False,20010909OAKKC
7626,2001-09-09,2001,1,Philadelphia Eagles,St. Louis Rams,Veterans Stadium,LAR,-3.0,46.0,,76.0,8.0,74,17.0,20.0,False,False,20010909LARPHI
7625,2001-09-09,2001,1,Dallas Cowboys,Tampa Bay Buccaneers,Texas Stadium,TB,-9.0,34.0,,75.0,13.0,78,6.0,10.0,False,False,20010909TBDAL
7624,2001-09-09,2001,1,Buffalo Bills,New Orleans Saints,Ralph Wilson Stadium,NO,-1.5,37.5,,80.0,12.0,58,6.0,24.0,False,False,20010909NOBUF
7622,2001-09-09,2001,1,Cincinnati Bengals,New England Patriots,Paul Brown Stadium,PICK,0.0,36.0,,75.0,11.0,84,23.0,17.0,False,False,20010909NECIN


In [134]:
#lookup table: full franchise name -> team ID
#--> IDs follow the NFL scrapeR data, after fixing JAC/JAX
#--> relocated franchises keep a separate ID per city (STL vs LA, SD vs LAC)
dict_team_ids2 = {
    'Arizona Cardinals': 'ARI',
    'Atlanta Falcons': 'ATL',
    'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF',
    'Carolina Panthers': 'CAR',
    'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN',
    'Cleveland Browns': 'CLE',
    'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN',
    'Detroit Lions': 'DET',
    'Green Bay Packers': 'GB',
    'Houston Texans': 'HOU',
    'Indianapolis Colts': 'IND',
    'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC',
    'Los Angeles Chargers': 'LAC',
    'Los Angeles Rams': 'LA',
    'Miami Dolphins': 'MIA',
    'Minnesota Vikings': 'MIN',
    'New England Patriots': 'NE',
    'New Orleans Saints': 'NO',
    'New York Giants': 'NYG',
    'New York Jets': 'NYJ',
    'Oakland Raiders': 'OAK',
    'Philadelphia Eagles': 'PHI',
    'Pittsburgh Steelers': 'PIT',
    'San Diego Chargers': 'SD',
    'San Francisco 49ers': 'SF',
    'Seattle Seahawks': 'SEA',
    'St. Louis Rams': 'STL',
    'Tampa Bay Buccaneers': 'TB',
    'Tennessee Titans': 'TEN',
    'Washington Redskins': 'WAS',
}

In [135]:
#translate the full team names into team IDs for both sides of each game
#--> creates home_team_id first, then away_team_id
for side in ('home', 'away'):
    df_lines[side + '_team_id'] = df_lines['team_' + side].map(dict_team_ids2)

In [136]:
#replace the dataset's own game_id values so they line up with the box score dataset
# --> layout: yyyymmdd{away team id}{home team id}
# --> note: NOT the same scheme as the game IDs in the nfl scrapeR dataset
game_day = df_lines['schedule_date'].dt.strftime('%Y%m%d')
matchup = df_lines['away_team_id'] + df_lines['home_team_id']
df_lines['game_id'] = game_day + matchup

In [137]:
#derive favorite_team_id in the same ID scheme as home_team_id / away_team_id
#--> dict3 maps full team names to the codes used by the existing team_favorite_id
#    column (it differs from the team ID map only for the Rams -> LAR and
#    San Diego -> LAC entries)
dict3 = {
    'Arizona Cardinals': 'ARI', 'Atlanta Falcons': 'ATL', 'Baltimore Ravens': 'BAL',
    'Buffalo Bills': 'BUF', 'Carolina Panthers': 'CAR', 'Chicago Bears': 'CHI',
    'Cincinnati Bengals': 'CIN', 'Cleveland Browns': 'CLE', 'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN', 'Detroit Lions': 'DET', 'Green Bay Packers': 'GB',
    'Houston Texans': 'HOU', 'Indianapolis Colts': 'IND', 'Jacksonville Jaguars': 'JAX',
    'Kansas City Chiefs': 'KC', 'Los Angeles Chargers': 'LAC', 'Los Angeles Rams': 'LAR',
    'Miami Dolphins': 'MIA', 'Minnesota Vikings': 'MIN', 'New England Patriots': 'NE',
    'New Orleans Saints': 'NO', 'New York Giants': 'NYG', 'New York Jets': 'NYJ',
    'Oakland Raiders': 'OAK', 'Philadelphia Eagles': 'PHI', 'Pittsburgh Steelers': 'PIT',
    'San Diego Chargers': 'LAC', 'San Francisco 49ers': 'SF', 'Seattle Seahawks': 'SEA',
    'St. Louis Rams': 'LAR', 'Tampa Bay Buccaneers': 'TB', 'Tennessee Titans': 'TEN',
    'Washington Redskins': 'WAS',
}

#the home team is the favorite when its legacy code equals team_favorite_id;
#every other row is attributed to the away team
home_is_favorite = df_lines['team_home'].map(dict3) == df_lines['team_favorite_id']
favorite = np.where(home_is_favorite, df_lines['home_team_id'], df_lines['away_team_id'])

#a spread of exactly 0 has no favorite: label it 'PICK'
is_pick = df_lines['spread_favorite'] == 0.0
df_lines['favorite_team_id'] = np.where(is_pick, 'PICK', favorite)

#sanity check: sorted distinct home team IDs and their count
h_ids = df_lines['home_team_id'].unique()
h_ids.sort()
print(h_ids, len(h_ids), sep='\n')

#sanity check: sorted distinct favorite IDs and their count
f_ids = df_lines['favorite_team_id'].unique()
f_ids.sort()
print(f_ids, len(f_ids), sep='\n')

['ARI' 'ATL' 'BAL' 'BUF' 'CAR' 'CHI' 'CIN' 'CLE' 'DAL' 'DEN' 'DET' 'GB'
 'HOU' 'IND' 'JAX' 'KC' 'LA' 'LAC' 'MIA' 'MIN' 'NE' 'NO' 'NYG' 'NYJ' 'OAK'
 'PHI' 'PIT' 'SD' 'SEA' 'SF' 'STL' 'TB' 'TEN' 'WAS']
34
['ARI' 'ATL' 'BAL' 'BUF' 'CAR' 'CHI' 'CIN' 'CLE' 'DAL' 'DEN' 'DET' 'GB'
 'HOU' 'IND' 'JAX' 'KC' 'LA' 'LAC' 'MIA' 'MIN' 'NE' 'NO' 'NYG' 'NYJ' 'OAK'
 'PHI' 'PICK' 'PIT' 'SD' 'SEA' 'SF' 'STL' 'TB' 'TEN' 'WAS']
35


In [138]:
#remove the original team_favorite_id column
df_lines = df_lines.drop('team_favorite_id', axis='columns')

In [148]:
#use game_id as the row index (the game_id column moves into the index)
df_lines = df_lines.set_index('game_id')

In [149]:
#preview the cleaned betting lines, now indexed by game_id
df_lines.head()

Unnamed: 0_level_0,schedule_date,schedule_season,schedule_week,team_home,team_away,stadium,spread_favorite,over_under_line,weather_detail,weather_temperature,weather_wind_mph,weather_humidity,score_home,score_away,stadium_neutral,schedule_playoff,home_team_id,away_team_id,favorite_team_id
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
20010909OAKKC,2001-09-09,2001,1,Kansas City Chiefs,Oakland Raiders,Arrowhead Stadium,-3.0,43.0,,64.0,10.0,78,24.0,27.0,False,False,KC,OAK,OAK
20010909STLPHI,2001-09-09,2001,1,Philadelphia Eagles,St. Louis Rams,Veterans Stadium,-3.0,46.0,,76.0,8.0,74,17.0,20.0,False,False,PHI,STL,STL
20010909TBDAL,2001-09-09,2001,1,Dallas Cowboys,Tampa Bay Buccaneers,Texas Stadium,-9.0,34.0,,75.0,13.0,78,6.0,10.0,False,False,DAL,TB,TB
20010909NOBUF,2001-09-09,2001,1,Buffalo Bills,New Orleans Saints,Ralph Wilson Stadium,-1.5,37.5,,80.0,12.0,58,6.0,24.0,False,False,BUF,NO,NO
20010909NECIN,2001-09-09,2001,1,Cincinnati Bengals,New England Patriots,Paul Brown Stadium,0.0,36.0,,75.0,11.0,84,23.0,17.0,False,False,CIN,NE,PICK


In [150]:
#inspect the most recent rows of the cleaned betting lines
df_lines.tail()

Unnamed: 0_level_0,schedule_date,schedule_season,schedule_week,team_home,team_away,stadium,spread_favorite,over_under_line,weather_detail,weather_temperature,weather_wind_mph,weather_humidity,score_home,score_away,stadium_neutral,schedule_playoff,home_team_id,away_team_id,favorite_team_id
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
20180114JAXPIT,2018-01-14,2017,Division,Pittsburgh Steelers,Jacksonville Jaguars,Heinz Field,-7.0,40.5,,18.0,2.0,,42.0,45.0,False,True,PIT,JAX,PIT
20180114NOMIN,2018-01-14,2017,Division,Minnesota Vikings,New Orleans Saints,U.S. Bank Stadium,-5.5,46.5,DOME,72.0,0.0,,29.0,24.0,False,True,MIN,NO,MIN
20180121MINPHI,2018-01-21,2017,Conference,Philadelphia Eagles,Minnesota Vikings,Lincoln Financial Field,-3.0,39.0,,46.0,1.0,,38.0,7.0,False,True,PHI,MIN,MIN
20180121JAXNE,2018-01-21,2017,Conference,New England Patriots,Jacksonville Jaguars,Gillette Stadium,-7.5,46.0,,48.0,2.0,,24.0,20.0,False,True,NE,JAX,NE
20180204PHINE,2018-02-04,2017,Superbowl,New England Patriots,Philadelphia Eagles,U.S. Bank Stadium,-4.5,48.5,DOME,72.0,0.0,,33.0,41.0,True,True,NE,PHI,NE


In [151]:
#re-check dtypes and non-null counts after the cleaning steps
df_lines.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4531 entries, 20010909OAKKC to 20180204PHINE
Data columns (total 19 columns):
schedule_date          4531 non-null datetime64[ns]
schedule_season        4531 non-null int64
schedule_week          4531 non-null object
team_home              4531 non-null object
team_away              4531 non-null object
stadium                4531 non-null object
spread_favorite        4531 non-null float64
over_under_line        4531 non-null object
weather_detail         1173 non-null object
weather_temperature    4395 non-null float64
weather_wind_mph       4395 non-null float64
weather_humidity       2430 non-null object
score_home             4531 non-null float64
score_away             4531 non-null float64
stadium_neutral        4531 non-null bool
schedule_playoff       4531 non-null bool
home_team_id           4531 non-null object
away_team_id           4531 non-null object
favorite_team_id       4531 non-null object
dtypes: bool(2), datetime64[ns](1

#### TODO:
- join the box score and point spreads data sets by the game ID index column
- extract the numbers from the "splits" columns in box score data
- recreate the 2017 box score season data from another data source (nfl scrapeR dataset on Kaggle)
- independently verify point spread and over-under data by scraping pro football reference site
- get ANY/A stat from pro football reference, if it's available weekly

In [None]:
#import NFL scrapeR data (play by play)
#df_plays = pd.read_csv('data/nfl_play_by_play_2009-2017.csv')
#df_plays.shape

#the NFL scrapeR data seems to have 2 team ID's for Jacksonville (JAC, JAX)
#team_names3 = df_plays.HomeTeam.unique()
#team_names3.sort()