## Weather Data on NFL games for seasons 2009-2013
#### Matthew Johnson, August 9, 2018 (Last updated: August 13, 2018)

1. NFL data
2. Vegas odds
3. **Weather data**
<br><br>

Weather data is from http://nflsavant.com/about.php and originally contained weather for games between 1960-2013.

In [365]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

weather = pd.read_csv('/Users/mattjohnson/Desktop/Python2018/NFL/weather_20131231.csv')

We have weather for 1,119 games between 2009-2013:

In [421]:
weather[ weather['Season']>= 2009].shape

(1119, 12)

In [366]:
abbrev_dict = {"Arizona Cardinals": "ARI", "San Diego Chargers": "SD", "Minnesota Vikings": "MIN",
 "Miami Dolphins": "MIA", "Tennessee Titans": "TEN", "Carolina Panthers": "CAR",
 "New York Giants": "NYG", "Pittsburgh Steelers": "PIT", "Houston Texans": "HOU",
 "Seattle Seahawks": "SEA", "Cleveland Browns": "CLE", "Atlanta Falcons": "ATL",
 "Baltimore Ravens": "BAL", "Green Bay Packers": "GB", "Indianapolis Colts": "IND",
 "New Orleans Saints": "NO", "Cincinnati Bengals": "CIN", "Tampa Bay Buccaneers": "TB",
 "Oakland Raiders": "OAK", "New England Patriots": "NE", "Dallas Cowboys": "DAL",
 "Chicago Bears": "CHI", "Kansas City Chiefs": "KC", "Washington Redskins": "WAS",
 "San Francisco 49ers": "SF", "Buffalo Bills": "BUF", "New York Jets": "NYJ",
 "Philadelphia Eagles": "PHI", "Denver Broncos": "DEN", "Jacksonville Jaguars": "JAX",
 "Detroit Lions": "DET", "St. Louis Rams": "STL"}

In [477]:
home_columns = ['home_team', 'away_team','home_score', 'away_score', 'temperature',
       'wind_chill', 'humidity', 'wind_mph', 'date', 'Season', 'home_team']
away_columns = ['away_team', 'home_team', 'away_score', 'home_score', 'temperature',
       'wind_chill', 'humidity', 'wind_mph', 'date', 'Season', 'home_team']
all_columns = ['offenseTeam', 'defenseTeam','off_score', 'def_score', 'temperature',
       'wind_chill', 'humidity', 'wind_mph', 'date', 'Season', 'home_team']
    
def clean_weather_data(df):
    df['Season'] = pd.to_numeric(df['date'].str[-4:])
    df = df[ df['Season'] >= 2009]
    df['date'] = df['date'].str[:-5]
    df['wind_chill'] = df['wind_chill'].fillna(df['temperature'])
    df['humidity'] = pd.to_numeric(df['humidity'].str[:-1])
    df['humidity'] = df['humidity'].fillna(df['humidity'].mean())
    df['humidity'] = df['humidity'].round(0)
    df['wind_mph'] = pd.to_numeric(df['wind_mph']).fillna(0.0)
    df.drop(['id','weather'], axis=1, inplace=True)
    df['home_team'] = df['home_team'].map(abbrev_dict)
    df['away_team'] = df['away_team'].map(abbrev_dict)
    df.reset_index(drop=True, inplace=True)
    
    homedf, awaydf = df[home_columns], df[away_columns]
    df_list = [homedf, awaydf]
    for df in df_list:
        df.columns = all_columns 
        
    all_data = pd.concat([homedf.reset_index(drop=True), awaydf.reset_index(drop=True)])
            
    df = add_features(all_data)

    return df

def add_features(all_data):

    tempdf = all_data['date'].str.split('/').apply(pd.Series)
    
    for col in tempdf.columns:
        tempdf[col] =  tempdf[col].astype(str).apply(lambda x: x.zfill(2))
    
    all_data['date'] = tempdf[0] + '/' + tempdf[1]
    
    all_data['Year'] = np.where(tempdf[0].astype('int64') < 4, all_data['Season']+1, all_data['Season'])

    all_data['Season'] = all_data['Season'].astype(str)
    all_data['Year'] = all_data['Year'].astype(str)

    all_data['Date'] = all_data['date'] + '/'+ all_data['Year']

    all_data['mergeCode'] = all_data['Season'] + '-' + all_data['date'] + '-' + all_data['offenseTeam']
    
    all_data = sort_by_week(all_data)
    
    all_data['mergeCode2'] = all_data['Season']+'-'+ all_data['Week'].astype(str)+'-' + all_data['offenseTeam']
    
    return all_data

def create_mask(df, start_date, end_date):
    if end_date == 0:
        end_date = start_date + pd.to_timedelta(7,unit='d')
    mask = (df['Date'] > start_date) & (df['Date'] <= end_date)
    return df.loc[mask]

def sort_by_week(data):
    
    week_list = []
    data['Week'] = 0

    for year in range(2009,2014):
        year_df = data[ data['Season']==str(year)]
        # sort by week
        year_df['Date'] = pd.to_datetime(year_df['Date'])
        mondays = list(year_df.resample('W-Mon', on='Date').sum().reset_index().sort_values(by='Date')['Date'].values)
        mondays.append(0)

        for i in range(0,min(17,len(mondays)-1)):
            
            current_week = create_mask(year_df, mondays[i], mondays[i+1])
            current_week['Week'] = (i+1) 
            week_list.append(current_week)
    week_data = pd.concat(week_list)
    
    return week_data

## Start Here

In [488]:
all_data = clean_weather_data(weather)

home: 1119 away: 1119
all: 2238
all_data mid add_feats: 2238
all_data after add_feats: 2058


In [644]:
# all_data.to_csv('weather_data09to13.csv')

There are 1428/2 = 714 games that are within the weeks 6 through 17:

In [653]:
weather_weeks6to17 = all_data[ (6 <= all_data['Week']) &  (all_data['Week']<= 17)]
weather_weeks6to17.shape

(1428, 16)