<a href="https://colab.research.google.com/github/Nolanole/NFL-Weather-Project/blob/master/Notebook_3_Finish_Cleaning_and_Merge_Games_DFs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
# Raise pandas' display limits so wide/long frames are shown in full
# when inspected in the notebook.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [0]:
# Step 1: read the all_games CSV from the project's GitHub repo into a
# DataFrame; it is cleaned in the cells that follow.
all_games_url = (
    'https://raw.githubusercontent.com/Nolanole/NFL-Weather-Project/'
    'master/all_games.csv'
)
all_games = pd.read_csv(all_games_url)

In [0]:
# Remove four columns from all_games: season_year, stadium, week and
# schedule_playoff.
# NOTE(review): the weather data appears to carry its own 'stadium' column
# (used after the merge) — presumably that is why this one is dropped; confirm.
unused_columns = ['season_year', 'stadium', 'week', 'schedule_playoff']
all_games = all_games.drop(unused_columns, axis=1)

In [0]:
# Parse the date strings into datetime values so they can be matched against
# the weather data's dates when the two tables are merged.
# The infer_datetime_format flag is deprecated since pandas 2.0 (format
# inference is the default there and the flag only produces a warning), so it
# is no longer passed.
all_games['date'] = pd.to_datetime(all_games['date'])

# Keep only the first 9410 rows: games after the 2018 Super Bowl are removed
# so the table covers the same seasons as the weather data.
# NOTE(review): a positional cut-off assumes the CSV rows are in
# chronological order — confirm.
# .copy() makes the result an independent frame rather than a slice of the
# full table, so adding columns to it later does not raise
# SettingWithCopyWarning.
all_games = all_games.iloc[:9410].copy()

In [0]:
# Add a 1-based game_id column. all_games contains every game, so the row
# label + 1 serves as the id.
# assign() returns a new frame instead of writing into all_games in place;
# all_games was produced by slicing, and an in-place column write on a slice
# triggers pandas' SettingWithCopyWarning.
all_games = all_games.assign(game_id=all_games.index + 1)

In [0]:
# Step 2: read the partly cleaned weather CSV from the project's GitHub repo;
# the cells below finish cleaning it.
weather_games_url = (
    'https://raw.githubusercontent.com/Nolanole/NFL-Weather-Project/'
    'master/weather_cleaned.csv'
)
# Only columns 1..15 are loaded; column 0 is skipped
# (presumably an index column saved with the CSV — confirm).
wanted_columns = range(1, 16)
weather_games = pd.read_csv(weather_games_url, usecols=wanted_columns)

In [0]:
# Parse the weather data's date strings into datetime values so the column
# has the same type as all_games['date'] for the merge.
# The infer_datetime_format flag is deprecated since pandas 2.0 (format
# inference is the default there and the flag only produces a warning), so it
# is no longer passed.
weather_games['date'] = pd.to_datetime(weather_games['date'])

In [0]:
# Store the two text-valued weather descriptors (precipitation and sky) as
# pandas categoricals.
# NOTE(review): missing precipitation values are left as NaN here; they are
# not converted to the string 'None' — confirm that is intended.
for descriptor in ('precipitation', 'sky'):
    weather_games[descriptor] = weather_games[descriptor].astype('category')

In [0]:
# Manual corrections for weather rows whose date or teams are wrong.
# The keys are row labels of weather_games as loaded from the CSV.

# One game has a wrong date.
weather_games.at[6499, 'date'] = '1985-12-29'

# Six games have wrong teams: row label -> (home team, away team).
matchup_fixes = {
    2484: ('New York Giants', 'New Orleans Saints'),
    618: ('Denver Broncos', 'Carolina Panthers'),
    2457: ('Indianapolis Colts', 'Chicago Bears'),
    2890: ('Philadelphia Eagles', 'New England Patriots'),
    5965: ('San Francisco 49ers', 'Cincinnati Bengals'),
    6882: ('Washington Redskins', 'Los Angeles Raiders'),
}
for row_label, (home_team, away_team) in matchup_fixes.items():
    weather_games.at[row_label, 'home'] = home_team
    weather_games.at[row_label, 'away'] = away_team

In [0]:
# Combine the two tables on (date, home, away). An outer join keeps games
# that appear in only one of the sources; their columns from the other source
# are left as NaN.
# NOTE(review): later cells address rows of `merged` by hard-coded label, so
# they rely on the row order this merge produces. pandas 2.2 changed
# how='outer' to always sort by the join keys — confirm the labels still line
# up on the pandas version in use.
join_keys = ['date', 'home', 'away']
merged = all_games.merge(weather_games, on=join_keys, how='outer')

In [0]:
# Rows 9419-9452 of the merged table are games that were missing from the
# all_games CSV, so the merge left them without a game_id. Number them
# consecutively starting at 9411.
first_row, last_row = 9419, 9452
first_new_id = 9411

# .loc label slices include both end points.
rows_without_id = merged.loc[first_row:last_row, 'game_id']

# The labels above are hard-coded, so if the merged row order ever differs
# they would silently point at the wrong games. Stop with a clear error
# instead of overwriting ids that already exist.
expected_count = last_row - first_row + 1
if len(rows_without_id) != expected_count or rows_without_id.notna().any():
    raise ValueError(
        'Expected rows %d-%d of merged to exist and have no game_id; '
        'the merged row order does not match the hard-coded row labels.'
        % (first_row, last_row)
    )

merged.loc[first_row:last_row, 'game_id'] = np.arange(
    first_new_id, first_new_id + expected_count
)

In [0]:
# Row labels to remove from the merged table, in two groups:
#   - duplicated games (plus a few other unwanted rows)
dup_indices = [1814, 6463, 1809, 5254, 590, 9415, 9416, 9417, 9418, 9453, 9454]
#   - games with no weather data
missing_weather_rows = [1539, 1807, 232, 1805, 6858, 6859, 6940, 7188, 7398, 7650, 8025, 8259, 8521]

merged = merged.drop(index=dup_indices).drop(index=missing_weather_rows)

In [0]:
# Convert game_id to a 64-bit integer column.
merged = merged.astype({'game_id': 'int64'})

In [0]:
# Seven rows have a stadium but no sky value; set it by hand.
# Each list holds the row labels that receive the given sky value.
overcast = [466, 2481, 2542, 2588]
clear = [7996, 8091, 8727]

sky_fixes = (
    ('overcast/mostly cloudy', overcast),
    ('clear', clear),
)
for sky_value, row_labels in sky_fixes:
    for row_label in row_labels:
        merged.at[row_label, 'sky'] = sky_value

In [0]:
# Rows still lacking a stadium value are treated as indoor games:
# label them 'dome'.
merged = merged.fillna({'stadium': 'dome'})

In [0]:
# Indoor ('dome') games get fixed placeholder weather values rather than
# measured ones.
dome_weather = {
    'avg_temp': 70,
    'avg_humidity': 40,
    'precipitation': 'None',
    'sky': 'clear',
    'fog_or_haze': 0,
    'avg_wind': 0,
    'max_windgust': 0,
}

dome_mask = merged['stadium'] == 'dome'
dome_indices = merged.index[dome_mask]

for column, value in dome_weather.items():
    # A categorical column rejects values that are not among its categories
    # (TypeError), so register the placeholder as a category first if needed.
    if isinstance(merged[column].dtype, pd.CategoricalDtype):
        if value not in merged[column].cat.categories:
            merged[column] = merged[column].cat.add_categories([value])
    # One masked assignment per column instead of a Python loop over rows.
    merged.loc[dome_mask, column] = value
  

In [0]:
# Remove the 'place' column from the merged table.
merged = merged.drop('place', axis=1)

In [0]:
# Renumber the rows 0..n-1. drop=True discards the old row labels instead of
# keeping them as a new column.
merged = merged.reset_index(drop=True)

In [0]:
# Save the merged table as a CSV in the working directory. With to_csv's
# defaults the row index is written as the first, unnamed column.
output_csv = 'all_games_weather.csv'
merged.to_csv(output_csv)

# When running in Google Colab, also offer the file as a browser download.
# The google.colab package only exists inside Colab, so outside of it the
# download step is skipped instead of failing after the file has already
# been written.
try:
    from google.colab import files
except ImportError:
    print('Not running in Google Colab; ' + output_csv + ' was saved locally only.')
else:
    files.download(output_csv)