<a href="https://colab.research.google.com/github/Nolanole/NFL-Weather-Project/blob/master/Wrangle_Clean_Weather_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np

In [0]:
#raw weather observations; the source file marks missing readings with '--'
url = 'https://raw.githubusercontent.com/Nolanole/NFL-Weather-Project/master/weather_raw.csv'
df = pd.read_csv(url, na_values='--')

In [0]:
#clean the venue column (keep the text after the 'Venue: ' prefix) and rename to stadium
df['stadium'] = df['venue'].apply(lambda venue: venue.split('Venue: ')[1])
df = df.drop(columns=['Unnamed: 0', 'venue'])

#convert date to datetime obj:
df['date'] = pd.to_datetime(df['date'])

#flag observations whose Weather text mentions fog or haze. Rows with no Weather
#value count as False (na=False), which gives a plain boolean column directly
#instead of filling an object column afterwards.
df['fog_or_haze'] = df['Weather'].str.contains('fog|haze', na=False)

In [0]:
#categorize skycover column: each coarse category lists the raw labels folded into it

sky_groups = {
  'overcast/mostly cloudy': ['overcast', 'mostly cloudy', 'cloudy',
                             'more than half cloudy', 'Mostly cloudy'],
  'clear/cloudy mix': ['scattered clouds', 'half cloudy', 'partly cloudy',
                       'obscured', 'haze', 'mist'],
  'clear': ['mostly clear'],
}

#invert to the raw label -> category lookup used by replace();
#labels that are not listed are left untouched
sky_mapper = {raw_label: category
              for category, raw_labels in sky_groups.items()
              for raw_label in raw_labels}

df = (
  df.assign(Skycover=df['Skycover'].replace(sky_mapper))
    .rename(columns={'Skycover': 'sky'})
)

In [0]:
#categorize weather column to be precipitation
#(labels not listed here, e.g. ones that already equal a target category, pass through unchanged)

weather_mapper = {
  #not precipitation -> missing
  'fog': np.nan,
  'heavy fog': np.nan,
  'haze': np.nan,
  'smoke haze': np.nan,
  'lightning': np.nan,
  #light rain
  'drizzle': 'light rain',
  'mist': 'light rain',
  'light drizzle': 'light rain',
  'light freezing rain': 'light rain',
  'light freezing drizzle': 'light rain',
  'light thstrm, rain': 'light rain',
  'light rain, snow': 'light rain',
  'light rain, lightning': 'light rain',
  'light rain, ice pellets': 'light rain',
  #rain/storm
  'rain': 'rain/storm',
  'heavy rain': 'rain/storm',
  'thstrm': 'rain/storm',
  'thstrm, rain': 'rain/storm',
  'thstrm, hail': 'rain/storm',
  'fog, rain, thstrm': 'rain/storm',
  'lightning, thstrm': 'rain/storm',
  #light snow
  'ice pellets': 'light snow',
  'snow grains': 'light snow',
  'fog, light snow': 'light snow',
  'light snow, fog': 'light snow',
  #snow
  'blowing snow': 'snow',
  'heavy snow': 'snow',
  'snow flurries': 'snow',
  'fog, snow': 'snow',
}

df['Weather'] = df['Weather'].replace(weather_mapper)
df = df.rename(columns={'Weather': 'precipitation'})

In [0]:
def get_chunk(start, stop, frame=None, rows_per_game=4):
  """Summarise consecutive weather rows into one record per game.

  Rows ``start`` (inclusive) to ``stop`` (exclusive) of the table are read in
  groups of ``rows_per_game`` consecutive rows; each group is treated as one game.

  Parameters
  ----------
  start, stop : int
    Positional row range to process. ``stop`` is clamped to the table length,
    so asking for rows past the end returns only the games that exist.
  frame : pandas.DataFrame, optional
    Table to read. Defaults to the notebook-level ``df``.
  rows_per_game : int, default 4
    Number of consecutive rows that make up one game.

  Returns
  -------
  list of pandas.Series
    One Series per game, named after the index label of the game's first row,
    with the fields (in order): away, home, place, date, stadium, fog_or_haze,
    avg_temp, avg_dewpoint, avg_humidity, avg_wind, max_windgust, windchill,
    windchill_gust, sky, precipitation.
  """
  if frame is None:
    frame = df
  id_cols = ['away', 'home', 'place', 'date', 'stadium']
  #an empty slice has no first row to describe, so never step past the last row
  stop = min(stop, len(frame))

  games = []
  for i in range(start, stop, rows_per_game):
    game = frame.iloc[i:i + rows_per_game]
    first_row = game.iloc[0]

    #most frequent sky label; NaN when no row of the game has one
    sky_counts = game['sky'].value_counts()
    game_sky = sky_counts.index[0] if not sky_counts.empty else np.nan

    #only count precipitation if lasted for more than 1 quarter
    #(.iloc reads the top count by position; the counts are labelled by category name)
    precip_counts = game['precipitation'].value_counts()
    if not precip_counts.empty and precip_counts.iloc[0] > 1:
      game_precipitation = precip_counts.index[0]
    else:
      game_precipitation = np.nan

    #identifying fields come from the game's first row, the rest are aggregates
    summary = {col: first_row[col] for col in id_cols}
    summary.update({
      'fog_or_haze': game['fog_or_haze'].value_counts().index[0],
      'avg_temp': game['Temp'].mean(),
      'avg_dewpoint': game['Dewpoint'].mean(),
      'avg_humidity': game['Humidity'].mean(),
      'avg_wind': game['Wind Speed'].mean(),
      'max_windgust': game['Wind Gust'].max(),
      'windchill': game['Windchill'].min(),
      'windchill_gust': game['Windchill Gust'].min(),
      'sky': game_sky,
      'precipitation': game_precipitation,
    })
    games.append(pd.Series(summary, name=first_row.name, dtype=object))
  return games



In [0]:
#summarise the table in slices of CHUNK_SIZE rows and stack the per-game records.
#CHUNK_SIZE must stay a multiple of 4 (get_chunk groups 4 rows per game) so that
#no game straddles two slices. The row count is taken from df, not hard-coded.
CHUNK_SIZE = 7500

chunks = [
  pd.concat(get_chunk(lo, min(lo + CHUNK_SIZE, len(df))), axis=1).T.reset_index(drop=True)
  for lo in range(0, len(df), CHUNK_SIZE)
]

combined = pd.concat(chunks)

In [0]:
#convert date column to datetime obj:
combined = combined.assign(date=pd.to_datetime(combined['date']))

In [0]:
#convert fog_or_haze to 0/1 instead of true/false. An explicit integer cast gives
#the same dtype on every pandas version, unlike a dict replace on an object column.
combined['fog_or_haze'] = combined['fog_or_haze'].astype('int64')

In [0]:
#convert numeric columns to floats:
numeric_cols = [
  'avg_temp', 'avg_dewpoint', 'avg_humidity', 'avg_wind',
  'max_windgust', 'windchill', 'windchill_gust',
]
combined = combined.astype({col: 'float64' for col in numeric_cols})

In [0]:
#categoricals: precipitation (missing values become the label 'None' first) and sky
combined = combined.assign(
  precipitation=combined['precipitation'].fillna(value='None').astype('category'),
  sky=combined['sky'].astype('category'),
)

In [0]:
#clean some nulls: a missing average wind is recorded as 0
combined = combined.assign(avg_wind=combined['avg_wind'].fillna(0))

In [0]:
#reset the index to a fresh 0..n-1 range, discarding the old labels:
combined = combined.set_axis(pd.RangeIndex(len(combined)), axis='index')

In [0]:
#export and download the csv:
combined.to_csv('weather_cleaned.csv')

#download the csv:
from google.colab import files
files.download('weather_cleaned.csv')