In [None]:
import pandas as pd
import df_structure
import pytz
from tqdm.auto import tqdm
tqdm.pandas()
import dask
import dask.dataframe as dd

# First and last year of the data set; both values are interpolated into the
# names of the CSV files this notebook writes.
year_start, year_end = 2016, 2020

# Absolute locations of the two input CSV files on the local machine.
# NOTE: the spelling "fligt" is kept because the loading cell reads this name.
path_to_fligt_csv = r"C:\ORIE4741 Data\2016_to_2020_flight_reduced_columns.csv"
path_to_weather_csv = r"C:\ORIE4741 Data\WeatherEvents_Jan2016-Dec2020.csv"

In [None]:
# The flight table is read with the explicit per-column dtypes declared in
# df_structure; the weather table is read with pandas' inferred dtypes.
df_flight = pd.read_csv(
    path_to_fligt_csv,
    dtype=df_structure.column_dtypes,
)
df_weather = pd.read_csv(path_to_weather_csv)

In [None]:
# Parse both event-boundary columns into datetimes and mark them as UTC so
# they can be compared against tz-aware UTC timestamps.
for utc_column in ('StartTime(UTC)', 'EndTime(UTC)'):
    parsed_times = pd.to_datetime(df_weather[utc_column])
    df_weather[utc_column] = parsed_times.dt.tz_localize('UTC')

In [None]:
# Build a single "City, State" text column that is later used as the
# grouping key for the weather events.
city_names = df_weather['City']
state_names = df_weather['State']
df_weather['City, State'] = city_names + ', ' + state_names

In [None]:
# Split the weather table into one sub-frame per "City, State" value, keyed
# by that value. Iterating a GroupBy yields (key, sub-frame) pairs.
weather_by_city = dict(iter(df_weather.groupby('City, State')))
# The full table is not referenced again; drop the name so it can be freed.
del df_weather

In [None]:
# Origin airport codes to keep; despite the name this is a set, which is what
# the `isin` filter on the flight table receives.
list_of_airports = {
    'ATL', 'DFW', 'DEN', 'ORD', 'LAX',
    'CLT', 'LAS', 'PHX', 'MCO', 'SEA',
}

In [None]:
# Keep only the flights whose 'Origin' code is one of the selected airports.
departs_from_selected = df_flight['Origin'].isin(list_of_airports)
df_flight = df_flight[departs_from_selected]

In [None]:
# Number of flight rows left after the origin filter.
df_flight.shape[0]

In [None]:
# Peek at the two raw columns that are combined into a departure timestamp.
preview_columns = ['FlightDate', 'CRSDepTime']
df_flight[preview_columns].head()

In [None]:
# `df` is a second name for the same object as `df_flight` (no copy is made),
# so columns assigned through `df` are also visible through `df_flight`.
df = df_flight

In [None]:
def cleanup_hhmm_time(row, col_name = 'CRSDepTime'):
    """Normalize a numeric hhmm value to a zero-padded 4-character string.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``col_name``, e.g. the row Series passed by
        ``DataFrame.apply(..., axis=1)``.
    col_name : str
        Key of the hhmm value inside ``row``. The value may be an int, a
        float, or a numeric string such as ``'930.0'``.

    Returns
    -------
    str
        The integer part of the value, left-padded with zeros to 4
        characters (``5 -> '0005'``, ``930.0 -> '0930'``).

    Raises
    ------
    ValueError
        If the value is NaN or non-numeric text, is negative, or has more
        than four digits.
    """
    # Go through float first so values stored as 930.0 or '930.0' are accepted.
    value = int(float(row[col_name]))
    # Explicit range check instead of an assert: asserts vanish under
    # `python -O`, and an out-of-range value must not silently yield a
    # malformed string (e.g. '00-5') or an unrelated TypeError.
    if not 0 <= value <= 9999:
        raise ValueError(
            "{!r} value {!r} is not a 1-4 digit non-negative hhmm time".format(
                col_name, row[col_name]
            )
        )
    return str(value).zfill(4)

In [None]:
# Replace 'CRSDepTime' with its zero-padded 4-character text form, computed
# row by row (progress_apply shows a tqdm progress bar while it runs).
df['CRSDepTime'] = df.progress_apply(cleanup_hhmm_time, axis=1)

In [None]:
def agg_datetime(row):
    """Return the row's 'FlightDate' and 'CRSDepTime' joined by a hyphen.

    Both values are converted with ``str`` before joining, so the result is
    always text of the form ``'<FlightDate>-<CRSDepTime>'``.
    """
    pieces = (row['FlightDate'], row['CRSDepTime'])
    return '-'.join(str(piece) for piece in pieces)

In [None]:
# Store the combined "<FlightDate>-<CRSDepTime>" text in a new column; it is
# still plain text at this point.
df['DepTimeLocal'] = df.progress_apply(agg_datetime, axis=1)

In [None]:
# Turn the "YYYY-MM-DD-HHMM" text into (timezone-naive) datetimes in place.
df['DepTimeLocal'] = pd.to_datetime(
    df['DepTimeLocal'],
    format='%Y-%m-%d-%H%M',
)

In [None]:
def to_utc(row, col_name = 'DepTimeLocal', time_zone_indecator = 'OriginState'):
    """Convert a row's timezone-naive local timestamp to a UTC timestamp.

    Parameters
    ----------
    row : mapping-like
        Row holding the timestamp and the timezone lookup key, e.g. the
        Series passed by ``DataFrame.apply(..., axis=1)``.
    col_name : str
        Key of the naive local timestamp in ``row``; the value must offer
        ``tz_localize`` (e.g. a ``pandas.Timestamp``).
    time_zone_indecator : str
        Key in ``row`` whose value is looked up in the module-level
        ``us_states_timezones`` mapping to obtain the timezone.

    Returns
    -------
    The timestamp localized to the looked-up timezone and converted to UTC.

    Raises
    ------
    KeyError
        If the lookup value is not present in ``us_states_timezones``.
    """
    # Honour `col_name` rather than a hard-coded column so the helper can be
    # reused for other local-time columns.
    local_time = row[col_name]
    local_zone = us_states_timezones[row[time_zone_indecator]]
    # ambiguous=False: a wall time that occurs twice at the autumn DST change
    # is interpreted as the non-DST occurrence instead of raising.
    # NOTE(review): wall times skipped at the spring DST change are not
    # handled here; pandas' default for `nonexistent` is to raise.
    localized = local_time.tz_localize(local_zone, ambiguous=False)
    return localized.tz_convert("UTC")

In [None]:
# Module-level lookup table read by to_utc(); it comes from the project-local
# df_structure module.
# NOTE(review): presumably maps a state value to a timezone name - confirm
# against df_structure.
us_states_timezones = df_structure.us_states_timezones

In [None]:
# Compute the UTC departure time for every row, then show the inputs next to
# the result for a quick visual check.
df['DepTimeUTC'] = df.progress_apply(to_utc, axis=1)
utc_check_columns = ['DepTimeLocal', 'OriginState', 'DepTimeUTC']
df[utc_check_columns].head()

In [None]:
# Weather event types; each one becomes a flag column on the flight table.
weather_types = ['Severe-Cold', 'Fog', 'Hail', 'Rain', 'Snow', 'Storm', 'Other Precipitation']
# One "<type>_Severity" column name per weather type, in the same order.
weather_severity = [kind + '_Severity' for kind in weather_types]
# Column name -> dtype name: flag columns first, then severity columns.
weather_dtypes = dict.fromkeys(weather_types, 'boolean')
weather_dtypes.update(dict.fromkeys(weather_severity, 'str'))

In [None]:
# Checkpoint: write the flight table (now including the UTC departure time)
# to a CSV in the working directory, without the index column.
utc_csv_name = '{}_to_{}_flight_reduced_columns_UTC.csv'.format(year_start, year_end)
df.to_csv(utc_csv_name, index=False)

In [None]:
def map_weather_flight(row, time_column_name = 'DepTimeUTC', loc_column_name = 'OriginCityName', column_suffix = '_Depart'):
    """Look up which weather event types are active for one flight row.

    Parameters
    ----------
    row : mapping-like
        Flight row; must provide ``time_column_name`` and ``loc_column_name``.
    time_column_name : str
        Key of the timestamp compared against the events' 'StartTime(UTC)'
        and 'EndTime(UTC)' columns (both bounds inclusive).
    loc_column_name : str
        Key of the location value used to pick a frame from the
        module-level ``weather_by_city`` dict.
    column_suffix : str
        Accepted but not used by this function.

    Returns
    -------
    list
        ``len(weather_types)`` booleans (one per entry of the module-level
        ``weather_types``, True when an event of that type is active),
        followed by ``len(weather_types)`` severity values ('' when the type
        is not active). When several active events share a type, the
        'Severity' of the first matching row is used.
    """
    when = row[time_column_name]
    where = row[loc_column_name]
    nothing_active = [False] * len(weather_types) + [''] * len(weather_types)

    # No weather records at all for this location.
    if where not in weather_by_city:
        return nothing_active

    city_events = weather_by_city[where]
    started = city_events['StartTime(UTC)'] <= when
    not_ended = city_events['EndTime(UTC)'] >= when
    active_events = city_events[started & not_ended]
    # Records exist for the location, but none covers this timestamp.
    if active_events.empty:
        return nothing_active

    flags = []
    levels = []
    for kind in weather_types:
        of_kind = active_events[active_events['Type'] == kind]
        is_present = not of_kind.empty
        flags.append(is_present)
        levels.append(of_kind['Severity'].iloc[0] if is_present else '')
    return flags + levels


In [None]:
# map_weather_flight returns flags followed by severities, so the expanded
# result columns line up with weather_types + weather_severity in that order.
weather_columns = weather_types + weather_severity
weather_values = df.progress_apply(map_weather_flight, axis=1, result_type="expand")
df[weather_columns] = weather_values

In [None]:
# Spot check: first few flights whose 'Rain' flag is set.
rain_flag = df['Rain']
df[rain_flag].head()

In [None]:
# Final output: the flight table with the weather flag and severity columns,
# written to a CSV in the working directory without the index column.
weather_csv_name = '{}_to_{}_flight_reduced_columns_with_weather_top_10.csv'.format(year_start, year_end)
df.to_csv(weather_csv_name, index=False)