# Module 1
## Data Preprocessing 

### Importing Modules

In [1]:
import pandas as pd
import numpy as np
import json
import sklearn.utils as skmu
from math import floor

### Data to be Considered

In [2]:
## Flights Data Pathnames
# One CSV per month of 2016 and 2017. Each file lives in a folder that carries
# the same base name as the file, under a per-year directory.
dirpath = "Data/Flight/"
dirname = "On_Time_On_Time_Performance_"
flightpathname = [
    "/".join([dirpath + year, stem, stem + ".csv"])
    for year in map(str, (2016, 2017))
    for stem in (dirname + year + "_" + str(month) for month in range(1, 13))
]


## Weather Data
# Airport codes whose flights and weather records are used.
aircodes = [
    'ATL', 'CLT', 'DEN', 'DFW', 'EWR',
    'IAH', 'JFK', 'LAS', 'LAX', 'MCO',
    'MIA', 'ORD', 'PHX', 'SEA', 'SFO',
]

#Weather Factors to consider
# The first four entries are join keys; the rest are the weather measurements.
weatherfactor = [
    'FlightDate', 'time', 'Origin', 'Dest',
    'windspeedKmph', 'winddirDegree', 'precipMM', 'visibility', 'pressure',
    'cloudcover', 'DewPointF', 'WindGustKmph', 'humidity',
]

### Processing Flight Data 

In [3]:
# Columns kept from each monthly on-time performance file.
flightcolumns = ['FlightDate', 'Quarter', 'Year', 'Month', 'DayofMonth',
                 'Origin', 'OriginAirportID', 'CRSDepTime', 'DepDelayMinutes',
                 'Dest', 'DestAirportID', 'CRSArrTime', 'ArrDelayMinutes', 'ArrDel15']

# One filtered frame per monthly file; they are concatenated in the next cell.
dataframes = list()
for i in flightpathname:
    dataframe1 = pd.read_csv(i, encoding = 'utf-8', low_memory = False)
    # .copy() turns the column subset into an independent frame, so the
    # assignments below do not write into a slice of the raw table
    # (which triggers SettingWithCopyWarning).
    dataframe1 = dataframe1[flightcolumns].copy()
    dataframe1['FlightDate'] = pd.to_datetime(dataframe1['FlightDate'])

    # Round the scheduled times down to a multiple of 100 (e.g. 1435 -> 1400).
    # Presumably this maps hhmm to the top of the hour so the value can be
    # matched against the weather `time` column in the later merge.
    # Vectorised floor division replaces the per-element math.floor loop;
    # unlike math.floor it propagates NaN instead of raising.
    for col in ('CRSDepTime', 'CRSArrTime'):
        dataframe1[col] = (dataframe1[col] // 100) * 100

    # Keep only flights whose origin and destination are two *different*
    # airports from `aircodes`. A single boolean mask replaces the nested
    # origin/destination loops. Rows stay in source-file order rather than
    # grouped by route; the merged data is shuffled later anyway.
    origin = dataframe1['Origin']
    dest = dataframe1['Dest']
    keep = origin.isin(aircodes) & dest.isin(aircodes) & (origin != dest)
    dataframe1 = dataframe1[keep]
    dataframes.append(dataframe1)

In [4]:
# Stack the per-month frames row-wise and persist the result so the merge
# stage can reload it from disk.
df = pd.concat(dataframes, axis = 0)
df.to_csv(path_or_buf = "Data/TotalFlightsData.csv", encoding = 'utf-8', index = False)

### Processing Weather Data

In [5]:
dirpath = "Data/weather/"

# FlightDate / Origin / Dest are filled in from the loop context (file date and
# airport code); every other factor is read from the hourly record itself.
contextfactor = ('FlightDate', 'Origin', 'Dest')
hourlyfactor = [f for f in weatherfactor if f not in contextfactor]

# Column-oriented accumulator: one list per factor, one entry per hourly record.
weatherdata = {factor: list() for factor in weatherfactor}

for k in aircodes:
    for i in (2016, 2017):
        for j in range(1, 13):
            p = dirpath + k + "/" + str(i) + "-" + str(j) + ".json"
            # Context manager closes the file handle (the original leaked it).
            with open(p) as jsonfile:
                data = json.load(jsonfile)['data']['weather']
            for d in data:
                date = d['date']
                for t in d['hourly']:
                    # The same airport code is stored under both names so this
                    # table can later be joined as origin or destination weather.
                    weatherdata['Origin'].append(k)
                    weatherdata['Dest'].append(k)
                    weatherdata['FlightDate'].append(date)
                    # .get() records None for a missing field, so all column
                    # lists stay the same length. The previous bare `except`
                    # skipped the append and left that column short.
                    for keys in hourlyfactor:
                        weatherdata[keys].append(t.get(keys))

In [6]:
# Build the weather table from the accumulated column lists.
wdataf = pd.DataFrame(weatherdata, columns = weatherfactor)
wdataf['FlightDate'] = pd.to_datetime(wdataf['FlightDate'])

# Duplicate the observation time under both schedule-column names so the table
# can be joined on either the departure or the arrival time.
hourvalues = wdataf['time'].values
wdataf["CRSDepTime"] = hourvalues
wdataf["CRSArrTime"] = hourvalues

wdataf.to_csv(path_or_buf = "Data/TotalWeatherData.csv", encoding = 'utf-8', index = False)

### Merging Weather and Flight Data

In [7]:
# Reload both intermediate tables from disk with the same reader settings.
fdatf, wdatf = [
    pd.read_csv(csvpath, encoding = 'utf-8')
    for csvpath in ("Data/TotalFlightsData.csv", "Data/TotalWeatherData.csv")
]

In [8]:
# Weather measurements shared by the origin-side and destination-side views.
measurementcols = ['windspeedKmph', 'winddirDegree', 'precipMM', 'visibility', 'pressure',
                   'cloudcover', 'DewPointF', 'WindGustKmph', 'humidity']

# Origin view: keyed by date, departure time and origin airport.
wdatf1 = wdatf[['FlightDate', 'CRSDepTime', 'Origin'] + measurementcols]
# Destination view: keyed by date, arrival time and destination airport.
wdatf2 = wdatf[['FlightDate', 'CRSArrTime', 'Dest'] + measurementcols]

In [9]:
#Origin weather
# Attach the weather at the origin airport for the scheduled departure time.
dataframe1 = pd.merge(fdatf, wdatf1, on=['FlightDate', 'Origin', 'CRSDepTime'])

# NOTE: the previous `dataframe1.rename(columns={...})` call was removed. Its
# result was never assigned, so it had no effect, and the column selection
# further down relies on the un-renamed measurement names receiving the
# `_x` / `_y` suffixes in the merge below.

#Destination weather
# Attach the weather at the destination airport for the scheduled arrival time.
# Both sides carry the same measurement column names, so the origin values get
# the `_x` suffix and the destination values get `_y` (pandas' defaults, spelled
# out here because later code selects columns by those suffixed names).
finaldatf = pd.merge(dataframe1, wdatf2, on=['FlightDate', 'Dest', 'CRSArrTime'], suffixes=('_x', '_y'))

In [10]:
# Report the merged row count (taken before incomplete rows are removed),
# then discard every row that has a missing value.
print(finaldatf.shape[0])
finaldatf = finaldatf.dropna()

1877281


### Generating Training and Test Datasets

In [11]:
# Fixed seed so that a fresh Restart & Run All reproduces the same row order,
# and therefore the same training/test split. Change SEED for a different draw.
SEED = 42
finaldatf = skmu.shuffle(finaldatf, random_state=SEED).reset_index(drop=True)

In [12]:
# Keep the flight attributes plus the origin (`_x`) and destination (`_y`)
# weather measurements; the airport code columns are left out.
finaldatf = finaldatf[[
    'FlightDate', 'Quarter' ,'Year' ,'Month' , 'DayofMonth' ,
    'OriginAirportID','CRSDepTime', 'DepDelayMinutes',
    'DestAirportID','CRSArrTime', 'ArrDelayMinutes', 'ArrDel15',
    'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x',
    'cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x',
    'windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y',
    'cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y',
]]

# The first `trainrows` rows form the training set; whatever remains is the test set.
trainrows = 1800000
trainingdatf = finaldatf.iloc[:trainrows]
testdatf = finaldatf.iloc[trainrows:]

In [13]:
# reset_index returns a new frame rather than modifying in place. The results
# were previously discarded, so both frames kept their old indices (the test
# frame's did not start at 0). Assign them back so each frame is indexed
# from 0 in memory.
trainingdatf = trainingdatf.reset_index(drop = True)
testdatf = testdatf.reset_index(drop = True)

# The index is not written to disk, so the CSV contents are unaffected by the
# re-indexing above.
trainingdatf.to_csv("Data/TrainingData.csv", index = False, encoding = "utf-8")
testdatf.to_csv("Data/TestData.csv", index = False, encoding = "utf-8")