# FDS Project - Predicting Flights Delays using Flight and Weather Data

## Preliminaries 

In [92]:
import pandas as pd
import numpy as np
import requests 
import html5lib
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from bs4 import BeautifulSoup 
import os
import re 
from datetime import datetime, timedelta
import csv
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Constructing the Dataset  



### Note: the code in this section uses local file paths in order to construct the dataset. Please do not run locally if you wish to avoid using local disk space on your machine and/or a lengthy run time. The code for cleaning the dataset is located in [Section 5](#hashtag_suffix)), and the analysis is found in the accompanying R-script. 

First I acquire flight data from...on...

In [2]:
#set seed 
np.random.seed(666)

In [255]:
years = [2016, 2017, 2018]
months = [1, 3, 9,  12]

for year in years:
    for month in months:
        zipref = ZipFile("C:\\Users\\Ollie\\Downloads\\On_Time_Reporting_Carrier_On_Time_Performance_1987_present_" + f"{year}_"+f"{month}.zip","r") 
        zipref.extractall("C:\\Users\\Ollie\\Desktop\\BSE\\Courses\\Foundations of Data Science\\Project\\Data")
        print(f'{year}{month} Completed')

20161 Completed
20163 Completed
20169 Completed
201612 Completed
20171 Completed
20173 Completed
20179 Completed
201712 Completed
20181 Completed
20183 Completed
20189 Completed
201812 Completed


In [256]:
years = [2016, 2017, 2018]
months = [1, 3, 9,  12]

full_dfs=[]

for year in years:
    for month in months:
        df = pd.read_csv("C:\\Users\\Ollie\\Desktop\\BSE\\Courses\\Foundations of Data Science\\Project\\Data\\On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_" + f"{year}_"+f"{month}"+".csv", dtype=object) 
        full_dfs.append(df)
        print(f'{year}{month} Completed')

20161 Completed
20163 Completed
20169 Completed
201612 Completed
20171 Completed
20173 Completed
20179 Completed
201712 Completed
20181 Completed
20183 Completed
20189 Completed
201812 Completed


In [298]:
rs_dfs = []

for dfr in full_dfs:
    rs_df = dfr.sample(frac=0.0015)
    rs_dfs.append(rs_df)
    

In [299]:
df  = pd.concat(rs_dfs)

In [300]:
keep = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate', 'DOT_ID_Reporting_Airline', \
        'IATA_CODE_Reporting_Airline', 'OriginAirportID', 'Origin', 'OriginCityName', 'OriginStateName', \
       'OriginWac', 'DestAirportID', 'Dest', 'DestCityName', 'DestStateName', 'DestWac', 'CRSDepTime', \
       'DepDelay', 'DepDel15', 'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOut', 'CRSArrTime', 'ArrDelay', \
       'ArrDel15', 'ArrivalDelayGroups', 'ArrTimeBlk', 'Cancelled', 'CancellationCode', 'Flights', \
        'Distance', 'DistanceGroup', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', \
        'LateAircraftDelay']

In [301]:
df_flght = df[keep]

In [302]:
df_flght.to_csv('C:\\Users\\Ollie\\Desktop\\BSE\\Courses\\Foundations of Data Science\\Project\\Code\\FDS-project\\Flight_Data.csv', index=False)

In [2]:
df_flght= pd.read_csv('C:\\Users\\Ollie\\Desktop\\BSE\\Courses\\Foundations of Data Science\\Project\\Code\\FDS-project\\Flight_Data.csv')

In [3]:
df_flght.shape


(8777, 38)

### Download weather data from weather underground corresponding to the IATA airport code and flight date. 

In [5]:
pd.to_datetime(df_flght['FlightDate'], format = '%Y-%m-%d')

0      2016-01-11
1      2016-01-21
2      2016-01-27
3      2016-01-28
4      2016-01-25
          ...    
8772   2018-12-20
8773   2018-12-19
8774   2018-12-04
8775   2018-12-29
8776   2018-12-29
Name: FlightDate, Length: 8777, dtype: datetime64[ns]

In [4]:
### Scrape historical weather data from Weather Underground. 

def scrape_station(station, date):
    # Use .format(station, YYYY, M, D)
    URL = 'http://www.wunderground.com/history/daily/{}/date/{}-{}-{}.html'
    
    #Format date components.
    date_year = int(date[0:4])
    date_month = int(date[5:7])
    date_day = int(date[8:10])
    
    #format URl
    formatted_URL = URL.format(station, date_year, date_month, date_day)
    
    #Scrape weather data for specified station on specified date. 
    driver = webdriver.Chrome()
    driver.get(formatted_URL)
    driver.minimize_window()
    tables = WebDriverWait(driver,5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table")))
    #Table 0 contains the data that we want. Hourly data is available in the second table.
    newTable = pd.read_html(tables[0].get_attribute('outerHTML'))
    return(newTable[0])

In [8]:
airports = set(list(df_flght['Origin']))
gr_flght=df_flght.groupby(by='Origin')

for airport in airports:
    df_airport_dates = []
    dates = set(list(gr_flght.get_group(airport)['FlightDate']))
    airport_code = 'K' + airport
    for date in dates:
        try:
            df_airport_date = scrape_station(airport_code, date)
            df_airport_date['FlightDate'] = np.tile(date, len(df_airport_date))
            if df_airport_date.shape[1] == 6:
                df_airport_date.columns = ["Stats", "1", "2", "3", "4", "FlightDate"]
            elif df_airport_date.shape[1] == 11:
                df_airport_date.columns = ["Stats", "1", "2", "3", "4", "5", "6", "7", "8", "9", "FlightDate"]
            df_airport_date.reset_index(drop=True)
            df_airport_date = df_airport_date.pivot(index=["FlightDate"], columns=["Stats"], values =["1"])
            df_airport_date['Origin'] = np.tile(airport, len(df_airport_date))
            print(f'{airport}-{date}: done')
            df_airport_dates.append(df_airport_date)
        except Exception as e:
            print(f'{airport}-{date} error: {e}')
            continue
    try:
        df_airport_dates = pd.concat(df_airport_dates)
    except Exception as e:
        print(f'error:{e}')
        continue
    df_airport_dates.to_csv(f'{airport}_data.csv', index=True)
    print(f'{airport}: complete')

MCI-2017-12-29: done
MCI-2018-09-07: done
MCI-2017-01-23: done
MCI-2016-03-30: done
MCI-2018-01-23: done
MCI-2016-12-26: done
MCI-2018-01-16: done
MCI-2018-12-21: done
MCI-2016-01-25: done
MCI-2018-09-03: done
MCI-2018-09-16: done
MCI-2018-09-04: done
MCI-2018-09-10: done
MCI-2018-09-17: done
MCI-2016-09-20: done
MCI-2017-03-23: done
MCI-2018-03-24: done
MCI-2018-01-24: done
MCI-2018-12-08 error: Message: 

MCI-2017-03-10: done
MCI-2018-12-20: done
MCI-2017-03-06: done
MCI-2018-01-29: done
MCI-2016-03-13: done
MCI-2017-03-08: done
MCI-2018-01-30: done
MCI-2016-09-30: done
MCI-2017-12-21: done
MCI-2017-01-18: done
MCI-2018-01-18: done
MCI-2016-09-07: done
MCI-2016-03-27: done
MCI-2017-09-24: done
MCI-2018-12-01 error: Index contains duplicate entries, cannot reshape
MCI-2016-09-28: done
MCI-2018-09-11: done
MCI-2017-12-08: done
MCI-2017-12-19: done
MCI-2016-03-15: done
MCI-2016-12-13: done
MCI-2018-09-23: done
MCI-2016-03-16: done
MCI-2018-03-18: done
MCI-2018-03-04: done
MCI-2017-09-20

TPA-2018-03-20: done
TPA-2017-12-18: done
TPA-2018-12-27: done
TPA-2016-03-17: done
TPA-2017-01-09: done
TPA-2018-12-06: done
TPA-2018-03-25: done
TPA-2016-12-26: done
TPA-2017-01-31: done
TPA-2017-01-19: done
TPA-2018-03-29: done
TPA-2017-03-20: done
TPA-2018-12-10: done
TPA-2016-12-06 error: Index contains duplicate entries, cannot reshape
TPA-2016-03-13: done
TPA-2017-03-25: done
TPA-2018-09-09: done
TPA-2016-12-02: done
TPA-2017-12-17: done
TPA-2016-09-22: done
TPA-2016-03-18: done
TPA-2018-03-03: done
TPA-2017-09-26: done
TPA-2017-12-06: done
TPA-2018-12-24: done
TPA-2016-09-09: done
TPA-2018-09-29: done
TPA-2018-09-28: done
TPA-2018-09-10: done
TPA-2018-01-14: done
TPA-2016-03-29: done
TPA-2016-01-06: done
TPA-2017-12-28: done
TPA-2017-09-09: done
TPA-2017-12-16: done
TPA-2016-09-07: done
TPA-2018-03-21: done
TPA-2016-01-26: done
TPA-2016-12-20: done
TPA-2018-09-08: done
TPA-2018-09-01: done
TPA-2016-01-31: done
TPA-2016-03-11: done
TPA-2017-03-07: done
TPA-2018-01-15: done
TPA-2

ORD-2017-03-16: done
ORD-2016-01-18: done
ORD-2018-03-19: done
ORD-2016-12-28: done
ORD-2017-03-11: done
ORD-2016-03-19: done
ORD-2017-01-10: done
ORD-2016-12-12: done
ORD-2017-09-07: done
ORD-2016-03-03: done
ORD-2016-03-21: done
ORD-2016-09-21: done
ORD-2016-03-07: done
ORD-2017-01-11: done
ORD-2017-03-14: done
ORD-2017-12-09: done
ORD-2018-12-15: done
ORD: complete
BIS-2018-01-29: done
BIS-2016-01-21: done
BIS-2018-09-13: done
BIS-2017-09-19: done
BIS: complete
AVL-2018-09-14: done
AVL-2017-12-02: done
AVL-2017-09-19: done
AVL-2017-09-06: done
AVL: complete
IDA-2018-01-05: done
IDA-2016-01-24: done
IDA-2017-03-08: done
IDA-2017-12-23: done
IDA: complete
GSP-2016-12-01: done
GSP-2018-03-27: done
GSP-2016-12-17: done
GSP-2018-03-17: done
GSP-2017-03-02: done
GSP-2018-03-15: done
GSP-2016-03-31: done
GSP-2018-09-26: done
GSP: complete
PIA-2016-09-26: done
PIA-2016-03-28: done
PIA-2017-03-04: done
PIA-2018-12-17: done
PIA: complete
MFE-2017-09-02: done
MFE-2018-01-12: done
MFE-2016-01-1

In [9]:
# Now figure out how to merge these into one dataset with column name Origin with airport number, then merge
# on Origin and Date, then save as a CSV. 


In [81]:
#Keep relevant data. 
keep = ['FlightDate', 'Day Average Temp', 'High Temp', 'Low Temp', 'Max Wind Speed', 'Precipitation', 'Sea Level Pressure', \
        'Origin']

#Get relevant airport codes from local directory.
directory_list = os.listdir()
airports = []
for i in directory_list:
    if i[-9:] == '_data.csv': 
       airports.append(i[:-9])

store_data = []

for airport in airports: 
    airport_df = pd.read_csv(f'C:\\Users\\Ollie\\Desktop\\BSE\\Courses\\Foundations of Data Science\\Project\\Code\\FDS-project\\{airport}_data.csv', 
                     header = 1)
    airport_df.rename(columns = {'Stats': 'FlightDate', airport_df.columns[airport_df.columns.str.contains('Unnamed')][0]: 'Origin', \
               airport_df.columns[airport_df.columns.str.contains('Precipitation')][-1]: 'Precipitation'}, inplace = True)
    final_airport_df = airport_df[keep]
    final_airport_df = final_airport_df.iloc[1:]
    store_data.append(final_airport_df)


In [86]:
#Concatenate dataframes and reset index.
airports_weather = pd.concat(store_data, sort=False)
airports_weather.reset_index(drop=True, inplace=True )


In [None]:
#Merge the two dataframes on date and origin.

final_df = reduce(lambda  left,right: pd.merge(left,right,on=['column_name'],
                                            how='outer'), dfs)




In [None]:
#if airports don't return on Weather Undeground make a list of airport names that don't return.
# get airport names from here: https://github.com/datasets/airport-codes/tree/master/data. 

#They will return, I just need to add a K to the start of each string to identify it as an airport code.

In [None]:


## Cleaning the Dataset

## Cleaning the Dataset