# Preprocess Flights Data

In [2]:
!pip3 install geopy --user rms

Collecting geopy
  Using cached https://files.pythonhosted.org/packages/80/93/d384479da0ead712bdaf697a8399c13a9a89bd856ada5a27d462fb45e47b/geopy-1.20.0-py2.py3-none-any.whl
Collecting geographiclib<2,>=1.49
  Using cached https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.20.0


In [3]:
import pandas as pd
import numpy as np
import math
import geopy.distance
import matplotlib.pyplot as plt

## Function Definitions

In [4]:
# create epoch time column in flights and weather data
def get_epoch(str):
    dt_obj = datetime.strptime(str,'%Y-%m-%d %H:%M')
    millisec = dt_obj.timestamp()
    millisec = millisec - (millisec % 3600)
    return millisec*1000

# add epoch time from departure time to each flight
def format_as_epoch_time(year, month, dayofMonth, deptime):
    time_str = str(int(deptime))
    time_str = time_str[:-2] + ':' + time_str[-2:]
    
    my_str = "{}-{}-{} {}".format(year, month, dayofMonth, time_str)
    try:
        return get_epoch(my_str)
    except:
        return 0

## Reading the Data

In [6]:
flights = pd.read_csv('../2008.csv')
display(flights.shape)
display(flights.head(2))

(7009728, 29)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,...,4.0,8.0,0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,...,5.0,10.0,0,,0,,,,,


## Data processing

In [7]:
print('dropping flights with missing date an time...')
flights = flights.dropna(axis=0, subset=['Year', 'Month', 'DayofMonth', 'CRSDepTime'])

airports_interest = ["DFW","ORD","DEN","LAX","LGA","RDU","DCA","BOS","SAN","SJC"]
print('selecting flights departing from ', airports_interest)
flights = flights.loc[flights['Origin'].isin(airports_interest)]

print('creating epoch time column...')
flights['epoch_time'] = [format_as_epoch_time(year, month, dayofMonth, depTime) for year, month, dayofMonth, depTime in zip(flights['Year'], flights['Month'], flights['DayofMonth'], flights['CRSDepTime'])] 

dropping flights with missing date an time...
selecting flights departing from  ['DFW', 'ORD', 'DEN', 'LAX', 'LGA', 'RDU', 'DCA', 'BOS', 'SAN', 'SJC']
creating epoch time column...


## Save New Data

In [7]:
print("writing to csv...")
flights.to_csv("./flights_2008_processed.csv")

In [8]:
display(flights.shape)
display(flights.head(2))

(244, 30)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,epoch_time
2212555,2008,4,11,5,1615.0,1622,1846.0,1828,NW,331,...,35.0,0,,0,0.0,0.0,18.0,0.0,0.0,0
2797120,2008,5,11,7,1056.0,1030,1154.0,1115,MQ,4234,...,28.0,0,,0,0.0,1.0,13.0,0.0,25.0,0
