In [1]:
import math
from decimal import *
import numpy as np # linear algebra
import pandas as pd # read_csv and such
from geopy.distance import vincenty # calculate distance between locations
from io import StringIO # convert strings to buffers or something like that.
import os # for listing files in directory

### File paths for data sets

In [2]:
# --- Inputs: 2015 flight-delay data set (one CSV per table) ---
flights_path = '../data/flight_delays_2015/flights.csv'
airports_path = '../data/flight_delays_2015/airports.csv'
airlines_path = '../data/flight_delays_2015/airlines.csv'

# --- Inputs: weather data ---
stations_path = '../data/weather/stations.csv'
weather_path = '../data/weather/2015.csv'
# Directory holding the individual station data files.
weather_files_path = '../data/weather/station_data/'
# Single CSV that the combined station data is written to.
weather_merged_path = '../data/weather/weather_merged.csv'

# --- Outputs: modified copies of the input data sets ---
airports_modified_path = '../data/flight_delays_2015/airports_modified.csv'

### Read in the data sets

In [7]:
# Load the airlines table into a dataframe.
# low_memory=False asks pandas to infer each column's type from the whole
# file instead of chunk by chunk.
airlines_df = pd.read_csv(
    filepath_or_buffer=airlines_path,
    low_memory=False,
)
# Preview the first rows as the cell output.
airlines_df.head()

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways


In [8]:
# Load the airports table (the displayed columns include LATITUDE/LONGITUDE).
airports_df = pd.read_csv(
    filepath_or_buffer=airports_path,
    low_memory=False,
)
# Preview the first rows as the cell output.
airports_df.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [9]:
# Load the flights table; low_memory=False avoids chunk-wise type inference.
flights_df = pd.read_csv(
    filepath_or_buffer=flights_path,
    low_memory=False,
)
# Preview the first rows as the cell output.
flights_df.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [10]:
# Load the weather-station table (ids in usaf/wban, position in lat/lon).
stations_df = pd.read_csv(
    filepath_or_buffer=stations_path,
    low_memory=False,
)
# Preview the first rows as the cell output.
stations_df.head()

Unnamed: 0,usaf,wban,name,country,state,call,lat,lon,elev,begin,end
0,7011,99999,CWOS 07011,,,,,,,20120101,20121129
1,7025,99999,CWOS 07025,,,,,,,20120127,20120127
2,7034,99999,CWOS 07034,,,,,,,20121024,20121106
3,7047,99999,CWOS 07047,,,,,,,20120613,20120717
4,7059,99999,CWOS 07059,,,,,,,20120314,20120828


### Remove stations that are not in the US or are missing latitude/longitude
Since all of our airports are in the US, dropping every other station greatly reduces the number of airport-to-station distance calculations needed below.

In [None]:
# Keep a station only if it has a finite latitude AND longitude and is
# tagged with country 'US'; everything else is dropped.
has_coordinates = np.isfinite(stations_df['lat']) & np.isfinite(stations_df['lon'])
is_us_station = stations_df['country'] == 'US'
# NOTE(review): stations whose 'end' date predates 2015 are not removed here;
# confirm whether stations that no longer report should be excluded as well.
stations_df = stations_df[has_coordinates & is_us_station]
stations_df.head()

### Assign each airport primary and secondary weather stations
Assign each airport the two weather stations closest to it: the nearest becomes the primary station and the next nearest the secondary station.

In [None]:
def _two_nearest_stations(airport_coords, stations):
    """Return the ids of the two stations closest to ``airport_coords``.

    Parameters
    ----------
    airport_coords : tuple
        ``(latitude, longitude)`` of the airport.
    stations : iterable of tuple
        ``(usaf, wban, lat, lon)`` records, one per candidate station.

    Returns
    -------
    list of tuple
        Exactly two ``(usaf, wban)`` pairs, nearest first. A slot for which no
        station was found (fewer than two candidates) is ``(nan, nan)``.

    Notes
    -----
    Distances are compared with strict ``<``, so on a tie the station seen
    first keeps its rank.
    """
    nearest = runner_up = (np.nan, np.nan)
    nearest_dist = runner_up_dist = float('inf')
    for usaf, wban, lat, lon in stations:
        dist = vincenty(airport_coords, (lat, lon)).miles
        if dist < nearest_dist:
            # New closest station: the previous closest becomes the runner-up.
            runner_up, runner_up_dist = nearest, nearest_dist
            nearest, nearest_dist = (usaf, wban), dist
        elif dist < runner_up_dist:
            runner_up, runner_up_dist = (usaf, wban), dist
    return [nearest, runner_up]


# Pull the station fields out of the dataframe once, as plain tuples, so the
# per-airport scan does not rebuild a pandas row object for every comparison.
station_records = list(
    stations_df[['usaf', 'wban', 'lat', 'lon']].itertuples(index=False, name=None)
)

# NOTE(review): `vincenty` is reported as deprecated/removed in newer geopy
# releases in favour of `geodesic`; confirm against the installed version.
for index, row in airports_df.iterrows():
    airport_coords = (row['LATITUDE'], row['LONGITUDE'])
    if np.isfinite(airport_coords[0]) and np.isfinite(airport_coords[1]):
        primary, secondary = _two_nearest_stations(airport_coords, station_records)
    else:
        # Without usable coordinates no distance can be ranked; record NaN
        # ids rather than assigning arbitrary stations.
        primary = secondary = (np.nan, np.nan)
    airports_df.loc[index, 'primary_station_usaf'] = primary[0]
    airports_df.loc[index, 'primary_station_wban'] = primary[1]
    airports_df.loc[index, 'secondary_station_usaf'] = secondary[0]
    airports_df.loc[index, 'secondary_station_wban'] = secondary[1]
airports_df.head()

### Read in weather data for all the stations
Load all the station files and combine into a single dataframe

In [5]:
# List the station files in sorted order. os.listdir() returns entries in
# arbitrary order, so sorting keeps the merged row order the same on every run.
files = sorted(os.listdir(weather_files_path))
if not files:
    raise ValueError('No station files found in ' + weather_files_path)

# Parse each fixed-width station file, then concatenate once at the end;
# concatenating inside the loop would re-copy the accumulated frame each time.
station_frames = [
    pd.read_fwf(os.path.join(weather_files_path, file_name))
    for file_name in files
]
# Each file's own row index is kept (no ignore_index), as before.
weather_df = pd.concat(station_frames)

weather_df.head()

Unnamed: 0.1,Unnamed: 0,STN---,WBAN,YEARMODA,TEMP,Unnamed: 4,DEWP,Unnamed: 6,SLP,Unnamed: 8,...,Unnamed: 12,WDSP,Unnamed: 14,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,0,483280,99999,20150101,71.7,13,60.0,13,1018.2,8,...,13,1.5,13,6.0,999.9,84.2,59.9*,0.00I,999.9,0
1,1,483280,99999,20150102,71.2,15,57.1,15,1017.8,8,...,10,1.2,15,5.1,999.9,85.1,53.6*,0.00I,999.9,0
2,2,483280,99999,20150103,71.1,15,57.6,15,1016.7,8,...,15,0.7,15,4.1,999.9,86.0,53.6*,0.00I,999.9,0
3,3,483280,99999,20150104,72.9,14,59.7,14,1013.6,8,...,14,0.5,14,4.1,999.9,90.3,55.4*,0.00I,999.9,0
4,4,483280,99999,20150105,76.5,15,65.1,15,1010.4,8,...,11,0.5,15,2.9,999.9,91.6,59.0*,0.00I,999.9,0


### Save Dataframe
Save the merged weather dataframe to a file so the individual station files do not have to be read and combined again.

In [14]:
# Write the combined weather dataframe out as a comma-separated file.
weather_df.to_csv(path_or_buf=weather_merged_path, sep=',')