In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
# Single source of truth for the columns loaded from the monthly CSVs: each
# entry pairs a column name with the dtype pandas should parse it as.
# Keeping name and dtype side by side means the column list and the dtype
# mapping cannot drift apart.
_COLUMN_DTYPES = (
    ('DayOfWeek',                       int),
    ('FlightDate',                      'string'),
    ('IATA_CODE_Reporting_Airline',     'string'),
    ('Tail_Number',                     'string'),
    ('Flight_Number_Reporting_Airline', int),
    ('OriginAirportID',                 int),
    ('Origin',                          'string'),
    ('OriginState',                     'string'),
    ('DestAirportID',                   int),
    ('Dest',                            'string'),
    ('DestState',                       'string'),
    ('CRSDepTime',                      int),
    ('DepTime',                         np.float64),
    ('DepDelayMinutes',                 np.float64),
    ('CRSArrTime',                      int),
    ('ArrTime',                         np.float64),
    ('ArrDelayMinutes',                 np.float64),
    ('CRSElapsedTime',                  np.float64),
    ('ActualElapsedTime',               np.float64),
    ('AirTime',                         np.float64),
    ('Cancelled',                       int),
    ('CancellationCode',                'string'),
    ('Diverted',                        int),
    ('CarrierDelay',                    np.float64),
    ('WeatherDelay',                    np.float64),
    ('NASDelay',                        np.float64),
    ('SecurityDelay',                   np.float64),
    ('LateAircraftDelay',               np.float64),
)

# Column names, in table order (passed to read_csv as `usecols`).
delay_features = [column for column, _ in _COLUMN_DTYPES]

# Column name -> dtype mapping (passed to read_csv as `dtype`).
dtypes = dict(_COLUMN_DTYPES)

In [3]:
# Monthly extracts to load: one CSV per (year, month) combination below.
years = ['2017', '2018', '2019']
months = [f'{n:02}' for n in range(1, 12 + 1)]  # '01' .. '12', zero-padded to match the file names

# Year is the OUTER loop so the files are concatenated chronologically
# (2017-01 ... 2017-12, 2018-01, ...). Nesting the loops the other way round
# interleaves the years (2017-01, 2018-01, 2019-01, 2017-02, ...), which leaves
# the combined frame out of date order.
df_all_months = (
    pd.read_csv(
        f'data/airline-performance/individual/aperf-{year}-{month}.csv',
        usecols=delay_features,  # load only the columns named in delay_features
        dtype=dtypes,            # parse each column with its declared dtype
    )
    for year in years
    for month in months
)

# ignore_index=True discards the per-file row labels and assigns one
# continuous 0..N-1 index to the combined frame.
delay_df = pd.concat(df_all_months, ignore_index=True)
delay_df

Unnamed: 0,DayOfWeek,FlightDate,IATA_CODE_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,Origin,OriginState,DestAirportID,Dest,...,CancellationCode,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,7,2017-01-01,AA,N787AA,1,12478,JFK,NY,12892,LAX,...,,0,402.0,398.0,347.0,27.0,0.0,0.0,0.0,0.0
1,1,2017-01-02,AA,N788AA,1,12478,JFK,NY,12892,LAX,...,,0,402.0,417.0,362.0,,,,,
2,2,2017-01-03,AA,N783AA,1,12478,JFK,NY,12892,LAX,...,,0,402.0,384.0,354.0,,,,,
3,3,2017-01-04,AA,N799AA,1,12478,JFK,NY,12892,LAX,...,,0,402.0,395.0,360.0,,,,,
4,4,2017-01-05,AA,N788AA,1,12478,JFK,NY,12892,LAX,...,,0,402.0,408.0,387.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20302848,7,2019-12-01,DL,N346DN,830,10397,ATL,GA,12953,LGA,...,,0,122.0,110.0,93.0,,,,,
20302849,7,2019-12-01,DL,N697DL,831,13204,MCO,FL,13487,MSP,...,,0,209.0,206.0,191.0,0.0,0.0,0.0,0.0,23.0
20302850,7,2019-12-01,DL,N697DL,831,13487,MSP,MN,13204,MCO,...,,0,200.0,237.0,157.0,0.0,0.0,36.0,0.0,0.0
20302851,7,2019-12-01,DL,N6714Q,832,11278,DCA,VA,14869,SLC,...,,0,293.0,303.0,281.0,,,,,


In [4]:
# Maps an airport code (first CSV column) to a (latitude, longitude) tuple of
# floats taken from the second and third columns. Any further columns in the
# file are ignored.
airportCoordMap = {}

# newline='' is what the csv module requires of file objects it reads, so that
# line endings inside quoted fields are interpreted correctly.
with open('latitude_longitude.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')

    # The first row is the header; next(..., None) copes with an empty file.
    header = next(csv_reader, None)
    line_count = 0
    if header is not None:
        print(f'Column names are {", ".join(header)}')
        line_count = 1

    for row in csv_reader:
        line_count += 1
        if not row:
            # csv.reader yields [] for a blank line (e.g. a trailing newline);
            # count it but do not try to index into it.
            continue
        iata_code, latitude, longitude = row[:3]
        airportCoordMap[iata_code] = (float(latitude), float(longitude))

    # Total includes the header row.
    print(f'Processed {line_count} lines.')

print(airportCoordMap)

Column names are iata_code, latitude_deg, longitude_deg, elevation_m
Processed 366 lines.
{'JFK': (40.639447, -73.779317), 'LAX': (33.942501, -118.407997), 'DFW': (32.896801, -97.038002), 'OKC': (35.393101, -97.6007), 'OGG': (20.898543, -156.431212), 'HNL': (21.32062, -157.924228), 'SFO': (37.61899948, -122.375), 'MIA': (25.79319954, -80.29060364), 'ORD': (41.9786, -87.9048), 'IAH': (29.9843998, -95.34140015), 'BOS': (42.3643, -71.005203), 'DTW': (42.21239853, -83.35340118), 'SEA': (47.449162, -122.311134), 'MSP': (44.882, -93.221802), 'STL': (38.748697, -90.370003), 'MCO': (28.42939949, -81.30899811), 'KOA': (19.738783, -156.045603), 'LAS': (36.083361, -115.151817), 'MEM': (35.04240036, -89.97669983), 'PDX': (45.58869934, -122.5979996), 'DCA': (38.8521, -77.037697), 'SAN': (32.73360062, -117.1900024), 'TUS': (32.115004, -110.938053), 'CLT': (35.2140007, -80.94309998), 'SJC': (37.362452, -121.929188), 'PHX': (33.435302, -112.005905), 'SNA': (33.675701, -117.867996), 'LGA': (40.777199, 

In [5]:
# Distinct airport codes seen as a flight origin and as a flight destination.
origin_airports, dest_airports = (
    delay_df[column].unique().tolist() for column in ('Origin', 'Dest')
)

# Codes excluded from the airport list before the coordinate check.
# NOTE(review): the reason for excluding 'USA' is not recorded here — confirm.
airports_to_remove = ['USA']

# Every code that appears on either end of a flight, minus the exclusions.
unique_codes = set(origin_airports + dest_airports)
airports = [code for code in unique_codes if code not in airports_to_remove]

# Airports that have flights but no entry in the coordinate lookup.
missing_airports = [code for code in airports if code not in airportCoordMap]

print("Number of airports: ", len(airports))
print(missing_airports)

Number of airports:  365
[]
