In [None]:
import pandas as pd
from sqlalchemy import create_engine
import collections
import json
#from config import db_password

# Lift pandas' display truncation for both axes so wide/long frames render in full.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [None]:
# Location of the raw flights extract, relative to the notebook's working directory.
file_loc = "resources/flights.csv"

# low_memory=False makes pandas parse each column in one pass instead of in chunks.
df = pd.read_csv(file_loc, low_memory=False)

# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Number of rows currently in the frame.
print(df.shape[0])

In [None]:
# Keep only completed flights: remove every row flagged as cancelled or diverted.
# A boolean mask is used instead of dropping by index label, so the filter is a
# single pass and stays correct even if the index ever holds duplicate labels.
completed = (df['CANCELLED'] != 1) & (df['DIVERTED'] != 1)
df = df[completed]

# Columns removed below, with the author's stated rationale:
#   YEAR: the data is from 2015.
#   CANCELLED / CANCELLATION_REASON / DIVERTED: we are only concerned with flights
#       that were completed, with a delay time (those rows were just filtered out).
#   DEPARTURE_TIME: the goal is predicting delay; having both the scheduled time and
#       the departure time would give the answer to the ML model.
#   TAXI_OUT / WHEELS_OFF / SCHEDULED_TIME: treated as redundant (linear dependency).
#
# Linear dependencies as recorded by the author:
#     SCHEDULED_DEPARTURE - DEPARTURE_TIME = DEPARTURE_DELAY   drop DEPARTURE_TIME
#     TAXI_OUT + WHEELS_OFF + SCHEDULED_TIME = ELAPSED_TIME    drop TAXI_OUT, WHEELS_OFF, SCHEDULED_TIME
#     ARRIVAL_DELAY = AIR_SYSTEM_DELAY + SECURITY_DELAY + AIRLINE_DELAY + LATE_AIRCRAFT_DELAY + WEATHER_DELAY      drop ?
# NOTE(review): these identities are carried over from the original notes and are not
# checked anywhere in this notebook - confirm them against the dataset's documentation.
# TODO: decide which (if any) of the delay-component columns should be dropped.

# Reassign instead of inplace=True so re-running the cell cannot mutate a frame
# that another name still refers to.
df = df.drop(columns=['YEAR', 'CANCELLED', 'CANCELLATION_REASON', 'DIVERTED', 'DEPARTURE_TIME', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME'])

df.head()

In [None]:
# Only keep trips to/from airports with over 20000 visits.
# "Visits" here means the number of rows in which the airport is the ORIGIN_AIRPORT.
MIN_AIRPORT_VISITS = 20000

# value_counts() orders airports from most to least frequent.
visits_totals = df['ORIGIN_AIRPORT'].value_counts()

# Map airport code -> encoding value, where the value is the airport's 1-based
# position in visits_totals (starting at 1 to prevent a 0 for encoding later on).
# Iterating over .items() avoids `visits_totals[i]`: an integer key on a
# string-labelled Series relies on a deprecated positional fallback.
airport_dict = {
    airport: rank
    for rank, (airport, visits) in enumerate(visits_totals.items(), start=1)
    if visits > MIN_AIRPORT_VISITS
}

# Same mapping ordered alphabetically by airport code, for later use with project visualizations.
sorted_airport_dict = collections.OrderedDict(sorted(airport_dict.items()))

In [None]:
# Remove all rows with airports not in airport_dict (only concerned with popular locations).

# Airports to remove: every origin airport that did not make it into airport_dict,
# kept in the same order as visits_totals.
all_airpots = visits_totals.index  # (sic) name kept unchanged in case later cells reference it
kept_airports = airport_dict.keys()
to_drop = [airport for airport in all_airpots if airport not in airport_dict]
# NOTE(review): to_drop is derived from ORIGIN_AIRPORT counts only, so a code that
# appears solely as a destination is never added to it - confirm this is intended.

# Extra codes that are always removed, whether or not they passed the popularity cut.
# NOTE(review): presumably numeric airport identifiers that show up in place of the
# usual letter codes - confirm against the raw data. If any of them is in
# airport_dict it will still be exported with the encoding dictionary.
to_drop.extend(['13930', '10397', '11298'])

# One vectorised pass over the frame instead of two full scans per airport.
unwanted = df['ORIGIN_AIRPORT'].isin(to_drop) | df['DESTINATION_AIRPORT'].isin(to_drop)
df = df[~unwanted]

# List the removed airport codes, one per line.
print(*to_drop, sep="\n")
    

In [None]:
# Number of rows currently in the frame.
print(df.shape[0])

In [None]:
# Build the AIRLINE encoding: each airline code maps to its 1-based position in
# the value_counts() result (starting at 1 so that no airline is encoded as 0).
airlines_totals = df['AIRLINE'].value_counts()
airlines_dict = {
    airline: rank
    for rank, airline in enumerate(airlines_totals.index, start=1)
}

In [None]:
# Export the airport and airline encoding dictionaries to JSON.

# Saving airport dictionary.
# The context manager closes the file even if json.dump raises part-way through.
with open("resources/airport_dict.json", "w") as airport_file:
    json.dump(airport_dict, airport_file)

# To read it back:
#airport_file = open("resources/airport_dict.json", "r")
#output = airport_file.read()
#print(output)


# Saving airline dictionary.
with open("resources/airline_dict.json", "w") as airline_file:
    json.dump(airlines_dict, airline_file)

In [None]:
#df['ORIGIN_AIRPORT'].value_counts()

In [None]:
#df['DESTINATION_AIRPORT'].value_counts()

In [None]:
#df['AIRLINE'].value_counts()

In [None]:
# Preview the first three rows of the current frame.
df.head(n=3)

In [None]:
# Shuffle the dataset's row order.
df = df.sample(
    frac=1,          # sample 100% of the rows, i.e. keep them all in a new order
    random_state=1,  # fixed seed so the shuffle is reproducible between runs
)

In [None]:
# Preview the first three rows of the current frame.
df.head(n=3)

In [None]:
# Export the cleaned dataset.
# index=True is the pandas default, spelled out here: the frame's index is written
# as the first, unnamed column of the CSV.
df.to_csv("resources/flights_cleaned.csv", index=True)

In [None]:
# Count the missing values in each column (sum down the rows, axis=0).
df.isna().sum(axis=0)