In [1]:
# Import required libraries.

import pandas as pd
import numpy as np
import time

In [2]:
t_begin = time.time()
t_read = time.time()

In [3]:
# Read all datasets.

jan  = pd.read_csv("datasets/01_flight_details_january.csv")
feb  = pd.read_csv("datasets/02_flight_details_february.csv")
mar  = pd.read_csv("datasets/03_flight_details_march.csv")
apr  = pd.read_csv("datasets/04_flight_details_april.csv")
may  = pd.read_csv("datasets/05_flight_details_may.csv")
jun  = pd.read_csv("datasets/06_flight_details_june.csv")
jul  = pd.read_csv("datasets/07_flight_details_july.csv")
aug  = pd.read_csv("datasets/08_flight_details_august.csv")
sep  = pd.read_csv("datasets/09_flight_details_september.csv")
octo = pd.read_csv("datasets/10_flight_details_october.csv")
nov  = pd.read_csv("datasets/11_flight_details_november.csv")
dec  = pd.read_csv("datasets/12_flight_details_december.csv")


In [4]:
t_read_complete = time.time() - t_read

In [5]:
jan_copy = jan.copy()
feb_copy = feb.copy()
mar_copy = mar.copy()
apr_copy = apr.copy()
may_copy = may.copy()
jun_copy = jun.copy()
jul_copy = jul.copy()
aug_copy = aug.copy()
sep_copy = sep.copy()
octo_copy = octo.copy()
nov_copy = nov.copy()
dec_copy = dec.copy()

In [6]:
# Convert columns to appropriate datatypes.

def convert_variables(data_frame):
    
    t_convert = time.time()
        
    data_frame.FL_DATE = pd.to_datetime(data_frame.FL_DATE)
    
    data_frame.AIRLINE_ID = data_frame.AIRLINE_ID.astype("str")
    
    data_frame.FL_NUM = data_frame.FL_NUM.astype("str")
    
    data_frame.CRS_DEP_TIME = data_frame.CRS_DEP_TIME.astype("str").str.zfill(4).replace("2400", "2359")
    data_frame.CRS_ARR_TIME = data_frame.CRS_ARR_TIME.astype("str").str.zfill(4).replace("2400", "2359")    
    data_frame.CRS_DEP_TIME = pd.to_datetime(data_frame.CRS_DEP_TIME, format="%H%M").dt.time
    data_frame.CRS_ARR_TIME = pd.to_datetime(data_frame.CRS_ARR_TIME, format="%H%M").dt.time
    
    data_frame.DEP_TIME = data_frame.DEP_TIME.replace(np.nan, 0.0)
    data_frame.ARR_TIME = data_frame.ARR_TIME.replace(np.nan, 0.0)
    data_frame.DEP_TIME = data_frame.DEP_TIME.astype("int").astype("str").str.zfill(4).replace("2400", "2359")
    data_frame.ARR_TIME = data_frame.ARR_TIME.astype("int").astype("str").str.zfill(4).replace("2400", "2359")
    data_frame.DEP_TIME = pd.to_datetime(data_frame.DEP_TIME, format="%H%M").dt.time
    data_frame.ARR_TIME = pd.to_datetime(data_frame.ARR_TIME, format="%H%M").dt.time
    
    data_frame.DEP_DELAY_GROUP = data_frame.DEP_DELAY_GROUP.astype("int", errors="ignore")
    data_frame.ARR_DELAY_GROUP = data_frame.ARR_DELAY_GROUP.astype("int", errors="ignore")
    
    data_frame.DEP_DELAY = data_frame.DEP_DELAY.astype("int", errors="ignore")
    data_frame.ARR_DELAY = data_frame.ARR_DELAY.astype("int", errors="ignore")
    
    data_frame.CANCELLED = data_frame.CANCELLED.astype("bool")
    data_frame.DIVERTED = data_frame.DIVERTED.astype("bool")
    
    data_frame.TAIL_NUM = data_frame.TAIL_NUM.replace(np.NaN, "CANCELLED")
    data_frame["FLIGHT_ID"] = data_frame.index.astype("str").str.zfill(7) + "-" + data_frame["AIRLINE_ID"] + data_frame["TAIL_NUM"]
    
    data_frame.drop(["TAIL_NUM", "Unnamed: 24"], axis=1, inplace=True)
        
    data_frame["ORIGIN_CITY"], data_frame["ORIGIN_STATE"] = data_frame.ORIGIN_CITY_NAME.str.split(", ").str
    data_frame["DEST_CITY"], data_frame["DEST_STATE"] = data_frame.DEST_CITY_NAME.str.split(", ").str
    data_frame.drop(["ORIGIN_CITY_NAME", "DEST_CITY_NAME"], inplace=True, axis=1)
    
    t_convert_complete = time.time() - t_convert
    
    return data_frame, t_convert_complete

In [7]:
def replace_00(data_frame):
    
    t_replace = time.time()
    
    data_frame.CARRIER_DELAY = data_frame.CARRIER_DELAY.replace(0.0, np.nan)
    data_frame.WEATHER_DELAY = data_frame.WEATHER_DELAY.replace(0.0, np.nan)
    data_frame.NAS_DELAY = data_frame.NAS_DELAY.replace(0.0, np.nan)
    data_frame.SECURITY_DELAY = data_frame.SECURITY_DELAY.replace(0.0, np.nan)
    data_frame.LATE_AIRCRAFT_DELAY = data_frame.LATE_AIRCRAFT_DELAY.replace(0.0, np.nan)
    
    t_replace_complete = time.time() - t_replace
    
    return data_frame, t_replace_complete

In [8]:
def flight_status(data_frame):
    
    t_status = time.time()
    
    fls = []
    for index, flight in data_frame.iterrows():
        if flight.CANCELLED:
            FLIGHT_STATUS = "Cancelled"
        elif flight.DIVERTED:
            FLIGHT_STATUS = "Diverted"
        else:
            FLIGHT_STATUS = "Normal"

        fls.append({"FLIGHT_ID": flight.FLIGHT_ID,
                    "FLIGHT_STATUS": FLIGHT_STATUS})

    fls_df = pd.DataFrame(fls)
    data_frame.drop(["CANCELLED", "DIVERTED"], axis=1, inplace=True)
    data_frame = data_frame.merge(fls_df, on="FLIGHT_ID", how="left")
    
    t_status_complete = time.time() - t_status

    return data_frame, t_status_complete

In [9]:
time_list = [0]

In [10]:
jan_copy, a = convert_variables(jan_copy)
jan_copy, b = replace_00(jan_copy)
jan_copy, c = flight_status(jan_copy)
    
time_list.append([a, b, c])

In [11]:
feb_copy, a = convert_variables(feb_copy)
feb_copy, b = replace_00(feb_copy)
feb_copy, c = flight_status(feb_copy)    

time_list.append([a, b, c])

In [12]:
mar_copy, a = convert_variables(mar_copy)
mar_copy, b = replace_00(mar_copy)
mar_copy, c = flight_status(mar_copy)    

time_list.append([a, b, c])

In [13]:
apr_copy, a = convert_variables(apr_copy)
apr_copy, b = replace_00(apr_copy)
apr_copy, c = flight_status(apr_copy)    

time_list.append([a, b, c])

In [14]:
may_copy, a = convert_variables(may_copy)
may_copy, b = replace_00(may_copy)
may_copy, c = flight_status(may_copy)

time_list.append([a, b, c])

In [15]:
jun_copy, a = convert_variables(jun_copy)
jun_copy, b = replace_00(jun_copy)
jun_copy, c = flight_status(jun_copy)

time_list.append([a, b, c])

In [16]:
jul_copy, a = convert_variables(jul_copy)
jul_copy, b = replace_00(jul_copy)
jul_copy, c = flight_status(jul_copy)

time_list.append([a, b, c])

In [17]:
aug_copy, a = convert_variables(aug_copy)
aug_copy, b = replace_00(aug_copy)
aug_copy, c = flight_status(aug_copy)

time_list.append([a, b, c])

In [18]:
sep_copy, a = convert_variables(sep_copy)
sep_copy, b = replace_00(sep_copy)
sep_copy, c = flight_status(sep_copy)

time_list.append([a, b, c])

In [19]:
octo_copy, a = convert_variables(octo_copy)
octo_copy, b = replace_00(octo_copy)
octo_copy, c = flight_status(octo_copy)

time_list.append([a, b, c])

In [20]:
nov_copy, a = convert_variables(nov_copy)
nov_copy, b = replace_00(nov_copy)
nov_copy, c = flight_status(nov_copy)

time_list.append([a, b, c])

In [21]:
dec_copy, a = convert_variables(dec_copy)
dec_copy, b = replace_00(dec_copy)
dec_copy, c = flight_status(dec_copy)

time_list.append([a, b, c])

In [22]:
t_write = time.time()

In [23]:
data_frame_list = [jan_copy, feb_copy, mar_copy, 
                   apr_copy, may_copy, jun_copy, 
                   jul_copy, aug_copy, sep_copy, 
                   octo_copy, nov_copy, dec_copy]
master_df = pd.concat(data_frame_list)

In [24]:
master_df.to_csv("datasets/2018_cleaned_enhanced.csv", index=False)

In [25]:
t_write_complete = time.time() - t_write
t_end = time.time() - t_begin

In [26]:
print("January   : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[1]))
print("February  : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[2]))
print("March     : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[3]))
print("April     : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[4]))
print("May       : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[5]))
print("June      : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[6]))
print("July      : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[7]))
print("August    : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[8]))
print("September : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[9]))
print("October   : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[10]))
print("November  : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[11]))
print("December  : Convert: {}\tReplace: {}\tStatus: {}".format(*time_list[12]))

print("\nTotal     :", t_end)
print("Begin at    :", time.ctime(t_begin))
print("End at      :", time.ctime())

January   : Convert: 11.418611526489258	Replace: 0.03709602355957031	Status: 57.94325828552246
February  : Convert: 10.802898406982422	Replace: 0.029004573822021484	Status: 52.701080322265625
March     : Convert: 12.699680805206299	Replace: 0.04117870330810547	Status: 63.65720176696777
April     : Convert: 12.515369176864624	Replace: 0.03606748580932617	Status: 66.74021482467651
May       : Convert: 12.948500871658325	Replace: 0.03653454780578613	Status: 73.47364354133606
June      : Convert: 13.235288381576538	Replace: 0.03800773620605469	Status: 63.81502294540405
July      : Convert: 13.5190269947052	Replace: 0.038008689880371094	Status: 65.54305338859558
August    : Convert: 13.616312980651855	Replace: 0.03800773620605469	Status: 65.50683355331421
September : Convert: 12.44648289680481	Replace: 0.03300738334655762	Status: 58.652034521102905
October   : Convert: 12.731227397918701	Replace: 0.03500866889953613	Status: 61.759793519973755
November  : Convert: 12.210675954818726	Replace: