In [None]:
import csv
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import datetime
import numpy as np

In [None]:
# Read the flight/weather CSV and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier export -- TODO confirm).
data = pd.read_csv(
    r"2016_to_2020_flight_reduced_columns_with_weather_top_10.csv",
    index_col=False,
).drop(columns='Unnamed: 0')

In [None]:
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Preview the first 10 rows of the loaded frame.
data.head(10)

## Missing/Corrupted Data

In [None]:
# Keep only the rows whose Cancelled flag equals 0 (flights that were not cancelled).
not_cancelled = data['Cancelled'] == 0
df = data[not_cancelled]

In [None]:
# Keep only rows that have an ArrTime, renumber the rows from 0, and drop the
# CancellationCode and Flights columns.
# NOTE(review): Reporting_Airline and DOT_ID_Reporting_Airline are said to carry
# the same information; neither of them is dropped by this cell.
df = (
    df[df['ArrTime'].notnull()]
    .reset_index(drop=True)
    .drop(columns=['CancellationCode', 'Flights'])
)

In [None]:
# Show (rows, columns) of the frame after the filtering above.
df.shape

In [None]:
# Count the null entries in every column and keep only the columns
# whose count is non-zero.
null_counts = df.isnull().sum()
missing_stats = null_counts[null_counts != 0]

In [None]:
# Display the per-column null counts computed above.
missing_stats

In [None]:
# Fill DepDelay null with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `df.DepDelay` accessor: that chained in-place call
# operates on an intermediate Series, is deprecated in pandas 2.x, and leaves
# `df` unchanged when Copy-on-Write is enabled.
df['DepDelay'] = df['DepDelay'].fillna(0)

In [None]:
# Fill DivArrDelay and DivActualElapsedTime nulls with 0.
# Assign the filled columns back rather than using fillna(inplace=True) on a
# column accessor; the chained in-place form is deprecated in pandas 2.x and
# does not modify `df` when Copy-on-Write is enabled.
df['DivArrDelay'] = df['DivArrDelay'].fillna(0)
df['DivActualElapsedTime'] = df['DivActualElapsedTime'].fillna(0)

# Concatenate Reporting_Airline with Flight_Number_Reporting_Airline to form
# the flight number, then drop the now-redundant Reporting_Airline column.
# NOTE: this step consumes Reporting_Airline, so the cell cannot be re-run
# without re-creating `df` first.
df['Flight_Number_Reporting_Airline'] = (
    df['Reporting_Airline'].astype(str)
    + df['Flight_Number_Reporting_Airline'].astype(str)
)
df = df.drop(columns=['Reporting_Airline'])

# Convert FlightDate from ISO-format strings to datetime.date objects.
df['FlightDate'] = df['FlightDate'].apply(datetime.date.fromisoformat)

In [None]:
def parse_int_to_time(num):
    """Convert an hhmm-encoded clock value (e.g. ``1435`` or ``1435.0``) to ``datetime.time``.

    Parameters
    ----------
    num : int, float or str
        Clock time encoded as ``hour * 100 + minute``. The value ``2400`` is
        mapped to 23:59 because ``datetime.time`` cannot represent 24:00.

    Returns
    -------
    datetime.time or None
        The parsed time. ``None`` is returned (and the offending value is
        printed) when the input is missing (``None``/NaN), not convertible to
        an integer, or does not describe a valid hour/minute pair.
    """
    try:
        num = int(num)
        if num == 2400:
            num = 2359
        # Split hhmm into its hour and minute parts; datetime.time validates
        # the ranges and raises ValueError for e.g. minute >= 60.
        hour, minute = divmod(num, 100)
        return datetime.time(hour=hour, minute=minute)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None input; ValueError: NaN, bad string or out-of-range
        # hour/minute; OverflowError: infinite float.
        print(num)
        return None

In [None]:
# Convert every hhmm-encoded clock column to the value returned by
# parse_int_to_time, one column at a time.
time_columns = ['CRSDepTime', 'DepTime', 'WheelsOff', 'WheelsOn', 'CRSArrTime', 'ArrTime']
for col in time_columns:
    df[col] = df[col].apply(parse_int_to_time)

In [None]:
def fill_ArrDelay(x):
    """Fill a missing ``ArrDelay`` on one row from its scheduled and actual arrival times.

    Parameters
    ----------
    x : pandas.Series or mapping
        A row exposing ``ArrDelay``, ``CRSArrTime`` and ``ArrTime``; the two
        time fields are expected to be ``datetime.time`` values.

    Returns
    -------
    The same row object. When ``ArrDelay`` was missing and both times are
    present, ``ArrDelay`` is set to ``ArrTime - CRSArrTime`` in whole minutes
    (positive = late, negative = early). Rows that already have an ``ArrDelay``
    are returned unchanged.
    """
    if not pd.isna(x['ArrDelay']):
        return x

    scheduled = x['CRSArrTime']
    actual = x['ArrTime']
    if pd.isna(scheduled) or pd.isna(actual):
        # Nothing to compute from; leave the delay missing.
        return x

    # datetime.time objects cannot be subtracted directly, so anchor both on
    # the same arbitrary date.
    anchor = datetime.date(1, 1, 1)
    delta = (datetime.datetime.combine(anchor, actual)
             - datetime.datetime.combine(anchor, scheduled))
    # total_seconds() keeps the sign; timedelta.seconds would wrap a negative
    # difference around to almost 24 hours.
    minutes = int(delta.total_seconds() // 60)

    # The row carries no arrival date, so a flight crossing midnight shows up
    # as a ~24h jump. Assumes the true delay lies within +/-12 hours and folds
    # the difference into [-720, 720) minutes -- TODO confirm this assumption.
    if minutes < -720:
        minutes += 1440
    elif minutes >= 720:
        minutes -= 1440

    x['ArrDelay'] = minutes
    return x

In [None]:
# Compute ArrDelay for the rows where it is missing: select those rows once,
# run fill_ArrDelay over them row by row, and write the result back into df.
missing_arr_delay = df['ArrDelay'].isnull()
df2 = df[missing_arr_delay].apply(fill_ArrDelay, axis=1)
df[missing_arr_delay] = df2

## Feature engineering & Encoding

In [None]:
# Show the dtype of every column before encoding.
df.dtypes

In [None]:
def real_encoding(sev):
    """Map a severity label to an ordinal integer code.

    'Light' -> 1, 'Moderate' -> 2, 'Heavy' -> 3, 'Severe' -> 4.
    Any other value (NaN included) maps to 0.
    """
    ordered_levels = ('Light', 'Moderate', 'Heavy', 'Severe')
    for code, label in enumerate(ordered_levels, start=1):
        if sev == label:
            return code
    return 0

In [None]:
# Replace the label in each weather severity column with the integer code
# produced by real_encoding.
severity_columns = [
    'Severe-Cold_Severity',
    'Fog_Severity',
    'Hail_Severity',
    'Rain_Severity',
    'Snow_Severity',
    'Storm_Severity',
    'Other Precipitation_Severity',
]
for col in severity_columns:
    df[col] = df[col].apply(real_encoding)