In [95]:
# import all the packages
import pandas as pd
import numpy as np
import math
import threading
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from scipy.stats import ttest_ind


# Read the flights, airlines and airports CSV files from the working directory.
data, airlines, airports = (
    pd.read_csv(csv_name)
    for csv_name in ("flights.csv", "airlines.csv", "airports.csv")
)


# Columns dropped because they provide no information.
NO_INFO_COLUMNS = (
    "Quarter", "Flight_Number_Reporting_Airline",
    "Div5TailNum", "Div5WheelsOff", "Div5LongestGTime", "Div5TotalGTime",
    "Div5WheelsOn", "Div5AirportSeqID", "Div5AirportID", "Div5Airport",
    "Div4TailNum", "Div4WheelsOff", "Div4LongestGTime", "Div4TotalGTime",
    "Div4WheelsOn", "Div4AirportSeqID", "Div4AirportID", "Div4Airport",
    "Div3TailNum", "Div3WheelsOff", "Div3LongestGTime", "Div3TotalGTime",
    "Div3WheelsOn", "Div3AirportSeqID", "Div3AirportID", "Div3Airport",
    "Div2TailNum", "Div2WheelsOff", "Div2LongestGTime", "Div2TotalGTime",
    "Div2WheelsOn", "Div2AirportSeqID", "Div2AirportID", "Div2Airport",
    "Div1TailNum", "Div1WheelsOff", "Div1LongestGTime", "Div1TotalGTime",
    "Div1WheelsOn", "Div1AirportSeqID", "Div1AirportID", "Div1Airport",
    "DivDistance", "DivArrDelay", "DivActualElapsedTime", "DivReachedDest",
    "DivAirportLandings", "LongestAddGTime", "DistanceGroup",
    "CancellationCode", "DestState", "DestStateFips", "DestStateName",
    "DestWac", "OriginWac", "OriginStateName", "OriginStateFips",
    "OriginState", "FirstDepTime", "TotalAddGTime", "Cancelled",
    "Unnamed: 109", "OriginCityName", "ArrDelay", "DepTime", "WheelsOff",
    "WheelsOn",
)

# Columns dropped because they can create multicollinearity issues.
MULTICOLLINEAR_COLUMNS = (
    "FlightDate", "Reporting_Airline", "Tail_Number",
    "DOT_ID_Reporting_Airline",
    "OriginAirportID", "OriginAirportSeqID", "OriginCityMarketID",
    "DestAirportID", "DestAirportSeqID", "DestCityMarketID", "DestCityName",
    "DepDelay", "DepartureDelayGroups", "DepTimeBlk", "ArrTime",
    "ArrivalDelayGroups", "ArrTimeBlk",
    "Diverted", "AirTime", "Flights",
)

# Delay related columns that are dropped.
DELAY_CAUSE_COLUMNS = (
    "CarrierDelay", "WeatherDelay",
    "NASDelay", "SecurityDelay", "LateAircraftDelay",
)

# Upper bounds (exclusive, in minutes) of delay categories 0..5; anything
# that is not below the last bound falls into category 6.
DELAY_BUCKET_EDGES = (15, 30, 45, 60, 90, 120)


def _busiest_airports(data, key, other, n):
    """
    Description: returns the n most frequent values of `key`, most frequent first

    :param data: flights dataframe
    :param key: column to rank ("Origin" or "Dest")
    :param other: column whose non-null entries are counted per `key` value
    :param n: number of values to keep
    :return: Series with the n selected `key` values
    """
    counts = data[[key, other]].groupby([key]).count()
    ranked = counts.sort_values(by=other, ascending=False)
    return ranked.head(n).reset_index()[key]


def _add_delay_type(data):
    """
    Description: adds "Del_type", the departure delay in minutes cut into 7 categories

    :param data: flights dataframe with a "DepDelayMinutes" column (modified in place)
    :return: None
    """
    # The first matching bound wins, exactly like an if/elif chain.  A missing
    # delay is not below any bound and therefore ends up in category 6.
    below_edge = [(data["DepDelayMinutes"] < edge).to_numpy()
                  for edge in DELAY_BUCKET_EDGES]
    categories = list(range(len(DELAY_BUCKET_EDGES)))
    data["Del_type"] = np.select(
        below_edge, categories, default=len(DELAY_BUCKET_EDGES)
    ).astype("int64")


def _add_hour_column(data, column):
    """
    Description: adds `column` + "Hour", the hour (rounded down) of a time in military format

    :param data: dataframe holding `column` without missing values (modified in place)
    :param column: name of the HHMM time column to be transformed
    :return: None
    """
    data[column + "Hour"] = np.floor(data[column] / 100).astype("int64")


def _add_airport_dummies(data, column, airports):
    """
    Description: adds one 0/1 column per airport (1 if `column` equals that airport)

    :param data: dataframe where the dummy columns are added (modified in place)
    :param column: column the dummies are derived from
    :param airports: iterable of airport codes, one dummy column each
    :return: None
    """
    for code in airports:
        data["dummy" + str(column) + str(code)] = (data[column] == code).astype("int64")


def preprocess(data, n):
    """
    Description: reduces the flights data to routes between the n busiest origin
    and the n busiest destination airports, drops unused columns and adds the
    delay category, scheduled hour and airport dummy columns

    :param data: raw flights dataframe (not modified)
    :param n: number of busiest origin airports and of busiest destination
        airports to keep
    :return: new dataframe; "Origin", "Dest", "CRSDepTime" and "CRSArrTime" are
        replaced by "CRSDepTimeHour", "CRSArrTimeHour" and the dummy columns
    """
    main_origins = _busiest_airports(data, "Origin", "Dest", n)
    main_dests = _busiest_airports(data, "Dest", "Origin", n)

    on_main_route = data["Origin"].isin(main_origins) & data["Dest"].isin(main_dests)
    # Work on an explicit copy so the columns added below never write into a
    # slice of the caller's frame.
    data = data[on_main_route].copy()

    data = data.drop(
        columns=list(NO_INFO_COLUMNS + MULTICOLLINEAR_COLUMNS + DELAY_CAUSE_COLUMNS)
    )

    _add_delay_type(data)

    # Rows without a scheduled departure or arrival time cannot get an hour.
    data.dropna(subset=["CRSDepTime", "CRSArrTime"], inplace=True)
    _add_hour_column(data, "CRSDepTime")
    _add_hour_column(data, "CRSArrTime")
    data.drop(["CRSDepTime", "CRSArrTime"], axis=1, inplace=True)

    _add_airport_dummies(data, "Origin", main_origins)
    _add_airport_dummies(data, "Dest", main_dests)
    data.drop(["Origin", "Dest"], axis=1, inplace=True)

    return data


# Keep only flights whose origin is among the 10 busiest origins and whose
# destination is among the 10 busiest destinations, then build the features.
number_of_airports_included = 10
data = preprocess(data, number_of_airports_included)


def Airport(airports):
    """
    Description: builds lookup tables from IATA code to city, latitude and longitude

    :param airports: dataframe with "IATA_CODE", "CITY", "LATITUDE" and
        "LONGITUDE" columns
    :return: tuple (identify_airport, latitude_airport, longitude_airport) of
        dicts keyed by IATA code
    """
    # Index once and reuse it for all three lookups.
    by_code = airports.set_index('IATA_CODE')
    identify_airport = by_code['CITY'].to_dict()
    latitude_airport = by_code['LATITUDE'].to_dict()
    longitude_airport = by_code['LONGITUDE'].to_dict()

    # Previously the three dicts were built and then discarded.
    return identify_airport, latitude_airport, longitude_airport


def Airlines(self):
    """
    Description: placeholder intended to locate all the different airlines and
    name them; currently does nothing

    :param self: unused
    :return: None
    """
    return None


def DelayType(self):
    """
    Description: adds a "DELAY_LEVEL" column to the module-level dataframe `df`,
    derived from its "DEPARTURE_DELAY" column:
    2 for delays above 30, 1 for delays above 5 (up to 30), 0 otherwise

    :param self: unused
    :return: None
    """
    def level_for(minutes):
        return 2 if minutes > 30 else (1 if minutes > 5 else 0)

    # NOTE(review): `df` is not defined in this part of the file - it has to
    # exist as a global before this function is called.
    df['DELAY_LEVEL'] = df['DEPARTURE_DELAY'].apply(level_for)
    return


def DelaySort(self):
    """
    Description: placeholder intended to find the reason for a delay (delay in
    departure or in arrival, i.e. caused by extra unexpected flight duration
    or not); currently does nothing

    :param self: unused
    :return: None
    """
    return None


def CompareAirlines(self):
    """
    Description: computes min, max, count and mean of "DEPARTURE_DELAY" per
    "AIRLINE" from the module-level dataframe `df`

    :param self: unused
    :return: dataframe with one row per airline, sorted by ascending count
    """
    def summarize(delays):
        return dict(min=delays.min(), max=delays.max(),
                    count=delays.count(), mean=delays.mean())

    # NOTE(review): `df` is not defined in this part of the file - it has to
    # exist as a global before this function is called.
    delays_by_airline = df['DEPARTURE_DELAY'].groupby(df['AIRLINE'])
    per_airline = delays_by_airline.apply(summarize).unstack()
    return per_airline.sort_values('count')



  interactivity=interactivity, compiler=compiler, result=result)


In [96]:
# Preview the first rows of the preprocessed frame.
data.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,IATA_CODE_Reporting_Airline,DepDelayMinutes,DepDel15,TaxiOut,TaxiIn,ArrDelayMinutes,...,dummyDestATL,dummyDestORD,dummyDestDFW,dummyDestDEN,dummyDestCLT,dummyDestLAX,dummyDestIAH,dummyDestPHX,dummyDestLGA,dummyDestSFO
882,2019,11,1,5,OO,0.0,0.0,29.0,9.0,0.0,...,0,0,0,0,0,1,0,0,0,0
950,2019,11,1,5,OO,0.0,0.0,13.0,10.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2101,2019,11,1,5,OO,0.0,0.0,23.0,8.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2105,2019,11,1,5,OO,0.0,0.0,19.0,11.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2200,2019,11,1,5,OO,0.0,0.0,26.0,6.0,11.0,...,0,0,0,1,0,0,0,0,0,0


In [97]:
# Count missing values per column after preprocessing.
data.isnull().sum()

Year                             0
Month                            0
DayofMonth                       0
DayOfWeek                        0
IATA_CODE_Reporting_Airline      0
DepDelayMinutes                327
DepDel15                       327
TaxiOut                        333
TaxiIn                         340
ArrDelayMinutes                377
ArrDel15                       377
CRSElapsedTime                   0
ActualElapsedTime              377
Distance                         0
Del_type                         0
CRSDepTimeHour                   0
CRSArrTimeHour                   0
dummyOriginATL                   0
dummyOriginORD                   0
dummyOriginDFW                   0
dummyOriginDEN                   0
dummyOriginCLT                   0
dummyOriginLAX                   0
dummyOriginIAH                   0
dummyOriginPHX                   0
dummyOriginLGA                   0
dummyOriginSFO                   0
dummyDestATL                     0
dummyDestORD        

In [98]:
# Reduce the frame to the model inputs and the ArrDel15 target: drop the
# remaining delay, taxi and elapsed-time measurements together with Year,
# CRSElapsedTime, Distance, Del_type and CRSArrTimeHour.
data = data.drop(columns=[
    "Year", "TaxiOut", "TaxiIn", "DepDelayMinutes",
    "DepDel15", "ArrDelayMinutes", "CRSElapsedTime",
    "Distance", "Del_type", "ActualElapsedTime", "CRSArrTimeHour",
])
data.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,IATA_CODE_Reporting_Airline,ArrDel15,CRSDepTimeHour,dummyOriginATL,dummyOriginORD,dummyOriginDFW,dummyOriginDEN,...,dummyDestATL,dummyDestORD,dummyDestDFW,dummyDestDEN,dummyDestCLT,dummyDestLAX,dummyDestIAH,dummyDestPHX,dummyDestLGA,dummyDestSFO
882,11,1,5,OO,0.0,8,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
950,11,1,5,OO,0.0,18,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2101,11,1,5,OO,0.0,9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2105,11,1,5,OO,0.0,14,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2200,11,1,5,OO,0.0,20,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [99]:
# Give the carrier code column a shorter name.
data = data.rename(columns={"IATA_CODE_Reporting_Airline": "Airline"})

In [100]:
# Re-check missing values on the reduced column set.
data.isnull().sum()

Month               0
DayofMonth          0
DayOfWeek           0
Airline             0
ArrDel15          377
CRSDepTimeHour      0
dummyOriginATL      0
dummyOriginORD      0
dummyOriginDFW      0
dummyOriginDEN      0
dummyOriginCLT      0
dummyOriginLAX      0
dummyOriginIAH      0
dummyOriginPHX      0
dummyOriginLGA      0
dummyOriginSFO      0
dummyDestATL        0
dummyDestORD        0
dummyDestDFW        0
dummyDestDEN        0
dummyDestCLT        0
dummyDestLAX        0
dummyDestIAH        0
dummyDestPHX        0
dummyDestLGA        0
dummyDestSFO        0
dtype: int64

In [101]:
# Inspect the rows whose ArrDel15 value is missing.
data[data["ArrDel15"].isnull()]

Unnamed: 0,Month,DayofMonth,DayOfWeek,Airline,ArrDel15,CRSDepTimeHour,dummyOriginATL,dummyOriginORD,dummyOriginDFW,dummyOriginDEN,...,dummyDestATL,dummyDestORD,dummyDestDFW,dummyDestDEN,dummyDestCLT,dummyDestLAX,dummyDestIAH,dummyDestPHX,dummyDestLGA,dummyDestSFO
18335,11,29,5,OO,,16,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
18383,11,29,5,OO,,11,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
18556,11,29,5,OO,,17,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
18659,11,29,5,OO,,6,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
18667,11,29,5,OO,,11,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
18746,11,29,5,OO,,19,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
18748,11,29,5,OO,,13,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
20650,11,30,6,OO,,17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
20834,11,30,6,OO,,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
20856,11,30,6,OO,,17,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [102]:
# Replace missing ArrDel15 values with 1, i.e. count those flights as delayed;
# according to the original note these flights were cancelled.
# NOTE(review): the "Cancelled" column is dropped in preprocess, so this
# cannot be verified from the frame at this point - confirm.
data = data.fillna({'ArrDel15': 1})

In [103]:
# Optional: one-hot encode the airline instead of dropping it, if that turns
# out to be necessary.

#data["Airline"].unique()
#data = pd.get_dummies(data, columns=["Airline"])

# Remove the airline column so that it is not used as a feature.

data = data.drop(["Airline"], axis=1)

In [104]:
# Confirm that no missing values remain before fitting the model.
data.isnull().sum()

Month             0
DayofMonth        0
DayOfWeek         0
ArrDel15          0
CRSDepTimeHour    0
dummyOriginATL    0
dummyOriginORD    0
dummyOriginDFW    0
dummyOriginDEN    0
dummyOriginCLT    0
dummyOriginLAX    0
dummyOriginIAH    0
dummyOriginPHX    0
dummyOriginLGA    0
dummyOriginSFO    0
dummyDestATL      0
dummyDestORD      0
dummyDestDFW      0
dummyDestDEN      0
dummyDestCLT      0
dummyDestLAX      0
dummyDestIAH      0
dummyDestPHX      0
dummyDestLGA      0
dummyDestSFO      0
dtype: int64

In [105]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; ArrDel15 is the target and every
# other column is a feature. The fixed seed makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    data.drop(columns=['ArrDel15']),
    data['ArrDel15'],
    test_size=0.2,
    random_state=42,
)




In [106]:
# (rows, feature columns) of the training split.
train_x.shape

(31160, 24)

In [107]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters and a fixed seed.
# The repr echoed for this run reports n_estimators=10.
model = RandomForestClassifier(random_state=13)
# Left as the last bare expression so the notebook echoes the fitted model.
model.fit(train_x, train_y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=13, verbose=0,
                       warm_start=False)

In [108]:
# Mean accuracy on the training split.
model.score(train_x, train_y)

0.945795892169448

In [109]:
# Class predictions for the held-out rows.
# NOTE(review): `predicted` is not used by any later cell visible here.
predicted = model.predict(test_x)
# Mean accuracy on the held-out split.
model.score(test_x, test_y)

0.8254396098061866

In [110]:
def predict_delay(departure_date_time, origin, destination, classifier=None):
    """
    Description: returns the predicted probability of the first class of the
    classifier (ArrDel15 == 0, i.e. arriving on time) for a single flight

    :param departure_date_time: scheduled departure as 'dd/mm/YYYY HH:MM:SS'
    :param origin: IATA code of the origin airport (case-insensitive)
    :param destination: IATA code of the destination airport (case-insensitive)
    :param classifier: optional fitted model exposing predict_proba; defaults
        to the module-level `model`
    :return: probability as returned by predict_proba()[0][0], or an error
        message string if the date/time cannot be parsed
    """
    from datetime import datetime
    import warnings

    try:
        departure = datetime.strptime(departure_date_time, '%d/%m/%Y %H:%M:%S')
    except ValueError as e:
        return 'Error parsing date/time - {}'.format(e)

    origin = origin.upper()
    destination = destination.upper()

    # Airports that have a dummy column, in the column order used for training.
    known_airports = ('ATL', 'ORD', 'DFW', 'DEN', 'CLT',
                      'LAX', 'IAH', 'PHX', 'LGA', 'SFO')

    # An airport without a dummy column is encoded as all zeros; the result is
    # still returned, but the caller is told that the input is out of range.
    for role, code in (('origin', origin), ('destination', destination)):
        if code not in known_airports:
            warnings.warn(
                "{} airport '{}' has no dummy column; "
                "the prediction may be unreliable".format(role, code)
            )

    features = {
        'Month': departure.month,
        'DayofMonth': departure.day,
        'DayOfWeek': departure.isoweekday(),
        'CRSDepTimeHour': departure.hour,
    }
    for code in known_airports:
        features['dummyOrigin' + code] = 1 if origin == code else 0
    for code in known_airports:
        features['dummyDest' + code] = 1 if destination == code else 0

    # Pin the column order explicitly instead of relying on how DataFrame
    # orders the keys of a list of dicts.
    row = pd.DataFrame([features], columns=list(features))

    fitted = model if classifier is None else classifier
    return fitted.predict_proba(row)[0][0]

In [113]:
# Example query: 12 November 2018 at 11:45, JFK -> ATL (the date is day/month/year).
# NOTE(review): JFK is not one of the origins with a dummy column in
# predict_delay, so every dummyOrigin* feature is 0 for this call.
predict_delay('12/11/2018 11:45:00', 'JFK', 'ATL')

0.8

In [112]:
# The value returned by predict_delay is the model's predicted probability that ArrDel15 == 0, i.e. that the flight will arrive at the destination on time.