In [1]:
from pickle import *
import numpy as np

model = load(open("flights.pickle.dat", 'rb'))



In [2]:
from flights_v7 import *

In [3]:
# Time: departure hour - model cant work with CRS Dep Time (mins goes to 60)
# Avg_tax_out: average taxi out by airline and airport - can't use individual taxi out for new flights - MinMaxScaler 
# Avg_tax_in: average taxi in by airline and airport - MinMaxScaler
# Airline_delay: average delay by airline and airport - MinMaxScaler

In [4]:
# revised recommender function - works with arrival time

def recommend_probs(arrival_time, origin, destination, airline = None, threshold = 2, rec_data = data, model = model):
    
    # Transform arrival_time into an integer and military format
    
    hour = arrival_time[11:13]
    minutes = arrival_time[14:] 
    latest_time = int(hour + minutes)
    
    # Search for appropiate flights
    
    rec_data = rec_data[rec_data["CRS_ARR_TIME"] < latest_time]
    
    # Parse month and day of the week (why only monday and sunday)
    
    from datetime import datetime
    
    try:
        arrival_time_parsed = datetime.strptime(arrival_time, '%d/%m/%Y %H:%M')
    except ValueError as e:
        return 'Error parsing date/time - {}'.format(e)

    day_of_week = arrival_time_parsed.isoweekday()
    month = arrival_time_parsed.month
    
    # Filter the appropiate day of the week
    
    rec_data = rec_data[rec_data["DAY_OF_WEEK"] == day_of_week]
    rec_data = rec_data[rec_data["MONTH"] == month]
            
    # Filter out the airlines
    
    airlines = ['9E','AA', 'AS', 'DL', 'EV', 'F9', 'MQ', 'NK', 'OH', 'OO', 'UA','WN', 'YV', 'YX']
    
    if airline in airlines:
            
            rec_data = rec_data[rec_data["airline_" + airline] == 1]
            
    # Filter out the destination and origin
    
    rec_data = rec_data[rec_data["origin_" + origin] == 1]
    rec_data = rec_data[rec_data["dest_" + destination] == 1]
    
    # Apply threshold for earliest arrival time
    
    earliest_time = latest_time - (threshold * 100)
    
    rec_data = rec_data[rec_data["CRS_ARR_TIME"] > earliest_time]
    
    # Prepare the data to be inputed to the model to predict delats
    
    model_data = rec_data.drop(["MONTH", "CRS_DEP_TIME", "CRS_ARR_TIME", "ARR_DEL15"], axis = 1)
    
    # Predict the selected flights
  
    y_pred = model.predict_proba(model_data.values)
    predictions = y_pred[:,0]
    
    # Attach predictions into the model data to create the output data
    
    rec_data["Probability_delay"] = predictions
    
    return rec_data


def recommend_output(arrival_time, origin, destination, airline = None, threshold = 2, rec_data = data, model = model):
    
    print('\033[1m' + "Origin: " + origin + '\033[0m')
    print('\033[1m' + "Destination: " + destination + '\033[0m')
    
    if airline != None:
    
        print('\033[1m' + "Airline: " + airline + '\033[0m')
        
    # Get the probability predictions
    
    data = recommend_probs(arrival_time, origin, destination, airline = None, threshold = 2, rec_data = rec_data, model = model)
    
    # Create the date column
    
    date_list = []
    
    for x in range(len(data)):
        
        date_list.append(arrival_time[:10])
    
    data["DATE"] = date_list
    
    # Prepare airline column if there is no input value (go through each instance and check whether its a one, and select that column value)
    
    if airline == None:
        
        airline_table = data.filter(regex = "airline").drop(["airline_delay"], axis = 1)
    
        airline = []
    
        for x in np.arange(len(airline_table)):
    
            for y in np.arange(len(airline_table.columns)): 
        
                if airline_table.iloc[x][y] == 1:
                
                    # string = airline_table.columns[y]
                
                    airline.append(airline_table.columns[y][8:])
                
        # Prepare output
    
        data["Airline"] = airline
    
        output_data = data[["DATE", "CRS_DEP_TIME", "CRS_ARR_TIME", "Airline", "Probability_delay"]]
        
    else:
        
        output_data = data[["DATE", "CRS_DEP_TIME", "CRS_ARR_TIME", "Probability_delay"]]
        
    # Format output data: rename columns, time in military format and probability of delay into percentage
    
    output_data = output_data.rename(columns = {"DATE": "Date", "CRS_DEP_TIME": "Departure Time", "CRS_ARR_TIME": "Arrival Time", "Probability_delay": "Probability of Delay"})
    
    output_data["Departure Time"] = output_data["Departure Time"].apply(lambda d: str(d)[:2] + ":" + str(d)[2:])
    
    output_data["Arrival Time"] = output_data["Arrival Time"].apply(lambda d: str(d)[:2] + ":" + str(d)[2:])
    
    output_data["Probability of Delay"] = pd.Series(["{0:.2f}%".format(val * 100) for val in output_data['Probability of Delay']], index = output_data.index)

    output_data.sort_values("Probability of Delay", ascending = True, inplace = True)
    
    output_data["Probability of Delay"] = output_data["Probability of Delay"].apply(lambda d: str(d))
    
    return output_data

# Origin and Destination above table (Airline also if the customer does input it)
# Date (Date as the input)
# CRS DEP TIME and CRS ARR TIME - Departue Time and Arrival - normal not military
# Airline (only if the customer doesnt input it)
# Probability of delay

In [5]:
recommend_output("18/11/2020 20:30", "DEN", "DFW", threshold = 3)

[1mOrigin: DEN[0m
[1mDestination: DFW[0m


Unnamed: 0,Date,Departure Time,Arrival Time,Airline,Probability of Delay
6780929,18/11/2020,16:37,19:40,AA,36.43%
6780935,18/11/2020,16:37,19:40,AA,36.43%
6780941,18/11/2020,16:37,19:40,AA,36.43%
6780948,18/11/2020,16:47,19:50,AA,36.43%
6274911,18/11/2020,17:25,20:22,UA,51.53%
6281687,18/11/2020,17:25,20:22,UA,51.53%
6437490,18/11/2020,17:25,20:22,UA,51.53%
6668109,18/11/2020,17:25,20:22,UA,51.53%
6769803,18/11/2020,16:50,19:47,NK,61.48%


In [6]:
# Same flights? - repeated data from different days but as same date
# Split rec - by output OR Have the table all in the same format 
