In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow as tf
import numpy as np
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
import pandas as pd
from tqdm import tqdm
from xgboost import XGBRegressor
import matplotlib.pyplot as plt



In [None]:

# Builds the modelling dataframe used by the cells below, which create the
# training/validation split and fit a linear regression model.
#
# NOTE(review): `filepath` is not defined anywhere in the visible notebook; it
# has to be set before this cell runs or a fresh "Run All" stops here.
# Please only load the preprocessed dataframe (variables encoded, outliers
# removed) for the purpose of training.

# Predictor columns -- this featureList is configurable.
featureList = [
    "LATE_AIRCRAFT_DELAY",
    "NAS_DELAY",
    "OP_CARRIER",
    "ORIGIN",
    "DEST",
    "CARRIER_DELAY",
]

# Keep the predictors plus the "DEP_DELAY_NEW" target column, in that order.
trainset = pd.DataFrame(pd.read_csv(filepath)[featureList + ["DEP_DELAY_NEW"]])
def split_data(trainset, featureList, labels, test_size=0.2, random_state=42):
    """Split a dataframe into training and held-out features and labels.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Frame containing both the predictor and the label columns.
    featureList : list
        Column names selected as predictors.
    labels : str or list
        Column name(s) selected as the prediction target.
    test_size : float, optional
        Forwarded to ``train_test_split``; defaults to 0.2 as before.
    random_state : int, optional
        Forwarded to ``train_test_split``; defaults to 42 as before.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X, y = trainset[featureList], trainset[labels]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return (X_train, X_test, y_train, y_test)

def create_linear_model(trainset, labels):
    """Fit a ``LinearRegression`` on ``trainset`` against ``labels``.

    Returns the fitted estimator (``fit`` hands back the estimator itself).
    """
    return LinearRegression().fit(trainset, labels)
    

# Split the modelling frame into training and held-out sets; "DEP_DELAY_NEW"
# is the regression target and `featureList` holds the predictor columns.
X_train, X_test, y_train, y_test = split_data(trainset, featureList, "DEP_DELAY_NEW")

# Linear regression fitted on the training split only.
model = create_linear_model(X_train, y_train)

# Predictions on both splits, kept for later performance comparison.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

In [None]:
''' utility function for creating a Pandas dataframe of the performances of a model based on Mean Squared Error and Explained Variance (the value returned by each model's score method).

While using performanceMeasure, please note:

1. The model parameter is a list consisting of trained models in ascending order.

2. X, y and preds are lists that hold the training sets for all models first, in ascending order,
    followed by the testing sets for all models in ascending order.

Note: this function will work for 1 or 2 models provided in the model list. We designed it this way since we dealt with 2 different models in the beginning,
then proceeded with one model only.'''

def performanceMeasure(model, X, y, preds):
    """Tabulate the score and mean squared error of each model on both splits.

    Parameters
    ----------
    model : list
        Trained models, in ascending order. Each must expose ``score(X, y)``.
    X, y, preds : list
        Features, targets and predictions. Each list holds the training
        entries for all models first (same order as ``model``), followed by
        the testing entries for all models. For two models the layout is
        ``[train1, train2, test1, test2]``; for one model ``[train1, test1]``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model<k>Training`` and ``Model<k>Testing`` for every model.
        Row 0 is ``model.score(...)``, row 1 is ``mean_squared_error(...)``.

    Raises
    ------
    ValueError
        If ``X``, ``y`` or ``preds`` has fewer than ``2 * len(model)`` entries.
    """
    n_models = len(model)
    required = 2 * n_models
    for arg_name, values in (("X", X), ("y", y), ("preds", preds)):
        if len(values) < required:
            raise ValueError(
                "%s must contain %d entries (training sets for all models, then "
                "testing sets), got %d" % (arg_name, required, len(values))
            )

    performance = {}
    for i, fitted in enumerate(model):
        # Training entries sit at position i, testing entries after all the
        # training entries. Deriving the test offset from the number of models
        # keeps the single-model case from reading past the end of the lists.
        for split_name, idx in (("Training", i), ("Testing", n_models + i)):
            performance["Model" + str(i + 1) + split_name] = [
                fitted.score(X[idx], y[idx]),
                mean_squared_error(y[idx], preds[idx]),
            ]

    return pd.DataFrame(performance)

In [None]:
def create_svm_model(X, y, C=0.5):
    """Fit a ``LinearSVR`` on ``(X, y)`` and return the fitted estimator.

    The estimator is built with ``epsilon=0``, the ``"epsilon_insensitive"``
    loss and ``verbose=1``; ``C`` is forwarded unchanged (default 0.5).
    """
    svr_options = {
        "epsilon": 0,
        "C": C,
        "loss": "epsilon_insensitive",
        "verbose": 1,
    }
    svr = LinearSVR(**svr_options)
    svr.fit(X, y)
    return svr

# Fit the LinearSVR on the training split, using create_svm_model's default C.
svm_model1 = create_svm_model(X_train, y_train)

In [None]:
def runGridSearch(model, parameters, X, y):
    """Run a ``GridSearchCV`` for ``model`` over ``parameters``.

    The search is fitted on ``(X, y)`` and the fitted search object is
    returned so the caller can inspect the tuned result.
    """
    search = GridSearchCV(model, parameters)
    search.fit(X, y)
    return search

In [None]:
def uniformDtypes(dataframe, columns, dtype):
    """Cast the given columns of ``dataframe`` to one uniform ``dtype``.

    The columns are reassigned on the dataframe that was passed in, and that
    same dataframe is returned. Useful before training the neural network.
    """
    converted = dataframe[columns].astype(dtype)
    dataframe[columns] = converted
    return dataframe

In [None]:
# This segment converts the pandas splits into batched tf.data datasets, which
# the custom training loop of the neural network iterates over.

BATCH_SIZE = 100
# Number of rows removed from each split before batching.
# NOTE(review): presumably chosen so the split sizes become multiples of
# BATCH_SIZE for one particular dataset -- confirm, or derive the counts from
# len(...) % BATCH_SIZE instead.
N_TRAIN_ROWS_DROPPED = 19
N_TEST_ROWS_DROPPED = 80
# Seeded so that re-running the notebook drops the same rows.
_drop_rng = np.random.default_rng(42)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Sample *distinct* positions from the rows that actually exist. Sampling with
# replacement can repeat a label (dropping fewer rows than requested), and a
# hard-coded upper bound can name a label the split does not contain.
indices = _drop_rng.choice(
    len(X_train), size=min(N_TRAIN_ROWS_DROPPED, len(X_train)), replace=False
).tolist()
X_train = X_train.drop(index=indices)
y_train = y_train.drop(index=indices)
indices_test = _drop_rng.choice(
    len(X_test), size=min(N_TEST_ROWS_DROPPED, len(X_test)), replace=False
).tolist()
X_test = X_test.drop(index=indices_test)
y_test = y_test.drop(index=indices_test)

train_set = tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values))
test_set = tf.data.Dataset.from_tensor_slices((X_test.values, y_test.values))

# Only the training data is shuffled; the buffer spans the whole training set.
train_set = train_set.shuffle(buffer_size=len(X_train)).batch(BATCH_SIZE)
test_set = test_set.batch(BATCH_SIZE)


In [None]:
class DNN(tf.keras.Model):
    """Core model architecture: a fully connected feed-forward network.

    Four ReLU hidden layers (512, 256, 128 and 64 units, Glorot-uniform
    initialised) with batch normalisation after the first one, followed by a
    single linear output unit. The architecture can be modified by changing
    the layers created in ``__init__`` and the feed-forward pass in ``call``.
    """

    def __init__(self, input_shape):
        """Create the layers; ``input_shape`` is passed to the ``InputLayer``."""
        super(DNN, self).__init__()
        self.initializer = tf.keras.initializers.GlorotUniform()
        self.input_layer = tf.keras.layers.InputLayer(input_shape=input_shape)
        self.dense1 = tf.keras.layers.Dense(units=512, activation='relu', kernel_initializer=self.initializer)
        # call() normalises the output of the first hidden layer, so the layer
        # must be created here; without it the forward pass raises
        # AttributeError.
        self.batchnorm = tf.keras.layers.BatchNormalization()
        self.dense2 = tf.keras.layers.Dense(units=256, activation='relu', kernel_initializer=self.initializer)
        self.dense3 = tf.keras.layers.Dense(units=128, activation='relu', kernel_initializer=self.initializer)
        self.dense4 = tf.keras.layers.Dense(units=64, activation='relu', kernel_initializer=self.initializer)
        self.output_layer = tf.keras.layers.Dense(units=1, activation='linear')

    def call(self, inputs, training=None):
        """Feed-forward pass.

        ``training`` is forwarded to the batch-normalisation layer, which
        behaves differently in training and inference mode. It defaults to
        ``None`` so existing ``model(x)`` calls keep working.
        """
        x = self.input_layer(inputs)
        x = self.dense1(x)
        x = self.batchnorm(x, training=training)
        x = self.dense2(x)
        x = self.dense3(x)
        x = self.dense4(x)
        out_val = self.output_layer(x)

        return out_val


In [None]:
''' this is where we have defined the custom training loop, using graph mode. You can use any optimizer, loss, or metric; however, each should inherit from the
tf.keras.optimizers.Optimizer, tf.keras.losses.Loss, and tf.keras.metrics.Metric classes respectively. '''

from tqdm import tqdm

@tf.function
def apply_gradients(optimizer, loss, model, labels, dataset):
    """Run one optimisation step on a single batch.

    Parameters
    ----------
    optimizer : object exposing ``apply_gradients``.
    loss : callable accepting ``y_true`` and ``y_pred`` keyword arguments.
    model : Keras model exposing ``trainable_weights``.
    labels : targets of the batch.
    dataset : features of the batch.

    Returns
    -------
    tuple
        ``(logits, loss_val)`` -- the model output for the batch and its loss.
    """
    with tf.GradientTape() as tape:
        # training=True puts mode-dependent layers (the network's call()
        # applies batch normalisation) into training mode for this pass.
        logits = model(dataset, training=True)
        loss_val = loss(y_true=labels, y_pred=logits)

    gradients = tape.gradient(loss_val, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))

    return logits, loss_val
    
def train_one_epoch(train, train_acc_metric, optimizer, loss, model):
    """Train ``model`` for one pass over ``train`` and return the batch losses.

    Parameters
    ----------
    train : iterable of ``(features, labels)`` batches.
    train_acc_metric : callable updated with ``(labels, logits)`` per batch.
    optimizer, loss, model : forwarded to ``apply_gradients``.

    Returns
    -------
    list
        The loss value of every batch, in iteration order.
    """
    try:
        # Use the reported length when there is one, so the data is not
        # iterated in full just to size the progress bar.
        n_batches = len(train)
    except TypeError:
        # No usable length: count the batches with one extra pass.
        n_batches = sum(1 for _ in train)

    losses = []
    # The context manager closes the progress bar even if a step raises.
    with tqdm(total=n_batches, position=0, leave=True, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}') as pbar:
        for step, (x_batch_train, y_batch_train) in enumerate(train):
            logits, loss_value = apply_gradients(optimizer, loss, model, y_batch_train, x_batch_train)
            losses.append(loss_value)
            train_acc_metric(y_batch_train, logits)
            pbar.set_description("Training loss for step %s: %.4f" % (int(step), float(loss_value)))
            pbar.update()
    return losses


def perform_validation(test, model, loss, val_acc_metric):
    """Evaluate ``model`` on every batch of ``test``.

    For each ``(features, targets)`` batch the model output is computed, its
    loss recorded and ``val_acc_metric`` updated with the targets and output.
    Returns the list of per-batch losses in iteration order.
    """
    batch_losses = []
    for features, targets in test:
        predictions = model(features)
        batch_losses.append(loss(y_true=targets, y_pred=predictions))
        val_acc_metric(targets, predictions)

    return batch_losses

def _reset_metric(metric):
    """Reset a metric, using ``reset_state`` when present, else ``reset_states``."""
    reset = getattr(metric, "reset_state", None)
    if reset is None:
        # Metric objects that only expose the older method name.
        reset = metric.reset_states
    reset()


def train_n_epochs(train, test, loss, optimizer, model, epochs, train_acc_metric, val_acc_metric):
    """Train for ``epochs`` epochs, validating after each one.

    Parameters
    ----------
    train, test : iterables of ``(features, labels)`` batches.
    loss, optimizer, model : forwarded to the per-epoch helpers.
    epochs : int
        Number of passes over ``train``.
    train_acc_metric, val_acc_metric : metric objects exposing ``result()``
        and ``reset_state()`` (or the older ``reset_states()``).

    Returns
    -------
    dict
        ``history`` with the mean losses per epoch, keyed
        ``'epoch<k>train'`` and ``'epoch<k>val'`` with ``k`` starting at 1.
    """
    history = {}
    for epoch in range(epochs):
        losses_train = train_one_epoch(train, train_acc_metric, optimizer, loss, model)
        losses_val = perform_validation(test, model, loss, val_acc_metric)
        mean_train_loss = np.mean(losses_train)
        mean_val_loss = np.mean(losses_val)
        history['epoch' + str(epoch+1) + 'train'] = mean_train_loss
        history['epoch' + str(epoch+1) + 'val'] = mean_val_loss
        print('\n Epoch %s: Train loss: %.4f  Validation Loss: %.4f, Train Accuracy: %.4f, Validation Accuracy %.4f' % (epoch, float(mean_train_loss), float(mean_val_loss), float(train_acc_metric.result()), float(val_acc_metric.result())))
        # Metrics accumulate across calls; clear them so the next epoch
        # reports its own values only.
        _reset_metric(train_acc_metric)
        _reset_metric(val_acc_metric)
    return history


In [None]:
def plot_metrics(history):
    """Plot the training and validation losses per epoch.

    Parameters
    ----------
    history : dict
        Mapping with keys ``'epoch<k>train'`` and ``'epoch<k>val'`` for
        ``k = 1..n``, as returned by ``train_n_epochs``.

    Both curves share one set of axes, so they are distinguished by a legend
    and the y-axis carries a single neutral label.
    """
    num_epochs = len(history) // 2
    epochs = list(range(num_epochs))
    train_losses = [history['epoch' + str(epoch+1) + 'train'] for epoch in epochs]
    val_losses = [history['epoch' + str(epoch+1) + 'val'] for epoch in epochs]

    plt.plot(epochs, train_losses, label='Training Loss')
    plt.plot(epochs, val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
''' this segment trains a XGBoost model and a utility function for the mean squared error loss function to plot the losses for the xgboost predictions'''
# XGBoost regressor with the library's default settings, fitted on the
# training split.
# NOTE(review): this rebinds `model`, which earlier in the notebook refers to
# the linear regression model.
model = XGBRegressor()
model.fit(X_train, y_train)
# NOTE(review): same call as `preds_test` further down in this cell.
preds = model.predict(X_test)

def loss_function(y_true, y_pred):
    """Mean squared error between ``y_true`` and ``y_pred``.

    Values are compared position by position. Both inputs are converted to
    NumPy arrays first: subtracting two pandas objects would align them on
    their index labels (giving NaN where labels differ), and plain Python
    lists do not support subtraction at all.
    """
    true_values = np.asarray(y_true)
    predicted_values = np.asarray(y_pred)
    return np.mean(np.square(true_values - predicted_values))

# Predict on both splits with the model fitted above and report the mean
# squared error of each, computed by loss_function.
preds_train = model.predict(X_train)
preds_test = model.predict(X_test)
loss_train = loss_function(y_train, preds_train)
loss_test = loss_function(y_test, preds_test)
print(f"The train loss is {loss_train}")
print(f"The test loss is {loss_test}")



In [None]:
def feature_importances(model, featureList):
    """Plot the feature importances of a fitted model.

    Draws ``model.feature_importances_`` against the names in ``featureList``
    on a 16x10 figure and shows it. The model is expected to expose
    ``feature_importances_`` (used here for the xgboost model).
    """
    plt.figure(figsize=(16, 10))
    feature_names = list(featureList)
    plt.plot(feature_names, model.feature_importances_)
    plt.xlabel("Features")
    plt.ylabel("Feature Importance Scores")
    plt.show()