In [51]:
%load_ext autoreload
%autoreload 2
from final_project.models import FeedForward
from final_project import builder
from tqdm import tqdm
import torch
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from final_project.loader import get_df

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
def plot_freq(title, data):
    """Draw a bar chart of how many samples fall into each class.

    Args:
        title: Name of the dataset split; it is capitalised and inserted
            into the chart title.
        data: Indexable pair where ``data[0]`` holds the class labels and
            ``data[1]`` the matching counts. A label equal to 1 is shown as
            'delayed'; any other label is shown as 'on time'.

    Returns:
        None. The counts are printed and the figure is shown with ``plt.show()``.
    """
    labels, counts = data[0], data[1]

    fig, ax = plt.subplots(figsize=(6, 3))
    bars = ax.bar(['delayed' if x == 1 else 'on time' for x in labels], counts)

    ax.set_title(f'Class Instances in {title.capitalize()} Dataset')
    # The class names sit on the x-axis and the bar heights are the counts,
    # so the x-axis is 'Labels' and the y-axis is 'Count'.
    ax.set_xlabel('Labels')
    ax.set_ylabel('Count')

    print(counts)
    # Annotate every bar with its count.
    ax.bar_label(bars, counts)
    # Drop the top/right frame lines for a cleaner look.
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Show the chart
    plt.show()

In [53]:
# Read the flight records and hand them straight to the project's encoder.
# (To explore the raw frame first, load it separately and pass it to builder.runEDA.)
master_df = builder.encodeFrame(get_df(file="Flights_2018_1.csv"))

   Quarter  DayOfWeek  FlightDate Marketing_Airline_Network  \
0        1          2  2018-01-23                        DL   
1        1          3  2018-01-24                        DL   
2        1          4  2018-01-25                        DL   
3        1          5  2018-01-26                        DL   
4        1          6  2018-01-27                        DL   

  Operated_or_Branded_Code_Share_Partners  DOT_ID_Marketing_Airline  \
0                            DL_CODESHARE                     19790   
1                            DL_CODESHARE                     19790   
2                            DL_CODESHARE                     19790   
3                            DL_CODESHARE                     19790   
4                            DL_CODESHARE                     19790   

  IATA_Code_Marketing_Airline  Flight_Number_Marketing_Airline  \
0                          DL                             3298   
1                          DL                             3298

In [None]:
# Process data for training: separate the target column, then split the
# samples into train / test / validation partitions (70% / 15% / 15%).
X = master_df.drop(columns=["ArrDel15"])
# Series.ravel() is deprecated since pandas 2.2; to_numpy() yields the same
# flat 1-D array of targets.
y = master_df["ArrDel15"].to_numpy()
print("# samples:", y.shape[0])

# First hold out 30% of the samples; that share is divided again below.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X, y, test_size=0.3, random_state=150
)

# Halve the held-out share so test and validation each get 15% of the total.
X_test, X_validation, y_test, y_validation = train_test_split(
    X_test_val, y_test_val, test_size=0.5, random_state=150
)
data = {
    "train": (X_train, y_train),
    "test": (X_test, y_test),
    "validation": (X_validation, y_validation),
}

# For every split: report feature/sample counts and chart the class frequencies.
for split_name, (features, labels) in data.items():
    print(f"{split_name}:")
    print(" - Number of features: ", len(features.columns))
    print(" - Number of samples: ", len(features))
    unique, counts = np.unique(labels, return_counts=True)
    plot_freq(split_name, (unique, counts))
    print()

In [None]:
# Convert every split to torch tensors. The conversion is idempotent so this
# cell can be re-run: on a second run the names already hold tensors, which
# have no .to_numpy() and should not be copy-constructed again.
def _as_tensor(values):
    """Return ``values`` as a torch tensor, leaving existing tensors untouched."""
    if isinstance(values, torch.Tensor):
        # Already converted by an earlier run of this cell.
        return values
    if hasattr(values, "to_numpy"):
        # pandas objects are turned into numpy arrays first.
        values = values.to_numpy()
    return torch.tensor(values)


X_train, y_train, X_test, y_test, X_validation, y_validation = map(
    _as_tensor, (X_train, y_train, X_test, y_test, X_validation, y_validation)
)

In [None]:
# Pair each split's feature tensor with its label tensor in a TensorDataset.
train_ds, test_ds, valid_ds = (
    TensorDataset(features, labels)
    for features, labels in (
        (X_train, y_train),
        (X_test, y_test),
        (X_validation, y_validation),
    )
)

In [None]:
# Shared training settings, derived from the training tensors.
num_features = X_train.shape[1]    # width of one input row
classes = torch.unique(y_train)    # distinct target values seen in training
loss_function = nn.BCEWithLogitsLoss()

In [None]:
# exhaustive hyperparameter tuning based on the best final validation loss
def ffn_tune(num_hidden_layers, num_nodes, param_dict):
    """Grid-search batch size, epoch count and learning rate for one architecture.

    Every combination of ``param_dict["bs"]``, ``param_dict["epoch"]`` and
    ``param_dict["learning_rate"]`` trains a fresh ``FeedForward`` model; the
    combination whose *last* validation loss is the lowest wins (ties keep the
    earliest combination).

    Reads the notebook-level names ``num_features``, ``train_ds``, ``valid_ds``
    and ``loss_function``.

    Args:
        num_hidden_layers: Forwarded to ``FeedForward`` as its first argument.
        num_nodes: Forwarded to ``FeedForward`` as its second argument.
        param_dict: Mapping with iterable candidates under the keys ``"bs"``,
            ``"epoch"`` and ``"learning_rate"``.

    Returns:
        A dict that starts as a copy of ``param_dict`` and always has
        ``"best_loss"``. Once a run has been selected, ``"bs"``, ``"epoch"`` and
        ``"learning_rate"`` hold the winning values and ``"model"``,
        ``"valid_losses"`` and ``"training_losses"`` hold the winning model and
        its loss histories. If no run is selected (e.g. every final loss is
        NaN) those three keys are absent and ``"best_loss"`` stays ``inf``.
    """
    from itertools import product  # local import keeps this cell self-contained

    best_model = dict(param_dict)
    # An infinite sentinel lets any finite loss, however large, be recorded.
    best_model["best_loss"] = float("inf")

    # product() iterates with "bs" outermost and "learning_rate" innermost.
    grid = product(param_dict["bs"], param_dict["epoch"], param_dict["learning_rate"])
    for bs, epoch, lr in grid:
        model = FeedForward(num_hidden_layers, num_nodes, num_features)
        training_losses, valid_losses = model.fit(
            train_ds, valid_ds, bs, epoch, loss_function, lr
        )
        # use validation loss (final epoch) as the selection criterion
        final_loss = valid_losses[-1]
        if final_loss < best_model["best_loss"]:
            best_model.update(
                model=model,
                best_loss=final_loss,
                epoch=epoch,
                learning_rate=lr,
                bs=bs,
                valid_losses=valid_losses,
                training_losses=training_losses,
            )
        print("best loss: ", best_model["best_loss"])
    return best_model


In [None]:
def ffn_evaluate(model):
    """Plot the loss curves of a tuned model and print its metrics per split.

    Args:
        model: Result dictionary such as the one returned by ``ffn_tune``. The
            keys read here are ``"training_losses"``, ``"valid_losses"``,
            ``"model"`` and ``"bs"``.

    The datasets come from the notebook-level ``train_ds``, ``test_ds`` and
    ``valid_ds``. Nothing is returned; results are plotted and printed.
    """
    # plot losses
    fig, ax = plt.subplots()
    ax.plot(model["training_losses"], label="Training Loss")
    ax.plot(model["valid_losses"], label="Validation Loss")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.legend()
    plt.show()

    # calculate accuracy
    d = {"train": train_ds, "test": test_ds, "validation": valid_ds}
    for dataset in d:
        print(f"Evaluating **{dataset}** dataset:")
        # NOTE(review): classifier_scores is presumably the tuple produced by
        # sklearn's precision_recall_fscore_support - confirm in FeedForward.score.
        # The local is named conf_matrix so it does not shadow the imported
        # sklearn.metrics.confusion_matrix function.
        mean_accuracy, class_accuracy, classifier_scores, conf_matrix = model["model"].score(
            d[dataset], model["bs"]
        )
        print(f"Mean Accuracy: {mean_accuracy*100:.3f}%")
        print("Mean per-class accuracy:")
        for key in class_accuracy:
            class_name = 'delayed' if key == 1 else 'on time'
            print(f"  {class_name}: {class_accuracy[key]*100:.3f}%")
        print(f"Precision: {classifier_scores[0]}")
        print(f"Recall: {classifier_scores[1]}")
        print(f"F-Beta Score: {classifier_scores[2]}")
        # This line used to print the whole tuple under the label "F1 Score";
        # the label now says what is actually shown.
        print(f"All classifier scores: {classifier_scores}")
        print(conf_matrix)
        print()

In [None]:
# run model tuning and evaluation for one layers/nodes combination
def run_model(num_layers: int, num_nodes: int, params=None):
    """Tune one feed-forward architecture, report the winning settings, evaluate it.

    Args:
        num_layers: Number of hidden layers, passed to ``ffn_tune``.
        num_nodes: Number of nodes, passed to ``ffn_tune``.
        params: Optional hyperparameter grid with iterable candidates under
            ``"bs"``, ``"epoch"`` and ``"learning_rate"``. When omitted, the
            single-point grid bs=64, epoch=50, learning_rate=0.01 is used.

    Returns:
        None. Progress and results are printed/plotted by this function,
        ``ffn_tune`` and ``ffn_evaluate``.
    """
    if params is None:
        params = {"bs": (64,),
                  "epoch": (50,),
                  "learning_rate": (.01,)}
    print(f"{num_nodes} Nodes, {num_layers} Hidden Layer(s)")
    best_model = ffn_tune(num_layers, num_nodes, params)
    print("best batch size: ", best_model["bs"])
    print("best epoch: ", best_model["epoch"])
    print("best learning rate: ", best_model["learning_rate"])
    ffn_evaluate(best_model)

    # TODO save model
    # state = best_model["model"].state_dict() # save the model

# Architectures to train, each given as (hidden layers, nodes) for run_model.
for pair in [(1, 4)]:
    run_model(*pair)

# bs, epoch, learning_rate, momentum, activation function, number layers, number of nodes per hidden layer