In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import gc
import sys
import json
import psutil  
from sqlalchemy import create_engine
from config import mypass


# Notebook options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#Data Paths
flights_ml_data = 'resources/flights_cleaned.csv'
airlines_dict = 'resources/airline_dict.json'
airports_dict = 'resources/airport_dict.json'

print(psutil.virtual_memory()) 

svmem(total=17137168384, available=10992021504, percent=35.9, used=6145146880, free=10992021504)


In [3]:
# Open dictionaries for encoding

with open("resources/airport_dict.json") as f:
    airport_dict = json.load(f)

    
with open("resources/airline_dict.json") as f:
    airline_dict = json.load(f)
    
print(len(airport_dict), len(airline_dict))

55 14


In [5]:
# number of Nodes testing


# Define the model
number_input_features = 123
hidden_nodes_layer1 = [90, number_input_features, number_input_features*2, ]
hidden_nodes_layer2 = [int(number_input_features/2), 90, number_input_features, number_input_features*2]
epochs = [3, 5, 10]



for node1 in hidden_nodes_layer1:
    for node2 in hidden_nodes_layer2:
        
        nn = tf.keras.models.Sequential()
        
        # First hidden layer
        nn.add(tf.keras.layers.Dense(units=node1, input_dim=number_input_features, activation="relu"))
        # Second hidden layer
        nn.add(tf.keras.layers.Dense(units=node2, activation="relu"))
        # Output layer
        nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

        # Compile the Sequential model together and customize metrics
        nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

        
        # Load data from resource file in chunks
        i = 1
        chunksize = 10** 6
        enc = OneHotEncoder(sparse=False, categories='auto')
        # Create a StandardScaler instance
        scaler = StandardScaler()


        # Proccess data, and train model in chuncks, saving/loading the model weights for each chunk

        for chunk in pd.read_csv(flights_ml_data, chunksize=chunksize):

            # Use uppercase for CSV chunks, lowercase for database chunks    

            y = chunk.DEPARTURE_DELAY
            X = chunk.drop(columns=['DEPARTURE_DELAY', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'Unnamed: 0'])

            # map string values to int using airport dict and airline dict
            for airport, id_num in airport_dict.items():
                X['ORIGIN_AIRPORT'].replace(airport, id_num, inplace=True)
                X['DESTINATION_AIRPORT'].replace(airport, id_num, inplace=True)

            for airline, id_num in airline_dict.items():
                X['AIRLINE'].replace(airline, id_num, inplace=True)

            # Encode categorical values, fit transform, get variable names, merge and remove old columns
            categorical_data = ['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'AIRLINE']

            encode_df = pd.DataFrame(enc.fit_transform(X[categorical_data]))
            encode_df.columns = enc.get_feature_names(categorical_data)

            # Merge encoded df and original df, remove pre-encoded columns
            # Resets the index, now have to drop added index column
            X = X.reset_index()  
            X.drop(columns=['index'], inplace=True)
            encode_df = encode_df.reset_index()
            encode_df.drop(columns=['index'], inplace=True)
            X = X.merge(encode_df, left_index=True, right_index=True)
            X.drop(columns=categorical_data, inplace=True)

            # Split into training and testing data
            X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, stratify=None)

            
            # Fit the StandardScaler
            X_scaler = scaler.fit(X_train)
            # Scale the data
            X_train_scaled = X_scaler.transform(X_train)
            X_test_scaled = X_scaler.transform(X_test)


            # Train the model
            fit_model = nn.fit(X_train_scaled, y_train, epochs=5, verbose = 0)

            # Evaluate the model using the test data
            model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=0)

            
            print(f'node 1: {node1},  node2: {node2},  mse: {model_loss}')
            
            i+=1
            
            # only test on first chunk otherwise this will take an entire day
            break
            

node 1: 90,  node2: 61,  mse: 390.13755495458986
node 1: 90,  node2: 90,  mse: 388.9505550966797
node 1: 90,  node2: 123,  mse: 389.5697041079102
node 1: 90,  node2: 246,  mse: 389.51734975634764
node 1: 123,  node2: 61,  mse: 389.9432755024414
node 1: 123,  node2: 90,  mse: 389.6478765073242
node 1: 123,  node2: 123,  mse: 389.0565438261719
node 1: 123,  node2: 246,  mse: 389.62237896435545
node 1: 246,  node2: 61,  mse: 388.89868199462893
node 1: 246,  node2: 90,  mse: 389.03143599316405
node 1: 246,  node2: 123,  mse: 390.33747130566405
node 1: 246,  node2: 246,  mse: 388.4041834975586


In [7]:
# Activation function testing

# Define the model
number_input_features = 123
hidden_nodes_layer1 = 123
hidden_nodes_layer2 = 123
activation_fns =['relu', 'exponential', 'linear', 'sigmoid', 'tanh']
epochs = [3, 5, 10]


for first in activation_fns:
    for second in activation_fns:

        nn = tf.keras.models.Sequential()

        # First hidden layer
        nn.add(tf.keras.layers.Dense(units=node1, input_dim=number_input_features, activation=first))
        # Second hidden layer
        nn.add(tf.keras.layers.Dense(units=node2, activation=second))
        # Output layer
        nn.add(tf.keras.layers.Dense(units=1, activation='linear'))

        # Compile the Sequential model together and customize metrics
        nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])


        # Load data from resource file in chunks
        i = 1
        chunksize = 10** 6
        enc = OneHotEncoder(sparse=False, categories='auto')
        # Create a StandardScaler instance
        scaler = StandardScaler()


        # Proccess data, and train model in chuncks, saving/loading the model weights for each chunk

        for chunk in pd.read_csv(flights_ml_data, chunksize=chunksize):

            # Use uppercase for CSV chunks, lowercase for database chunks    

            y = chunk.DEPARTURE_DELAY
            X = chunk.drop(columns=['DEPARTURE_DELAY', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'Unnamed: 0'])

            # map string values to int using airport dict and airline dict
            for airport, id_num in airport_dict.items():
                X['ORIGIN_AIRPORT'].replace(airport, id_num, inplace=True)
                X['DESTINATION_AIRPORT'].replace(airport, id_num, inplace=True)

            for airline, id_num in airline_dict.items():
                X['AIRLINE'].replace(airline, id_num, inplace=True)

            # Encode categorical values, fit transform, get variable names, merge and remove old columns
            categorical_data = ['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'AIRLINE']

            encode_df = pd.DataFrame(enc.fit_transform(X[categorical_data]))
            encode_df.columns = enc.get_feature_names(categorical_data)

            # Merge encoded df and original df, remove pre-encoded columns
            # Resets the index, now have to drop added index column
            X = X.reset_index()  
            X.drop(columns=['index'], inplace=True)
            encode_df = encode_df.reset_index()
            encode_df.drop(columns=['index'], inplace=True)
            X = X.merge(encode_df, left_index=True, right_index=True)
            X.drop(columns=categorical_data, inplace=True)

            # Split into training and testing data
            X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, stratify=None)


            # Fit the StandardScaler
            X_scaler = scaler.fit(X_train)
            # Scale the data
            X_train_scaled = X_scaler.transform(X_train)
            X_test_scaled = X_scaler.transform(X_test)


            # Train the model
            fit_model = nn.fit(X_train_scaled, y_train, epochs=5, verbose = 0)

            # Evaluate the model using the test data
            model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=0)

            print(f'first fn: {first},  second fn: {second},  mes: {model_loss}')

            i += 1
            # only test on first chunk otherwise this will take an entire day
            break

print("\nfinished")

first fn: relu,  second fn: relu,  mes: 389.44113661132815
first fn: relu,  second fn: exponential,  mes: 393.4043212265625
first fn: relu,  second fn: linear,  mes: 395.02030680664063
first fn: relu,  second fn: sigmoid,  mes: 390.0631171748047
first fn: relu,  second fn: tanh,  mes: 393.8799197163086
first fn: exponential,  second fn: relu,  mes: 394.9877534604492
first fn: exponential,  second fn: exponential,  mes: nan
first fn: exponential,  second fn: linear,  mes: 396.9606965751953
first fn: exponential,  second fn: sigmoid,  mes: 409.03629706884766
first fn: exponential,  second fn: tanh,  mes: 411.6163557392578
first fn: linear,  second fn: relu,  mes: 392.8749214116211
first fn: linear,  second fn: exponential,  mes: 409.30919636376956
first fn: linear,  second fn: linear,  mes: 403.09626523876955
first fn: linear,  second fn: sigmoid,  mes: 397.1821991333008
first fn: linear,  second fn: tanh,  mes: 398.61958030615233
first fn: sigmoid,  second fn: relu,  mes: 393.0199523769

In [10]:
# output activation function testing

# Define the model
number_input_features = 123
hidden_nodes_layer1 = 123
hidden_nodes_layer2 = 123
activation_fns =['relu', 'exponential', 'linear', 'sigmoid', 'tanh']
epochs = [3, 5, 10]


for output in activation_fns:


    nn = tf.keras.models.Sequential()

    # First hidden layer
    nn.add(tf.keras.layers.Dense(units=node1, input_dim=number_input_features, activation='relu'))
    # Second hidden layer
    nn.add(tf.keras.layers.Dense(units=node2, activation='relu'))
    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation=output))

    # Compile the Sequential model together and customize metrics
    nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])


    # Load data from resource file in chunks
    i = 1
    chunksize = 10** 6
    enc = OneHotEncoder(sparse=False, categories='auto')
    # Create a StandardScaler instance
    scaler = StandardScaler()


    # Proccess data, and train model in chuncks, saving/loading the model weights for each chunk

    for chunk in pd.read_csv(flights_ml_data, chunksize=chunksize):

        # Use uppercase for CSV chunks, lowercase for database chunks    

        y = chunk.DEPARTURE_DELAY
        X = chunk.drop(columns=['DEPARTURE_DELAY', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'Unnamed: 0'])

        # map string values to int using airport dict and airline dict
        for airport, id_num in airport_dict.items():
            X['ORIGIN_AIRPORT'].replace(airport, id_num, inplace=True)
            X['DESTINATION_AIRPORT'].replace(airport, id_num, inplace=True)

        for airline, id_num in airline_dict.items():
            X['AIRLINE'].replace(airline, id_num, inplace=True)

        # Encode categorical values, fit transform, get variable names, merge and remove old columns
        categorical_data = ['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'AIRLINE']

        encode_df = pd.DataFrame(enc.fit_transform(X[categorical_data]))
        encode_df.columns = enc.get_feature_names(categorical_data)

        # Merge encoded df and original df, remove pre-encoded columns
        # Resets the index, now have to drop added index column
        X = X.reset_index()  
        X.drop(columns=['index'], inplace=True)
        encode_df = encode_df.reset_index()
        encode_df.drop(columns=['index'], inplace=True)
        X = X.merge(encode_df, left_index=True, right_index=True)
        X.drop(columns=categorical_data, inplace=True)

        # Split into training and testing data
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, stratify=None)


        # Fit the StandardScaler
        X_scaler = scaler.fit(X_train)
        # Scale the data
        X_train_scaled = X_scaler.transform(X_train)
        X_test_scaled = X_scaler.transform(X_test)


        # Train the model
        fit_model = nn.fit(X_train_scaled, y_train, epochs=5, verbose = 0)

        # Evaluate the model using the test data
        model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=0)

        print(f'output fn: {output},  mes: {model_loss}, accuracy: {model_accuracy}')

        i += 1
        # only test on first chunk otherwise this will take an entire day
        break


output fn: relu,  mes: 391.30573986621096, loss: 0.04151200130581856
output fn: exponential,  mes: 393.5145560385742, loss: 0.03800800070166588
output fn: linear,  mes: 388.9225799580078, loss: 0.03823599964380264
output fn: sigmoid,  mes: 453.692526144043, loss: 0.03418000042438507
output fn: tanh,  mes: 453.938224, loss: 0.03214799985289574


In [12]:
# number of epochs testing


# Define the model
number_input_features = 123
hidden_nodes_layer1 = 123
hidden_nodes_layer2 = 123

epochs = [3, 5, 10, 15, 100]


for epoch in epochs:

    nn = tf.keras.models.Sequential()

    # First hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
    # Second hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

    # Compile the Sequential model together and customize metrics
    nn.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])


    # Load data from resource file in chunks
    i = 1
    chunksize = 10** 6
    enc = OneHotEncoder(sparse=False, categories='auto')
    # Create a StandardScaler instance
    scaler = StandardScaler()


    # Proccess data, and train model in chuncks, saving/loading the model weights for each chunk

    for chunk in pd.read_csv(flights_ml_data, chunksize=chunksize):

        # Use uppercase for CSV chunks, lowercase for database chunks    

        y = chunk.DEPARTURE_DELAY
        X = chunk.drop(columns=['DEPARTURE_DELAY', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'Unnamed: 0'])

        # map string values to int using airport dict and airline dict
        for airport, id_num in airport_dict.items():
            X['ORIGIN_AIRPORT'].replace(airport, id_num, inplace=True)
            X['DESTINATION_AIRPORT'].replace(airport, id_num, inplace=True)

        for airline, id_num in airline_dict.items():
            X['AIRLINE'].replace(airline, id_num, inplace=True)

        # Encode categorical values, fit transform, get variable names, merge and remove old columns
        categorical_data = ['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'AIRLINE']

        encode_df = pd.DataFrame(enc.fit_transform(X[categorical_data]))
        encode_df.columns = enc.get_feature_names(categorical_data)

        # Merge encoded df and original df, remove pre-encoded columns
        # Resets the index, now have to drop added index column
        X = X.reset_index()  
        X.drop(columns=['index'], inplace=True)
        encode_df = encode_df.reset_index()
        encode_df.drop(columns=['index'], inplace=True)
        X = X.merge(encode_df, left_index=True, right_index=True)
        X.drop(columns=categorical_data, inplace=True)

        # Split into training and testing data
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, stratify=None)


        # Fit the StandardScaler
        X_scaler = scaler.fit(X_train)
        # Scale the data
        X_train_scaled = X_scaler.transform(X_train)
        X_test_scaled = X_scaler.transform(X_test)


        # Train the model
        fit_model = nn.fit(X_train_scaled, y_train, epochs=epoch, verbose = 0)

        # Evaluate the model using the test data
        model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=0)

        print(f'epochs: {epoch},  mse: {model_loss}, accuracy: {model_accuracy}')

        i+=1


        # only test on first chunk otherwise this will take an entire day
        break


epochs: 3,  mse: 392.5247253364258, accuracy: 0.03680000081658363
epochs: 5,  mse: 389.0853060766602, accuracy: 0.03821200132369995
epochs: 10,  mse: 388.5565029580078, accuracy: 0.03883599862456322
epochs: 15,  mse: 387.64053872558594, accuracy: 0.037675999104976654
epochs: 100,  mse: 399.87182602685544, accuracy: 0.03789199888706207
