In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("datathon_train.csv")

def get_time(string):
  return int(string[:2])
dataset['DEP_TIME'] = dataset['DEP_TIME_BLK'].apply(get_time)

In [3]:
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation coefficient
        greater than the threshold. Removing collinear features can help a model 
        to generalize and improves the interpretability of the model.

    Inputs: 
        x: features dataframe
        threshold: features with correlations greater than this value are removed

    Output: 
        dataframe that contains only the non-highly-collinear features
    '''

    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i+1):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = abs(item.values)

            # If correlation exceeds the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(col.values[0])

    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    x = x.drop(columns=drops)

    return x

In [4]:
remove_collinear_features(dataset, 0.7)

ValueError: could not convert string to float: 'Raleigh-Durham International'

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def clean_labels_encoder(list_of_labels, df):
    for label in list_of_labels:
        df[label] = le.fit_transform(df[label])
    return df

# clean the labels
list_of_labels = ['CARRIER_NAME', 'DEPARTING_AIRPORT', 'PREVIOUS_AIRPORT']
dataset = clean_labels_encoder(list_of_labels, dataset)

In [6]:
dataset_clean = dataset.drop(['Id', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'DEP_TIME_BLK','DEST'], axis = 1)
dataset_clean.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DEPARTING_AIRPORT,DISTANCE,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,MANUFACTURE_YEAR,NUMBER_OF_SEATS,CARRIER_NAME,...,PLANE_AGE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND,DEP_DELAY_NEW,IS_DELAYED,DEP_TIME
0,1,17,4,67,427,6,13,2014.0,76,6,...,5,268,0.01,0.0,0.0,49.0,4.7,2.0,1,18
1,2,27,3,77,2689,2,3,2002.0,162,9,...,17,143,0.21,0.0,0.0,64.0,12.75,0.0,0,22
2,8,14,3,41,1947,1,30,1996.0,199,5,...,23,210,0.0,0.0,0.0,74.0,8.5,0.0,0,6
3,3,16,6,51,488,3,23,2015.0,76,11,...,4,64,0.0,0.0,0.0,56.0,17.67,0.0,0,14
4,6,20,4,17,541,4,24,1998.0,128,1,...,21,209,0.19,0.0,0.0,89.0,9.17,79.0,1,15


In [7]:
dataset_clean.columns

Index(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DEPARTING_AIRPORT', 'DISTANCE',
       'SEGMENT_NUMBER', 'CONCURRENT_FLIGHTS', 'MANUFACTURE_YEAR',
       'NUMBER_OF_SEATS', 'CARRIER_NAME', 'AIRPORT_FLIGHTS_MONTH',
       'AIRLINE_FLIGHTS_MONTH', 'AIRLINE_AIRPORT_FLIGHTS_MONTH',
       'AVG_MONTHLY_PASS_AIRPORT', 'AVG_MONTHLY_PASS_AIRLINE',
       'CARGO_HANDLING', 'FLT_ATTENDANTS_PER_PASS', 'GROUND_SERV_PER_PASS',
       'PLANE_AGE', 'PREVIOUS_AIRPORT', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'AWND',
       'DEP_DELAY_NEW', 'IS_DELAYED', 'DEP_TIME'],
      dtype='object')

In [21]:
X_df = dataset_clean.sample(10000)
y = X_df['IS_DELAYED'].to_numpy()
New_X_df = X_df.drop(['DEP_DELAY_NEW', 'IS_DELAYED'], axis = 1)
X = New_X_df.to_numpy()

In [22]:
from sklearn.model_selection import train_test_split

# Split the data into 60% training, 20% validation, and 20% testing
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Split the remaining data into 50% validation and 50% testing
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [23]:
print(X_train.shape)

(6000, 26)


In [24]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to your training data and transform it
X_train_normalized = scaler.fit_transform(X_train)

# Transform the validation and test data using the same scaler
X_val_normalized = scaler.transform(X_val)
X_test_normalized = scaler.transform(X_test)




In [25]:
#shuffle the data
from sklearn.utils import shuffle
# Shuffle the training data
X_train_normalized, y_train = shuffle(X_train_normalized, y_train, random_state=42)

# Shuffle the validation data (if needed)
X_val_normalized, y_eval = shuffle(X_val_normalized, y_val, random_state=42)

# Shuffle the development data
X_dev_normalized, y_dev = shuffle(X_test_normalized, y_test, random_state=42)

In [26]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model

from keras.models import Sequential
from keras.layers import Dense

In [27]:
model_1 = Sequential()

model_1.add(Dense(50, activation='relu', input_shape=(26,)))

model_1.add(Dense(30, activation='relu'))

model_1.add(Dense(10, activation='relu'))

model_1.add(Dense(1, activation='sigmoid'))

model_1.summary()

model_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 50)                1350      
                                                                 
 dense_9 (Dense)             (None, 30)                1530      
                                                                 
 dense_10 (Dense)            (None, 10)                310       
                                                                 
 dense_11 (Dense)            (None, 1)                 11        
                                                                 
Total params: 3,201
Trainable params: 3,201
Non-trainable params: 0
_________________________________________________________________


In [28]:
results1 = model_1.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_val, y_val))
y_pred_m1 = model_1.predict(X_test)
y_pred_m1 =(y_pred_m1 > 0.5)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30

KeyboardInterrupt: 