In [39]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
import sys
from model import NLTK_Binary_Classifier
from sklearn.model_selection import train_test_split
import os 

In [40]:
# Load the raw dataset; the next cell narrows it to the columns listed in data/features.csv.
df = pd.read_csv('data/raw/small.csv')

In [41]:
""" Declare categorial and numerical features. Filter df down to these features and 
    the target column (no other columns are kept)
"""

# Read in variables from features.csv, which lists each column's name and its
# role ('categorical', 'numerical' or 'target').
f_df = pd.read_csv('data/features.csv')
cat_f = f_df[f_df['type'] == 'categorical']['name'].tolist()
num_f = f_df[f_df['type'] == 'numerical']['name'].tolist()
target = f_df[f_df['type'] == 'target']['name'].tolist()
print(f'my target is {target}')
display(f"categorical features: {cat_f}")
display(f"numerical features: {num_f}")
col_to_keep = target + cat_f + num_f
# .copy() makes df an independent frame, so the column assignments below do
# not write into a slice of the raw data (avoids SettingWithCopyWarning).
df = df[col_to_keep].copy()
print(f"df columns: {df.columns}")

# FILL IN MISSING VALUES
# For categorical variables, fill with the most frequent value of the column
for col in cat_f:
    try:
        common_val = df[col].value_counts().idxmax()
    except ValueError:
        # value_counts() is empty when the whole column is missing
        common_val = 0
    df[col] = df[col].fillna(common_val)

# FILL IN MISSING VALUES
# Round to 2 decimal places, then replace missing numerical values with the
# column mean (computed on the rounded values).
df[num_f] = df[num_f].round(2)
df[num_f] = df[num_f].fillna(df[num_f].mean())

# For missing target values, drop the row
df = df.dropna(subset=target)
# change the dtype of target to 8-bit integer
df[target] = df[target].astype(np.int8)

# one-hot encode the categorical variables
df = pd.get_dummies(df, columns=cat_f, dtype=np.int8)

# standardize the continuous variables
# NOTE(review): the scaler is fit on the full dataset before any train/test
# split, so held-out rows influence the scaling statistics, and the fitted
# scaler is not persisted for reuse at inference time.
scaler = StandardScaler()
if num_f:
    df[num_f] = scaler.fit_transform(df[num_f])

# Save the processed dataframe (create the output directory on a fresh checkout)
os.makedirs('data/processed', exist_ok=True)
df.to_csv('data/processed/processed.csv', index=False)

my target is ['ARR_DEL15']


"categorical features: ['MONTH', 'OP_CARRIER_AIRLINE_ID', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']"

"numerical features: ['DEP_DELAY', 'TAXI_OUT', 'DISTANCE']"

df columns: Index(['ARR_DEL15', 'MONTH', 'OP_CARRIER_AIRLINE_ID', 'ORIGIN_AIRPORT_ID',
       'DEST_AIRPORT_ID', 'DEP_DELAY', 'TAXI_OUT', 'DISTANCE'],
      dtype='object')


In [42]:
""" 
Now that the data is processed, we need to balance the classes 
Randomly downsample the majority class to match the minority class
"""

df = pd.read_csv('data/processed/processed.csv')

minority_class = df[df[target[0]] == 1]
majority_class = df[df[target[0]] == 0]

# Label 1 is assumed to be the rarer class; if the data says otherwise, swap
# the two frames so sample() is never asked for more rows than exist.
if len(minority_class) > len(majority_class):
    minority_class, majority_class = majority_class, minority_class

# Downsample the majority class. The fixed seed makes balanced.csv identical
# across Restart & Run All.
majority_class = majority_class.sample(n=len(minority_class), random_state=42)

# Concatenate the minority and majority classes
df = pd.concat([minority_class, majority_class])
df.to_csv('data/processed/balanced.csv', index=False)

In [46]:

def splitTrainingData(df, featureCols, targetCol, random=False):
    """Split ``df`` into training and test sets (80% / 20%).

    Args:
        df: DataFrame holding both the feature and the target columns.
        featureCols: Column labels to select as features. Any target columns
            among them are removed from the feature matrix.
        targetCol: Label, or list of labels, of the target column(s).
        random: If True the split is unseeded and differs between runs. If
            False (the default) a fixed seed makes the split reproducible.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # "random" means "do not pin the seed"; the default must be reproducible.
    seed = None if random else 42
    # errors='ignore' lets callers pass featureCols that already exclude the target.
    features = df[featureCols].drop(columns=targetCol, errors='ignore')
    labels = df[targetCol]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=seed
    )
    return X_train, X_test, y_train, y_test


def main():
    """Train the classifier on the balanced dataset and save it to disk.

    Reads data/processed/balanced.csv, splits it with splitTrainingData,
    writes the training feature names (one per line) to
    data/processed/X_train_columns.txt, fits an NLTK_Binary_Classifier for
    6 epochs with batch size 64, and saves it to model/model.keras.

    Relies on the notebook-level ``target`` list being defined before the call.
    """
    df = pd.read_csv('data/processed/balanced.csv')

    # NOTE(review): X_test / y_test are produced but never used, so the model
    # is not evaluated on held-out data here.
    X_train, X_test, y_train, y_test = splitTrainingData(df, df.columns, target)

    # Record the feature column order used for training.
    with open('data/processed/X_train_columns.txt', 'w') as f:
        f.writelines(f'{col}\n' for col in X_train.columns)

    model = NLTK_Binary_Classifier()
    model.compile()
    model.fit(X_train, y_train, epochs=6, batch_size=64)

    # Make sure the target directory exists before saving the model.
    os.makedirs('model', exist_ok=True)
    model.model.save('model/model.keras')

    # Runs after the save; presumably re-initialises the in-memory model
    # only - TODO confirm against model.py.
    model.reset_weights()


main()

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
