In [1]:
import pandas as pd
import numpy as np
from scipy.stats import stats
from constants import CONTINUOUS_COLUMNS, NOMINAL_COLUMNS, DISCRETE_COLUMNS
pd.set_option("future.no_silent_downcasting", True)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from encoder import manual_df_encode

In [2]:
data_path = './Data/Clean_Dataset.csv'

df = pd.read_csv(data_path)
df.drop(columns=['Unnamed: 0'], inplace=True)
df.drop(columns=['flight'], inplace=True)

In [3]:
df.isna().sum()

airline             0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [4]:
outlier_info = {}
zscore_info = {}
for col in CONTINUOUS_COLUMNS:
    # Calculate Z-score values for the column
    df[col + '_zscore'] = stats.zscore(df[col])

    # Assuming that outliers are indicated by absolute Z-scores greater than 3
    outlier_indices = df[abs(df[col + '_zscore']) > 3].index

    # Replace outliers with the median of the column
    mean_value = df[col].mean()
    outlier_info[col] = {'outlier_replacement': mean_value, 'outlier_indices': list(outlier_indices)}

    df.loc[outlier_indices, col] = mean_value

    # Drop the Z-score column as it's no longer needed
    df.drop(columns=[col + '_zscore'], inplace=True)

  df[col + '_zscore'] = stats.zscore(df[col])
  df[col + '_zscore'] = stats.zscore(df[col])
  df.loc[outlier_indices, col] = mean_value


In [7]:
onehot_encoders = {}
new_columns = []

for col in NOMINAL_COLUMNS:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    # print("Type of OH encoder: ", type(encoder))
    new_data = encoder.fit_transform(df[col].to_numpy().reshape(-1, 1))

    new_columns.extend(encoder.get_feature_names_out([col]))

    new_df = pd.DataFrame(new_data, columns=encoder.get_feature_names_out([col]))
    df = pd.concat([df, new_df], axis=1)

    onehot_encoders[col] = encoder

df.drop(columns=NOMINAL_COLUMNS, inplace=True)

In [8]:
X = df.drop(columns=['price'])
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def mean_absolute_percentage_error(y_true, y_pred):
    return tf.reduce_mean(tf.abs((y_true - y_pred) / tf.clip_by_value(tf.abs(y_true), 1e-8, tf.float32.max))) * 100


early_stopping = EarlyStopping(patience=3, restore_best_weights=True, verbose=1)

optimizer = Adam(learning_rate=0.0005)

model = tf.keras.Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    #Dropout(0.3),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    #mape 15 with 4 internal hierarchies
    Dense(1, activation='linear')  # Linear activation for regression
])

model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mse', mean_absolute_percentage_error])
history = model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), verbose=1, callbacks=[early_stopping]) # , batch_size=64)


y_pred_test = model.predict(X_test, verbose=0)

y_test = y_test.values if isinstance(y_test, pd.Series) else y_test
y_pred_test = y_pred_test.flatten()

mape_test = np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100
print("MAPE", mape_test)

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

mse = mean_squared_error(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

Epoch 1/100
[1m7504/7504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 667us/step - loss: 215863712.0000 - mean_absolute_percentage_error: 78.7915 - mse: 215863712.0000 - val_loss: 28512966.0000 - val_mean_absolute_percentage_error: 20.4348 - val_mse: 28512966.0000
Epoch 2/100
[1m7504/7504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 704us/step - loss: 27743878.0000 - mean_absolute_percentage_error: 21.4328 - mse: 27743878.0000 - val_loss: 25887454.0000 - val_mean_absolute_percentage_error: 20.0535 - val_mse: 25887454.0000
Epoch 3/100
[1m7504/7504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 667us/step - loss: 24509594.0000 - mean_absolute_percentage_error: 20.5844 - mse: 24509594.0000 - val_loss: 23961430.0000 - val_mean_absolute_percentage_error: 20.5703 - val_mse: 23961430.0000
Epoch 4/100
[1m7504/7504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 651us/step - loss: 23469360.0000 - mean_absolute_percentage_error: 19.6460 - mse: 23469360.0000 - v