In [None]:
import lightgbm
! pip install pandas scikit-learn

In [None]:
import pandas as pd

# Opt in to pandas' future "no silent downcasting" behaviour.
# NOTE(review): presumably set so the .replace() calls used for encoding later on
# do not emit the downcasting FutureWarning - confirm against the pandas version in use.
pd.set_option("future.no_silent_downcasting", True)

In [None]:
# Load the dataset from a path relative to the notebook's working directory
# and display it as the cell output.
df = pd.read_csv('./Data/Clean_Dataset.csv')
df

In [None]:
# Remove the 'Unnamed: 0' column (presumably a serialized row index - verify against the CSV).
# errors='ignore' together with reassignment keeps this cell safe to re-run:
# a second execution no longer raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Install the two EDA report generators imported in the following cells.
! pip install ydata-profiling sweetviz

In [None]:
import os

from ydata_profiling import ProfileReport

# Folder that receives every generated EDA report.
directory = 'EDA_reports'
# exist_ok avoids the separate existence check (and the race between check and creation).
os.makedirs(directory, exist_ok=True)

# Build the path once so the existence check and the write always target the same file.
profile_report_path = os.path.join(directory, 'airline_pricing_profile_report.html')

# Only generate the report when it is not already on disk.
if not os.path.exists(profile_report_path):
    report = ProfileReport(df, title='Airline Pricing Profile Report')
    report.to_file(profile_report_path)

In [None]:
import sweetviz as sv

# Generate the Sweetviz HTML report unless a previous run already produced it.
sweetviz_report_path = directory + '/airline_pricing_sweetviz_report.html'
report_missing = not os.path.exists(sweetviz_report_path)
if report_missing:
    report = sv.analyze(df)
    report.show_html(filepath=sweetviz_report_path)

In [None]:
# Keep an independent (deep) copy of the frame before the columns are encoded below.
original_df = df.copy(deep=True)
original_df

In [None]:
# Integer code assigned to each category, per column.
# NOTE(review): the codes are arbitrary, so they impose an ordering on nominal
# features (airline, cities, time slots); linear models will treat that order as meaningful.
category_codes = {
    'airline': {
        'Vistara': 1,
        'Air_India': 2,
        'Indigo': 3,
        'GO_FIRST': 4,
        'AirAsia': 5,
        'SpiceJet': 6
    },
    'source_city': {
        'Delhi': 1,
        'Mumbai': 2,
        'Bangalore': 3,
        'Kolkata': 4,
        'Hyderabad': 5,
        'Chennai': 6
    },
    'departure_time': {
        'Morning': 1,
        'Early_Morning': 2,
        'Evening': 3,
        'Night': 4,
        'Afternoon': 5,
        'Late_Night': 6
    },
    'arrival_time': {
        'Night': 1,
        'Evening': 2,
        'Morning': 3,
        'Afternoon': 4,
        'Early_Morning': 5,
        'Late_Night': 6
    },
    'destination_city': {
        'Mumbai': 1,
        'Delhi': 2,
        'Bangalore': 3,
        'Kolkata': 4,
        'Hyderabad': 5,
        'Chennai': 6
    },
    'stops': {
        'zero': 1,
        'one': 2,
        'two_or_more': 3
    },
    'class': {
        'Economy': 1,
        'Business': 2
    },
}

# One pass over the table replaces seven copy-pasted statements.
# Values without an entry in the mapping are left untouched by replace().
for column, codes in category_codes.items():
    df[column] = df[column].replace(codes)

# The 'flight' column is not used as a feature. errors='ignore' plus reassignment
# lets the cell be re-run without a KeyError once the column is already gone.
df = df.drop(columns=['flight'], errors='ignore')
df

In [None]:
# Summary statistics of a float-cast view of the frame; the cast result is not stored here.
df.astype(float).describe()

In [None]:
# Rebind df to the float-cast frame so every column is numeric from here on.
df = df.astype(float)

In [None]:
from scipy import stats

continuous_columns = ['duration', 'days_left', 'price']
# A value is treated as an outlier when it lies more than this many
# standard deviations away from its column mean.
zscore_threshold = 3

for col in continuous_columns:
    # Outliers are the rows whose |z-score| EXCEEDS the threshold.
    # (Selecting |z| < threshold would pick the inliers and overwrite almost the whole column.)
    outlier_mask = abs(stats.zscore(df[col])) > zscore_threshold
    # The replacement value is the column mean computed before any value is overwritten.
    mean_value = df[col].mean()
    df.loc[outlier_mask, col] = mean_value


## Generate new reports with preprocessed data

In [None]:
# Regenerate both EDA reports from the preprocessed frame, skipping any that already exist.
processed_profile_path = directory + '/airline_pricing_profile_report_PROCESSED.html'
processed_sweetviz_path = directory + '/airline_pricing_sweetviz_report_PROCESSED.html'

profile_missing = not os.path.exists(processed_profile_path)
if profile_missing:
    report = ProfileReport(df, title='Airline Pricing Profile Report')
    report.to_file(os.path.join(directory, 'airline_pricing_profile_report_PROCESSED.html'))

sweetviz_missing = not os.path.exists(processed_sweetviz_path)
if sweetviz_missing:
    report = sv.analyze(df)
    report.show_html(filepath=processed_sweetviz_path)

## Modelling

In [None]:
from sklearn.model_selection import train_test_split

# 'price' is the regression target; every other column is used as a feature.
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
import optuna
import sys
from contextlib import contextmanager
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso

In [None]:


# Baseline: a plain linear regression fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
linear_regression_model = LinearRegression().fit(X_train, y_train)

# Score the baseline on the held-out test split.
y_pred = linear_regression_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

for metric_label, metric_value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
):
    print(metric_label, metric_value)

In [None]:
# NOTE(review): optuna is already imported in an earlier cell of this notebook, so on a
# fresh environment that import fails before this install is reached - the install
# needs to run before the import.
! pip install optuna

In [None]:
# If an algorithm produces too many warning messages, skip them with this:
@contextmanager
def suppress_stdout_stderr():
    """Temporarily point sys.stdout and sys.stderr at os.devnull.

    While the ``with`` body runs, anything written to either stream is discarded.
    The stream objects that were active on entry are put back on exit, including
    when the body raises.
    """
    with open(os.devnull, 'w') as sink:
        saved_streams = (sys.stdout, sys.stderr)
        sys.stdout = sys.stderr = sink
        try:
            yield
        finally:
            # Restore both streams together, whatever happened inside the body.
            sys.stdout, sys.stderr = saved_streams


def objective(trial, model_class, param_grid, X, y):
    """Optuna objective: sample hyperparameters and return the mean 3-fold CV MSE.

    Parameters
    ----------
    trial
        Optuna trial used to draw one value per parameter.
    model_class
        Estimator class; it is instantiated as ``model_class(**params)``.
    param_grid : dict
        Maps a parameter name to a non-empty list of candidate values.
        * all ints (bools excluded)      -> integer range [min, max]
        * all numbers, at least one float -> float range [min, max]
        * anything else                   -> categorical choice among the listed values
        An empty dict means the model is evaluated with its default parameters.
    X, y
        Features and target handed to ``cross_val_score``.

    Returns
    -------
    float
        Mean squared error averaged over the folds (lower is better).

    Raises
    ------
    ValueError
        If a parameter is listed with no candidate values.
    """

    def _is_int(value):
        # bool is a subclass of int; True/False must stay categorical, not become 0..1.
        return isinstance(value, int) and not isinstance(value, bool)

    def _is_number(value):
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    params = {}
    for param, values in param_grid.items():
        if len(values) == 0:
            raise ValueError(f"param_grid[{param!r}] has no candidate values")

        # Inspect every candidate, not just the first one, so a mixed list such as
        # [1, 0.5] is searched as a float range instead of being sent to suggest_int.
        if all(_is_int(v) for v in values):
            params[param] = trial.suggest_int(param, min(values), max(values))
        elif all(_is_number(v) for v in values):
            params[param] = trial.suggest_float(param, min(values), max(values))
        else:
            params[param] = trial.suggest_categorical(param, values)

    model_to_try = model_class(**params)
    score = cross_val_score(model_to_try, X, y, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
    # The scorer returns negated MSE; flip the sign so the study can minimise it.
    return -score.mean()

In [None]:
# Folder for the persisted search results; exist_ok makes the call safe to repeat
# and removes the gap between checking for the folder and creating it.
os.makedirs("Training Scores", exist_ok=True)

# Candidate models for the Optuna search.
# 'model' holds the estimator CLASS (it is instantiated per trial by objective()),
# 'param_grid' lists candidate values; objective() searches between their min and max
# for numeric parameters.
models = {
    'Linear Regression': {
        'model': LinearRegression,
        'param_grid': {}  # nothing to tune
    },
    'Random Forest': {
        'model': RandomForestRegressor,
        'param_grid': {
            'n_estimators': [40, 60, 100],
            'max_depth': [10, 20, 30],
            'min_samples_split': [3, 5],
            'min_samples_leaf': [1, 2]
        }
    },
    #'Support Vector Machine': {        ## Takes way too long
    #    'model': SVR,
    #    'param_grid': {
    #        'C': [1, 10],
    #        'kernel': ['rbf', 'poly'],
    #        'gamma': ['scale', 'auto']
    #    }
    #},
    'Lasso': {
        'model': Lasso,
        'param_grid': {
            'alpha': [0.01, 0.1, 0.5, 1.0]
        }
    },
    'Ridge': {
        'model': Ridge,
        'param_grid': {
            'alpha': [0.01, 0.1, 0.5, 1.0],
        }
    },
    'LightGBM': {
        'model': lightgbm.LGBMRegressor,
        'param_grid': {
            'n_estimators': [10, 30, 60, 80, 100,
                             120, 140, 180, 200]
        }
    }
}

best_models = {}

# Full training split is used; slice here (e.g. [:10000]) for a quicker experiment.
X_train_subset = X_train  # [:10000]
y_train_subset = y_train  # [:10000]

optuna_scores_path = './Training Scores/Optuna_Scores.txt'

# The search is expensive, so it only runs when no saved result file exists.
if not os.path.exists(optuna_scores_path):
    best_optuna_models = {}
    for model_name, model_info in models.items():
        model = model_info['model']
        param_grid = model_info['param_grid']
        # With nothing to tune every trial would evaluate the same configuration,
        # so a single trial is enough.
        n_trials = 100 if param_grid else 1

        print('NOW BEGINNING: ', model_name)
        # Seed the sampler so the sequence of suggested hyperparameters is reproducible.
        study = optuna.create_study(direction='minimize',
                                    sampler=optuna.samplers.NSGAIISampler(seed=42))
        # The lambda is consumed inside this iteration, so capturing the loop
        # variables `model` and `param_grid` is safe.
        study.optimize(
            lambda trial: objective(trial,
                                    model,
                                    param_grid,
                                    X_train_subset,
                                    y_train_subset),
            n_trials=n_trials)
        best_optuna_models[model_name] = {'best_params': study.best_params, 'best_score': study.best_value}

    # Persist the summary; the presence of this file is what skips the search next time.
    with open(optuna_scores_path, 'w') as f:
        f.write(str(best_optuna_models))


## Best Optuna results

{'Linear Regression': {'best_params': {}, 'best_score': 26500973.916033458},
 'Random Forest': {'best_params': {'n_estimators': 63,
   'max_depth': 18,
   'min_samples_split': 5,
   'min_samples_leaf': 2},
  'best_score': 30022133.358639557},
 'Lasso': {'best_params': {'alpha': 0.9818794405133321},
  'best_score': 26499813.5938502},
 'Ridge': {'best_params': {'alpha': 0.9907901001005788},
  'best_score': 26499610.89094487},
 'LightGBM': {'best_params': {'n_estimators': 14},
  'best_score': 26536354.61283967}}

In [None]:
# Candidate models for the exhaustive grid search.
# Unlike the Optuna registry, 'model' holds an estimator INSTANCE here because it is
# passed directly as GridSearchCV's `estimator`; every value in 'param_grid' is tried.
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'param_grid': {}  # nothing to tune
    },
    'Random Forest': {
        'model': RandomForestRegressor(),
        'param_grid': {
            'n_estimators': [40, 60, 100],
            'max_depth': [10, 20, 30],
            'min_samples_split': [3, 5],
            'min_samples_leaf': [1, 2]
        }
    },
    #'Support Vector Machine': {        ## Takes way too long
    #    'model': SVR,
    #    'param_grid': {
    #        'C': [1, 10],
    #        'kernel': ['rbf', 'poly'],
    #        'gamma': ['scale', 'auto']
    #    }
    #},
    'Lasso': {
        'model': Lasso(),
        'param_grid': {
            'alpha': [0.01, 0.1, 0.5, 1.0]
        }
    },
    'Ridge': {
        'model': Ridge(),
        'param_grid': {
            'alpha': [0.01, 0.1, 0.5, 1.0],
        }
    },
    'LightGBM': {
        'model': lightgbm.LGBMRegressor(),
        'param_grid': {
            'n_estimators': [10, 30, 60, 80, 100,
                             120, 140, 180, 200]
        }
    }
}
grid_search_scores_path = './Training Scores/GridSearchCV_Scores.txt'

# The grid search is expensive, so it only runs when no saved result file exists.
if not os.path.exists(grid_search_scores_path):
    # 1) Tune every candidate with 3-fold CV on the training data.
    for model_name, model_info in models.items():
        grid_search = GridSearchCV(estimator=model_info['model'],
                                   param_grid=model_info['param_grid'],
                                   cv=3, n_jobs=-1,
                                   scoring='neg_mean_squared_error', verbose=2)

        grid_search.fit(X_train_subset, y_train_subset)

        # Store the best model and hyperparameters
        best_models[model_name] = {'best_params': grid_search.best_params_,
                                   'best_model': grid_search.best_estimator_}

    # 2) Score each tuned model on the held-out test split.
    X_test_subset = X_test  # [:10000]
    y_test_subset = y_test  # [:10000]

    for model_name, model_info in best_models.items():
        best_model = model_info['best_model']
        y_pred_best = best_model.predict(X_test_subset)

        mae_best = mean_absolute_error(y_test_subset, y_pred_best)
        mse_best = mean_squared_error(y_test_subset, y_pred_best)
        r2_best = r2_score(y_test_subset, y_pred_best)

        print(f"{model_name} Metrics:")
        print(f"Mean Absolute Error (MAE): {mae_best:.2f}")
        print(f"Mean Squared Error (MSE): {mse_best:.2f}")
        print(f"R-squared (R2): {r2_best:.2f}")
        print(f"The parameters of the best model:\n{model_info['best_params']}")
        print()

    # 3) Persist the summary exactly once; writing it twice would concatenate
    #    two dict reprs into one unparseable file.
    with open(grid_search_scores_path, 'w') as f:
        f.write(str(best_models))

Linear Regression Metrics:  
Mean Absolute Error (MAE): 4511.26  
Mean Squared Error (MSE): 50755644.13  
R-squared (R2): 0.90  
The parameters of the best model:  
{}

Random Forest Metrics:  
Mean Absolute Error (MAE): 2739.49  
Mean Squared Error (MSE): 23668748.21  
R-squared (R2): 0.95  
The parameters of the best model:  
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 40}  

Support Vector Machine Metrics (from an earlier run; SVR is commented out of the grid above because it takes too long):  
Mean Absolute Error (MAE): 6515.03  
Mean Squared Error (MSE): 133485858.30  
R-squared (R2): 0.74  
The parameters of the best model:  
{'C': 10, 'gamma': 'auto', 'kernel': 'poly'}  

Lasso Metrics:  
Mean Absolute Error (MAE): 4510.54  
Mean Squared Error (MSE): 50751417.74  
R-squared (R2): 0.90  
The parameters of the best model:  
{'alpha': 1.0}  

Ridge Metrics:  
Mean Absolute Error (MAE): 4511.25  
Mean Squared Error (MSE): 50755310.50  
R-squared (R2): 0.90  
The parameters of the best model:  
{'alpha': 0.01}  

LightGBM Metrics:  
Mean Absolute Error (MAE): 2725.81  
Mean Squared Error (MSE): 22106792.60  
R-squared (R2): 0.96  
The parameters of the best model:  
{'n_estimators': 60}  

## Training best final model

In [None]:
# Hyperparameters of the final model; n_estimators=60 matches the LightGBM entry
# in the grid-search results recorded in this notebook.
best_lightGBM_params = {
    'n_estimators': 60
}

# Fit the final LightGBM regressor on the full training split.
best_lightGBM_model = lightgbm.LGBMRegressor(**best_lightGBM_params)
best_lightGBM_model.fit(X_train, y_train)

In [None]:
# Evaluate the final LightGBM model on the held-out test split.
y_pred = best_lightGBM_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for metric_label, metric_value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("R-squared (R2):", r2),
):
    print(metric_label, metric_value)

Mean Absolute Error (MAE): 2451.6463877124097  
Mean Squared Error (MSE): 17319690.345978007  
R-squared (R2): 0.9664009972136046  

# TODO:
## - Optuna?
## - Check for duplicate data (not needed)
## - Anomaly Detection (won't be done)
## - Outlier Handling (DONE)
## - Neural Network Regression
## - dockerization (wasn't shown)

## - REST API building and testing

# Regression using neural networks

In [None]:
# TensorFlow supplies the Keras API imported in the next cell.
! pip install tensorflow

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


def mean_absolute_percentage_error(y_true, y_pred):
    """Keras metric: mean absolute percentage error, in percent.

    |y_true| is clipped to at least 1e-8 in the denominator so a zero target
    cannot cause a division by zero.
    """
    return tf.reduce_mean(tf.abs((y_true - y_pred) / tf.clip_by_value(tf.abs(y_true), 1e-8, tf.float32.max))) * 100


# Carve a validation set out of the TRAINING data. Early stopping selects the best
# epoch from the validation loss, so validating on the test split would leak test
# information into the model and bias the test metrics reported afterwards.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Stop after 5 epochs without improvement and roll back to the best weights seen.
early_stopping = EarlyStopping(patience=5, restore_best_weights=True, verbose=1)

optimizer = Adam(learning_rate=0.0005)

# Small fully connected network: two hidden layers of 30 ReLU units, one linear output.
model = tf.keras.Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(30, activation='relu'),
    #Dropout(0.3),
    Dense(30, activation='relu'),
    Dense(1, activation='linear')  # Linear activation for regression
])

model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mse', mean_absolute_percentage_error])
history = model.fit(X_fit, y_fit, epochs=50, validation_data=(X_val, y_val), verbose=1, callbacks=[early_stopping], batch_size=64)

In [None]:
# Plot the per-epoch MSE on the training and validation data.
fig, ax = plt.subplots(figsize=(8, 5))

for history_key, curve_label in (('mse', 'Training MSE'), ('val_mse', 'Validation MSE')):
    ax.plot(history.history[history_key], label=curve_label)

ax.set_title('Training and Validation Metrics')
ax.set_xlabel('Epochs')
ax.set_ylabel('Metrics')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Plot the per-epoch MAPE on the training and validation data.
fig, ax = plt.subplots(figsize=(8, 5))

for history_key, curve_label in (
    ('mean_absolute_percentage_error', 'Training MAPE'),
    ('val_mean_absolute_percentage_error', 'Validation MAPE'),
):
    ax.plot(history.history[history_key], label=curve_label)

ax.set_title('Training and Validation Metrics')
ax.set_xlabel('Epochs')
ax.set_ylabel('Metrics')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Plot the per-epoch MAPE on the training and validation data.
# NOTE(review): this cell repeats the MAPE plot from the preceding cell line for line;
# it looks like a leftover copy and is a candidate for removal.
plt.figure(figsize=(8, 5))

plt.plot(history.history['mean_absolute_percentage_error'], label='Training MAPE')
plt.plot(history.history['val_mean_absolute_percentage_error'], label='Validation MAPE')

plt.title('Training and Validation Metrics')
plt.xlabel('Epochs')
plt.ylabel('Metrics')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import numpy as np

# Predict on the test split and flatten the network output to 1-D so it lines up
# element-wise with the targets.
y_pred_test = model.predict(X_test, verbose=0).flatten()

# Work on the raw array so the subtraction below is positional rather than index-aligned.
if isinstance(y_test, pd.Series):
    y_test = y_test.values

# Mean absolute percentage error on the test split, in percent.
relative_errors = np.abs((y_test - y_pred_test) / y_test)
mape_test = np.mean(relative_errors) * 100
print("MAPE", mape_test)