In [None]:
# Importing the libraries here 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau
import matplotlib.pyplot as plt
import math

In [None]:
# Load the CSV, keep the selected columns, drop rows with missing values, replace zero Pressure readings with the mean pressure, and order the rows by Timestamp
def load_and_preprocess_data(file_path, selected_columns):
    """Load the CSV at ``file_path`` and return a cleaned frame of ``selected_columns``.

    Processing steps, in order:

    1. Read the CSV. If it has a ``Timestamp`` column, sort the rows by it.
       The sort is done *before* the column selection so that it also works
       when ``Timestamp`` is not listed in ``selected_columns``.
    2. Keep only ``selected_columns`` and drop every row with a missing value.
    3. Replace ``Pressure`` readings that are exactly 0 with the mean of the
       non-zero readings. Zeros are excluded from that mean; otherwise the
       values being replaced would pull the replacement value towards zero.
    4. Reset the index. If ``Timestamp`` is one of the selected columns it is
       overwritten with the running numbers 1..n.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    selected_columns : list of str
        Columns to keep. Must contain ``'Pressure'``.

    Returns
    -------
    pandas.DataFrame
        The cleaned data with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a selected column, or ``'Pressure'``, is not present.
    """
    raw = pd.read_csv(file_path)
    if 'Timestamp' in raw.columns:
        # Stable sort keeps the file order for rows sharing a timestamp.
        raw = raw.sort_values(by='Timestamp', kind='stable')

    # .copy() so the in-place Pressure fix below never writes into a view of `raw`.
    data = raw[selected_columns].dropna().copy()

    zero_pressure = data['Pressure'] == 0
    valid_pressure = data.loc[~zero_pressure, 'Pressure']
    # Fall back to the plain mean when there is no non-zero reading to average.
    pressure_mean = valid_pressure.mean() if not valid_pressure.empty else data['Pressure'].mean()
    print("Mean Pressure ", pressure_mean)
    print(data[zero_pressure].count())
    if zero_pressure.any():
        # Cast first: writing a float mean into an integer column is rejected
        # (or warned about) by recent pandas versions.
        data['Pressure'] = data['Pressure'].astype(float)
        data.loc[zero_pressure, 'Pressure'] = pressure_mean
    print(data[data['Pressure'] == 0].count())

    data = data.reset_index(drop=True)
    if 'Timestamp' in data.columns:
        # Replace the raw timestamps by their chronological rank.
        data['Timestamp'] = range(1, len(data) + 1)
    return data

In [None]:
# This function splits the data into feature set and target
def split_features_target(data, target_column):
    """Separate ``data`` into a feature frame and the target column.

    Returns ``(features, target)``: ``features`` is ``data`` without
    ``target_column`` and ``target`` is ``data[target_column]``.
    ``data`` itself is not modified.
    """
    target = data[target_column]
    features = data.drop(columns=target_column)
    return features, target

In [None]:
def split_train_test(X, y, test_size=0.3, random_state=420):
    """Split features and target into train and test parts.

    Thin wrapper around scikit-learn's ``train_test_split`` that forwards
    ``test_size`` and ``random_state`` and returns its result unchanged, to
    be unpacked as ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)

In [None]:
# Function to scale the data
def scale_data(X_train, X_test, y_train, y_test):
    """Scale features with ``StandardScaler`` and the target with ``MinMaxScaler``.

    Both scalers are fitted on the training part only and then applied to
    the test part.

    Returns a 6-tuple: scaled train features, scaled test features, scaled
    train target (1-D), scaled test target (1-D), the fitted feature scaler
    and the fitted target scaler.
    """
    def _as_column(series):
        # The scaler is fed a single-column 2-D array built from the series values.
        return series.values.reshape(-1, 1)

    feature_scaler = StandardScaler()
    target_scaler = MinMaxScaler()

    # Fit on the training split ...
    X_train_scaled = feature_scaler.fit_transform(X_train)
    y_train_scaled = target_scaler.fit_transform(_as_column(y_train)).flatten()

    # ... and reuse the fitted parameters for the test split.
    X_test_scaled = feature_scaler.transform(X_test)
    y_test_scaled = target_scaler.transform(_as_column(y_test)).flatten()

    return (
        X_train_scaled,
        X_test_scaled,
        y_train_scaled,
        y_test_scaled,
        feature_scaler,
        target_scaler,
    )

In [None]:
# The model is built in this block of code
def build_model(input_dim, hidden_layer_sizes=(64, 128, 128, 64, 32, 8), learning_rate_init=0.004):
    """Build and compile a fully connected regression network.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_layer_sizes : sequence of int
        Width of each hidden ReLU layer, in order. An empty sequence is
        allowed and yields a purely linear model (input -> single output).
    learning_rate_init : float
        Initial learning rate handed to the Adam optimizer.

    Returns
    -------
    A compiled ``Sequential`` model with one linear output unit and
    mean-squared-error loss.
    """
    model = Sequential()
    # Declare the input explicitly instead of passing `input_dim=` to the
    # first Dense layer: that kwarg is deprecated in recent Keras releases,
    # and it forced special handling of hidden_layer_sizes[0], which crashed
    # on an empty configuration.
    model.add(tf.keras.Input(shape=(input_dim,)))
    for layer_size in hidden_layer_sizes:
        model.add(Dense(layer_size, activation='relu'))
    model.add(Dense(1))  # single linear unit for the regression target
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_init)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

In [None]:
# Keras callback whose on_epoch_end hook records the MSE, MAE, RMSE and R² on the supplied evaluation data at the end of each epoch.
class MetricsHistory(Callback):
    """Keras callback that scores the model on a fixed dataset after every epoch.

    After each epoch the model predicts ``X_test`` and the MSE, MAE, RMSE and
    R² against ``y_test`` are appended to ``mse_history``, ``mae_history``,
    ``rmse_history`` and ``r2_history``. Entry ``i`` of each list belongs to
    epoch ``i``.
    """

    def __init__(self, X_test, y_test):
        """Store the evaluation data and start with empty metric histories."""
        super().__init__()
        self.X_test = X_test
        self.y_test = y_test
        self.mse_history = []
        self.r2_history = []
        self.mae_history = []
        self.rmse_history = []

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the current model and append one value to each history."""
        # verbose=0: without it predict() prints its own progress bar on every
        # epoch, flooding the output even when fit() itself runs with verbose=0.
        y_pred = self.model.predict(self.X_test, verbose=0).flatten()
        mse = mean_squared_error(self.y_test, y_pred)
        mae = mean_absolute_error(self.y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(self.y_test, y_pred)
        self.mse_history.append(mse)
        self.mae_history.append(mae)
        self.rmse_history.append(rmse)
        self.r2_history.append(r2)

In [None]:
def train_and_record_metrics(X_train_scaled, y_train, X_test_scaled, y_test, input_dim, hidden_layer_sizes, learning_rate, batch_size, epochs=300):
    """Train one network configuration and record per-epoch metrics.

    Parameters
    ----------
    X_train_scaled, y_train
        Training features and target.
    X_test_scaled, y_test
        Data used both as Keras ``validation_data`` and for the per-epoch
        metrics collected by ``MetricsHistory``.
    input_dim : int
        Number of input features.
    hidden_layer_sizes : sequence of int
        Hidden layer widths forwarded to ``build_model``.
    learning_rate : float
        Initial learning rate forwarded to ``build_model``.
    batch_size : int
        Mini-batch size used by ``fit``.
    epochs : int, default 300
        Number of training epochs.

    Returns
    -------
    (model, metrics_history)
        The trained model and the ``MetricsHistory`` callback holding the
        per-epoch MSE / MAE / RMSE / R² lists.
    """
    # Initialise the model with the given parameters.
    model = build_model(input_dim=input_dim, hidden_layer_sizes=hidden_layer_sizes, learning_rate_init=learning_rate)
    # Callback that stores the metrics at the end of every epoch.
    metrics_history = MetricsHistory(X_test_scaled, y_test)
    # Halve the learning rate (down to 1e-5) when val_loss has not improved
    # for 5 epochs.
    # NOTE(review): val_loss is computed on the data passed as
    # validation_data below, i.e. the same set the final metrics are reported
    # on, so the schedule is tuned on the evaluation data. Consider a
    # separate validation split.
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5, verbose=1)
    model.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test),
              epochs=epochs, batch_size=batch_size, callbacks=[metrics_history, lr_scheduler], verbose=0)

    return model, metrics_history

In [None]:
# This function is plotting the MSE, MAE, RMSE and R2 from the metrics_histories stored for each model trained with different type of input.
def plot_metrics(metrics_histories, labels):
    """Draw a 2x2 grid of MSE, MAE, RMSE and R² curves over the epochs.

    ``metrics_histories`` holds one history object per trained model, each
    exposing ``mse_history``, ``mae_history``, ``rmse_history`` and
    ``r2_history``. ``labels`` supplies the legend entry for each of them,
    in the same order.
    """
    # (history attribute, y-axis label, panel title) for panels 1..4
    panels = (
        ('mse_history', 'MSE', 'MSE vs Epochs'),
        ('mae_history', 'MAE', 'MAE vs Epochs'),
        ('rmse_history', 'RMSE', 'RMSE vs Epochs'),
        ('r2_history', 'R² Score', 'R² vs Epochs'),
    )

    plt.figure(figsize=(20, 12))

    for position, (attribute, y_label, title) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        for label, history in zip(labels, metrics_histories):
            plt.plot(getattr(history, attribute), label=label, linewidth=2)
        plt.xlabel('Epoch', fontsize=18)
        plt.ylabel(y_label, fontsize=18)
        plt.title(title, fontsize=20)
        plt.legend(fontsize=25)

    plt.tight_layout()
    plt.show()


In [None]:
def ablation_layer_study(selected_columns, target_column='Level', data_path='./Datasets/2022-Kippure.csv'):
    """Train networks of decreasing depth and compare their test metrics.

    The data is loaded, split into train/test and scaled once. One model is
    then trained per hidden-layer configuration (six down to two hidden
    layers). The per-epoch metric curves of all configurations are plotted
    together and a summary table of the final metrics is printed.

    Parameters
    ----------
    selected_columns : list of str
        Columns to load from the CSV; must include ``target_column``.
    target_column : str, default 'Level'
        Column to predict.
    data_path : str, default './Datasets/2022-Kippure.csv'
        CSV file to load.

    Returns
    -------
    pandas.DataFrame
        One row per configuration with the columns ``Hidden Layers``,
        ``MSE``, ``MAE`` and ``R²``. The metrics are computed on the
        min-max scaled target, so MSE and MAE are in scaled units.
    """
    # Load and clean the data.
    merged_df = load_and_preprocess_data(data_path, selected_columns)

    # X holds the feature columns, y the target column.
    X, y = split_features_target(merged_df, target_column)
    X_train, X_test, y_train, y_test = split_train_test(X, y)
    # Features go through StandardScaler, the target through MinMaxScaler.
    # The fitted scalers are not needed here.
    X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled, _, _ = scale_data(X_train, X_test, y_train, y_test)
    print("Training data shape:", X_train_scaled.shape)
    print("Testing data shape:", X_test_scaled.shape)

    # Model parameters shared by every configuration.
    learning_rate = 0.001
    batch_size = 256
    hidden_layer_configurations = [
        (64, 128, 128, 64, 32, 8),
        (64, 128, 128, 64, 32),
        (64, 128, 128, 64),
        (64, 128, 64),
        (64, 64),
    ]
    metrics_histories = []
    labels = []
    results = []

    # Hidden-layer ablation: train one model per depth.
    for hidden_layers in hidden_layer_configurations:
        model, metrics_history = train_and_record_metrics(X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, X.shape[1], hidden_layers, learning_rate, batch_size)
        metrics_histories.append(metrics_history)
        labels.append(f"{len(hidden_layers)} Hidden layers")

        # Final metrics of the trained model for this configuration.
        y_pred = model.predict(X_test_scaled).flatten()
        mse = mean_squared_error(y_test_scaled, y_pred)
        mae = mean_absolute_error(y_test_scaled, y_pred)
        r2 = r2_score(y_test_scaled, y_pred)
        results.append({
            "Hidden Layers": len(hidden_layers),
            "MSE": mse,
            "MAE": mae,
            "R²": r2
        })

    # Plot the per-epoch curves of all configurations on shared axes.
    plot_metrics(metrics_histories, labels)

    results_df = pd.DataFrame(results)
    print(results_df)
    return results_df

In [None]:
if __name__ == "__main__":
    # Weather features plus the target column ('Level') used for the study.
    selected_columns = [
        'Windspeed',
        'Humidity',
        'Temperature',
        'Dewpoint',
        'Pressure',
        'Reading',
        'Wind direction',
        'Level',
    ]
    print("Performing Hidden Layer Ablation Study...")
    ablation_layer_study(selected_columns)