In [None]:
# All the imports are imported here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau
import matplotlib.pyplot as plt

In [None]:
# This function deals with loading the data and doing some minor operations 
def load_and_preprocess_data(file_path, selected_columns):
    """Load a CSV, keep the selected columns and clean the 'Pressure' column.

    Rows with a missing value in any selected column are dropped. A Pressure
    reading of exactly 0 is treated as a missing measurement and is replaced
    by the mean of the remaining (non-zero) Pressure readings.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.
        selected_columns: Column names to keep; must include 'Pressure'.

    Returns:
        A new DataFrame restricted to ``selected_columns`` without NaN rows.
        If at least one non-zero Pressure reading exists, no zero Pressure
        values remain; if every reading is zero the column is left unchanged.
    """
    data = pd.read_csv(file_path)
    data = data[selected_columns].dropna()

    zero_mask = data['Pressure'] == 0
    # Exclude the zero placeholders from the mean: including them would drag
    # the replacement value towards 0.
    valid_pressure = data.loc[~zero_mask, 'Pressure']
    pressure_mean = valid_pressure.mean()
    print("Mean Pressure ", pressure_mean)
    print(data[zero_mask].count())

    if zero_mask.any() and not valid_pressure.empty:
        # Cast explicitly so that writing a float mean into an integer column
        # does not depend on implicit upcasting.
        data = data.astype({'Pressure': float})
        data.loc[zero_mask, 'Pressure'] = pressure_mean

    print(data[data['Pressure'] == 0].count())
    return data

In [None]:
# This function splits the data into feature set and target
def split_features_target(data, target_column):
    """Separate a DataFrame into its feature columns and its target column.

    Args:
        data: DataFrame holding both the features and the target.
        target_column: Name of the column to use as the target.

    Returns:
        ``(X, y)`` where ``X`` is ``data`` without ``target_column`` and ``y``
        is the ``target_column`` Series. ``data`` itself is not modified.
    """
    target = data[target_column]
    features = data.drop(columns=[target_column])
    return features, target

In [None]:
def split_train_test(X, y, test_size=0.2, random_state=420):
    """Split features and target into train and test partitions.

    Thin wrapper around ``train_test_split`` with fixed defaults so the split
    is repeatable between runs.

    Args:
        X: Feature set.
        y: Target values aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` (default 420).

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return features_fit, features_eval, target_fit, target_eval

In [None]:
# Function to scale the data
def scale_data(X_train, X_test, y_train, y_test):
    """Scale features with a StandardScaler and the target with a MinMaxScaler.

    Both scalers are fitted on the training partition only and then applied
    to the test partition.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training target; must expose ``.values`` (e.g. a Series).
        y_test: Test target; must expose ``.values`` (e.g. a Series).

    Returns:
        ``(X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled,
        feature_scaler, target_scaler)``; the scaled targets are flattened
        back to 1-D arrays.
    """
    def _as_column(target):
        # The scaler is fed a single-column 2-D array.
        return target.values.reshape(-1, 1)

    feature_scaler = StandardScaler()
    scaled_features = (
        feature_scaler.fit_transform(X_train),
        feature_scaler.transform(X_test),
    )

    target_scaler = MinMaxScaler()
    scaled_targets = (
        target_scaler.fit_transform(_as_column(y_train)).flatten(),
        target_scaler.transform(_as_column(y_test)).flatten(),
    )

    return (*scaled_features, *scaled_targets, feature_scaler, target_scaler)

In [None]:
# The model is built in this block of code
def build_model(input_dim, hidden_layer_sizes=(64, 128, 128, 64, 32, 8), learning_rate_init=0.004):
    """Build and compile a fully connected regression network.

    The network is an explicit input of width ``input_dim``, one ReLU Dense
    layer per entry of ``hidden_layer_sizes`` and a single linear output unit.
    It is compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        input_dim: Number of input features.
        hidden_layer_sizes: Width of each hidden layer, in order. May be
            empty, in which case the model is just the linear output layer.
        learning_rate_init: Initial learning rate for Adam.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # Declare the input shape explicitly instead of passing `input_dim` to the
    # first Dense layer; this also removes the need to index
    # hidden_layer_sizes[0], which failed for an empty tuple.
    model.add(tf.keras.Input(shape=(input_dim,)))
    for layer_size in hidden_layer_sizes:
        model.add(Dense(layer_size, activation='relu'))
    model.add(Dense(1))
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_init)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

In [None]:
# MetricsHistory is a Keras callback whose on_epoch_end method stores the MSE and R² at the end of each epoch.
class MetricsHistory(Callback):
    """Callback that records MSE and R² on a fixed data set after every epoch.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, X_test, y_test):
        """Store the evaluation data and initialise empty histories.

        Args:
            X_test: Inputs passed to ``model.predict`` at the end of each epoch.
            y_test: Reference targets compared against the predictions.
        """
        super().__init__()
        self.X_test = X_test
        self.y_test = y_test
        self.mse_history = []  # one MSE value per completed epoch
        self.r2_history = []   # one R² value per completed epoch
        self.y_pred = []       # predictions from the most recent epoch

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the stored inputs and append the epoch's MSE and R²."""
        # verbose=0: without it predict() reports its own progress on every
        # epoch even when fit() itself is run silently.
        y_pred = self.model.predict(self.X_test, verbose=0).flatten()
        self.y_pred = y_pred
        mse = mean_squared_error(self.y_test, y_pred)
        r2 = r2_score(self.y_test, y_pred)
        self.mse_history.append(mse)
        self.r2_history.append(r2)

In [None]:
def train_and_record_metrics(X_train_scaled, y_train, X_test_scaled, y_test, input_dim, hidden_layer_sizes, learning_rate,
                             epochs=300, batch_size=256):
    """Build a model, train it and record per-epoch metrics.

    Args:
        X_train_scaled: Training features.
        y_train: Training targets.
        X_test_scaled: Features used both as Keras validation data and by the
            MetricsHistory callback.
        y_test: Targets matching ``X_test_scaled``.
        input_dim: Number of input features, forwarded to ``build_model``.
        hidden_layer_sizes: Hidden layer widths, forwarded to ``build_model``.
        learning_rate: Initial learning rate, forwarded to ``build_model``.
        epochs: Number of training epochs (default 300).
        batch_size: Mini-batch size (default 256).

    Returns:
        ``(metrics_history, model)``: the MetricsHistory callback holding the
        per-epoch MSE/R² lists, and the trained model.
    """
    # Initialise the model with the given parameters.
    model = build_model(input_dim=input_dim, hidden_layer_sizes=hidden_layer_sizes, learning_rate_init=learning_rate)
    # Callback that stores MSE and R² after every epoch.
    metrics_history = MetricsHistory(X_test_scaled, y_test)
    # Halve the learning rate (down to 1e-5) when val_loss has not improved
    # for 5 epochs.
    # NOTE(review): val_loss is computed on the data passed as validation_data
    # below, i.e. the same split the metrics are reported on; consider a
    # separate validation split so the schedule is not tuned on that data.
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5, verbose=1)
    # Train the model.
    model.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test),
              epochs=epochs, batch_size=batch_size, callbacks=[metrics_history, lr_scheduler], verbose=0)

    return metrics_history, model

In [None]:
# This function computes the two boundaries that split a feature's [min, max] interval into three equal-width ranges
def create_ranges(df, column):
    """Return the two cut points that divide a column's span into thirds.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to analyse.

    Returns:
        ``(lower_bound, upper_bound)`` where ``lower_bound`` is the minimum
        plus one third of ``max - min`` and ``upper_bound`` is one further
        third above ``lower_bound``.
    """
    values = df[column]
    lowest = values.min()
    third = (values.max() - lowest) / 3
    first_cut = lowest + third
    return first_cut, first_cut + third

In [None]:
# In this function, the observations are split into categories (Lower / Midrange / Higher) based on the range in which each feature value lies.
def categorize_data(X_test, ranges):
    """Assign every row label of ``X_test`` to a Lower/Midrange/Higher bucket per feature.

    For each feature in ``ranges`` a value ``v`` goes to 'Lower' when
    ``v <= lower_bound``, to 'Midrange' when ``lower_bound < v <= upper_bound``
    and to 'Higher' otherwise (NaN values therefore end up in 'Higher',
    because both comparisons are False for them). The number of observations
    in each bucket is printed.

    Args:
        X_test: DataFrame containing every column named in ``ranges``.
        ranges: Mapping ``column -> (lower_bound, upper_bound)``.

    Returns:
        ``{column: {'Lower': [...], 'Midrange': [...], 'Higher': [...]}}``
        where each list holds the index labels of ``X_test`` in that bucket,
        in the order in which they appear in ``X_test``.
    """
    categories = {}
    for col, (lower_bound, upper_bound) in ranges.items():
        values = X_test[col]
        # Vectorised comparisons over the whole column replace the former
        # row-by-row iterrows() loop.
        is_lower = (values <= lower_bound).to_numpy()
        is_midrange = ~is_lower & (values <= upper_bound).to_numpy()
        is_higher = ~(is_lower | is_midrange)
        categories[col] = {
            'Lower': X_test.index[is_lower].tolist(),
            'Midrange': X_test.index[is_midrange].tolist(),
            'Higher': X_test.index[is_higher].tolist(),
        }

    for col, buckets in categories.items():
        print(f"Feature: {col}")
        for category, indices in buckets.items():
            print(f"  {category}: {len(indices)} observations")

    return categories

In [None]:
# This function just evaluates the model on the different categories of data segregated
def evaluate_model_on_categories(model, X_test, X_test_scaled, y_test_scaled, categories):
    """Compute MSE and R² of the model separately for every feature bucket.

    Args:
        model: Object with a ``predict`` method whose result has ``flatten()``.
        X_test: Unscaled test DataFrame; only its index is used, to translate
            the index labels stored in ``categories`` into row positions.
        X_test_scaled: 2-D array of scaled features, row-aligned with ``X_test``.
        y_test_scaled: 1-D array of scaled targets, row-aligned with ``X_test``.
        categories: ``{feature: {'Lower': [...], 'Midrange': [...],
            'Higher': [...]}}`` with index labels of ``X_test``.

    Returns:
        ``{feature: {bucket: (mse, r2)}}``; a bucket without observations
        keeps an empty list instead of a tuple. A summary table of all
        evaluated buckets is printed.
    """
    bucket_names = ('Lower', 'Midrange', 'Higher')
    results = {feature: {bucket: [] for bucket in bucket_names} for feature in categories}
    summary_rows = []

    for feature, buckets in categories.items():
        for bucket in bucket_names:
            labels = buckets[bucket]
            if not labels:
                # Nothing to evaluate for an empty bucket.
                continue

            # Row positions of the bucket's index labels within the test set.
            positions = X_test.index.get_indexer(labels)
            bucket_inputs = X_test_scaled[positions, :]
            bucket_targets = y_test_scaled[positions]
            bucket_predictions = model.predict(bucket_inputs).flatten()

            mse = mean_squared_error(bucket_targets, bucket_predictions)
            r2 = r2_score(bucket_targets, bucket_predictions)
            results[feature][bucket] = (mse, r2)
            summary_rows.append({'Feature': feature, 'Category': bucket, 'MSE': mse, 'R2': r2})

    summary_df = pd.DataFrame(summary_rows)
    print(summary_df)
    return results

In [None]:
# This function plots bar charts of the MSE and R² for the three categories of data for each of the analysed features
def plot_histograms(results):
    """Draw side-by-side MSE and R² bar charts for every feature.

    One figure is shown per feature: the left panel holds the MSE of each
    category, the right panel the R² score.

    Args:
        results: ``{feature: {category: (mse, r2)}}``. A category whose entry
            is empty (no observations were evaluated) is plotted as NaN, so
            no bar is drawn for it.
    """
    nan = float('nan')
    bar_colors = ['blue', 'orange', 'green']

    for col, metrics in results.items():
        categories = list(metrics.keys())
        # Categories without observations are stored as an empty list;
        # indexing them with [0] / [1] used to raise IndexError.
        scores = [metrics[cat] if metrics[cat] else (nan, nan) for cat in categories]
        mse_values = [score[0] for score in scores]
        r2_values = [score[1] for score in scores]

        fig, (ax_mse, ax_r2) = plt.subplots(1, 2, figsize=(12, 5))

        ax_mse.bar(categories, mse_values, color=bar_colors)
        ax_mse.set_xlabel('Category', fontsize=18)
        ax_mse.set_ylabel('MSE', fontsize=18)
        ax_mse.set_title(f'{col} - MSE', fontsize=18)

        ax_r2.bar(categories, r2_values, color=bar_colors)
        ax_r2.set_xlabel('Category', fontsize=18)
        ax_r2.set_ylabel('R² Score', fontsize=18)
        ax_r2.set_title(f'{col} - R² Score', fontsize=18)

        fig.tight_layout()
        plt.show()

In [None]:
# Driver cell: load the data, train the network and report per-range metrics.
if __name__ == "__main__":
    # Columns read from the CSV; 'Level' is the prediction target, the rest are features.
    selected_columns = ['Windspeed', 'Humidity', 'Temperature', 'Dewpoint', 'Pressure', 'Reading', 'Wind direction', 'Level']

    # Load the CSV, keep the selected columns and clean the Pressure column.
    merged_df = load_and_preprocess_data('./Datasets/2021-Ballymun.csv', selected_columns)
    # Split the data into the feature set and the target.
    original_X, original_y = split_features_target(merged_df, 'Level')
    # Split features and target into training and testing partitions.
    original_X_train, original_X_test, original_y_train, original_y_test = split_train_test(original_X, original_y)
    # Scale the data: StandardScaler for the features, MinMaxScaler for the target.
    original_X_train_scaled, original_X_test_scaled, original_y_train_scaled, original_y_test_scaled, feature_scaler, target_scaler = scale_data(original_X_train, original_X_test, original_y_train, original_y_test)
    # Model parameters are defined below; they override the defaults of build_model.

    learning_rate = 0.001
    hidden_layer_sizes = (64, 128, 128, 64)
    # Train on the scaled training data; the scaled test data is used for the per-epoch metrics.
    metrics_history, model = train_and_record_metrics(original_X_train_scaled, original_y_train_scaled, original_X_test_scaled,
                                                      original_y_test_scaled, original_X.shape[1], hidden_layer_sizes, learning_rate)

    # Features whose value range is split into three categories for the error analysis.
    columns_to_analyze = ['Wind direction', 'Windspeed','Humidity', 'Temperature']
    # The range boundaries are derived from the unscaled test partition.
    ranges = {col: create_ranges(original_X_test, col) for col in columns_to_analyze}
    categories = categorize_data(original_X_test, ranges)
    # Metrics are computed on the scaled target, so the MSE is in scaled units.
    results = evaluate_model_on_categories(model, original_X_test, original_X_test_scaled, original_y_test_scaled, categories)
    plot_histograms(results)