In [None]:
# Importing the necessary libraries here 
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import HuberRegressor, LassoCV, ElasticNetCV, LassoLarsCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from math import sqrt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LassoLarsIC, LarsCV, Lars, RANSACRegressor, ElasticNet, Lasso, OrthogonalMatchingPursuitCV, PassiveAggressiveRegressor, OrthogonalMatchingPursuit, LassoLars
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.dummy import DummyRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import RidgeCV, BayesianRidge, Ridge

In [None]:
# This function deals with loading the data and doing some minor operations
def load_and_preprocess_data(file_path, selected_columns):
    """Load a CSV, keep the requested columns and repair zero pressure readings.

    Rows with a missing value in any of the selected columns are dropped.
    A ``Pressure`` value of exactly 0 is treated as an invalid reading and is
    replaced by the mean of the remaining non-zero pressure values.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.
    selected_columns : list of str
        Columns to keep. Must contain ``'Pressure'``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with zero pressure readings imputed.

    Notes
    -----
    Prints the imputation mean and the per-column count of zero-pressure rows
    before and after the replacement.
    """
    data = pd.read_csv(file_path)
    data = data[selected_columns].dropna()

    zero_pressure = data['Pressure'] == 0

    # Average only the valid readings: including the placeholder zeros would
    # pull the imputed value below the true mean pressure.
    pressure_mean = data.loc[~zero_pressure, 'Pressure'].mean()
    print("Mean Pressure ", pressure_mean)
    print(data[zero_pressure].count())

    # With no valid reading at all the mean is NaN; leave the column untouched
    # rather than turning every zero into NaN.
    if pd.notna(pressure_mean):
        # mask() returns a new (upcast if necessary) column, so an integer
        # Pressure column can receive the floating-point mean.
        data['Pressure'] = data['Pressure'].mask(zero_pressure, pressure_mean)

    print(data[data['Pressure'] == 0].count())
    return data

In [None]:
# This function splits the data into features and target
def split_features_target(data, target_column):
    """Separate a DataFrame into its predictor columns and its target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the predictors and the target.
    target_column : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is ``data`` without
        ``target_column`` and ``target`` is ``data[target_column]``.
    """
    features = data.drop(columns=target_column)
    target = data[target_column]
    return features, target

In [None]:
# This function splits the data in training and testing
def split_train_test(X, y, test_size=0.3, random_state=420):
    """Split features and target into training and test subsets.

    Thin wrapper around ``train_test_split`` that fixes the notebook's default
    test fraction and seed.

    Parameters
    ----------
    X, y
        Features and target, forwarded unchanged to ``train_test_split``.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, default 420
        Forwarded to ``train_test_split``.

    Returns
    -------
    The value returned by ``train_test_split(X, y, ...)``; the calling cell
    unpacks it as ``X_train, X_test, y_train, y_test``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    return train_test_split(X, y, **split_options)

In [None]:
# This function is used to scale the data
def scale_data(X_train, X_test, y_train, y_test):
    """Scale features with ``StandardScaler`` and targets with ``MinMaxScaler``.

    Both scalers are fitted on the training portion only and then applied to
    the test portion.

    Parameters
    ----------
    X_train, X_test
        Feature sets accepted by ``StandardScaler``.
    y_train, y_test
        Targets exposing a ``.values`` array (the notebook passes the target
        column produced by ``split_features_target``).

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled,
        feature_scaler, target_scaler)``; the scaled targets are flattened
        back to one dimension.
    """
    def _as_column(target):
        # The scaler is given a single-column 2-D array, not a 1-D vector.
        return target.values.reshape(-1, 1)

    feature_scaler = StandardScaler().fit(X_train)
    target_scaler = MinMaxScaler().fit(_as_column(y_train))

    scaled_features = [feature_scaler.transform(part) for part in (X_train, X_test)]
    scaled_targets = [
        target_scaler.transform(_as_column(part)).flatten()
        for part in (y_train, y_test)
    ]

    return (
        scaled_features[0],
        scaled_features[1],
        scaled_targets[0],
        scaled_targets[1],
        feature_scaler,
        target_scaler,
    )

In [None]:
# Columns read from the CSV; 'Level' is the prediction target, the rest are features.
selected_columns = [
    'Windspeed',
    'Humidity',
    'Temperature',
    'Dewpoint',
    'Pressure',
    'Reading',
    'Wind direction',
    'Level',
]

# Load the dataset restricted to the columns listed above.
merged_df = load_and_preprocess_data(
    file_path='./Datasets/2021-Kippure.csv',
    selected_columns=selected_columns,
)

# Separate predictors from the target, then hold out a test set.
X, y = split_features_target(merged_df, target_column='Level')
X_train, X_test, y_train, y_test = split_train_test(X, y)

# Scale features and target; the fitted scalers are kept for later use.
(
    X_train_scaled,
    X_test_scaled,
    y_train_scaled,
    y_test_scaled,
    feature_scaler,
    target_scaler,
) = scale_data(X_train, X_test, y_train, y_test)

In [None]:
# Seed handed to every estimator whose default configuration draws random
# numbers, so that re-running the notebook reproduces the same scores.
# Uses the same value as the default seed of split_train_test.
MODEL_RANDOM_STATE = 420

# Candidate regressors keyed by the display name used in the results table.
models = {
    "Extra Trees Regressor": ExtraTreesRegressor(random_state=MODEL_RANDOM_STATE),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=MODEL_RANDOM_STATE),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=MODEL_RANDOM_STATE),
    "HistGradient Boosting Regressor": HistGradientBoostingRegressor(random_state=MODEL_RANDOM_STATE),
    "K Neighbors Regressor": KNeighborsRegressor(),
    "Bagging Regressor": BaggingRegressor(random_state=MODEL_RANDOM_STATE),
    "Huber Regressor": HuberRegressor(),
    "Transformed Target Regressor": TransformedTargetRegressor(regressor=HuberRegressor(), transformer=StandardScaler()),
    "LassoCV": LassoCV(),
    "ElasticNetCV": ElasticNetCV(),
    "LassoLarsCV": LassoLarsCV(),
    "LassoLarsIC": LassoLarsIC(),
    "LarsCV": LarsCV(),
    "Lars": Lars(),
    "RANSAC Regressor": RANSACRegressor(random_state=MODEL_RANDOM_STATE),
    "ElasticNet": ElasticNet(),
    "Lasso": Lasso(),
    "Orthogonal Matching Pursuit CV": OrthogonalMatchingPursuitCV(),
    "Orthogonal Matching Pursuit": OrthogonalMatchingPursuit(),
    "Dummy Regressor": DummyRegressor(),
    "Lasso Lars": LassoLars(),
}




In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    The metrics are computed on the values exactly as they are passed in, so
    they are expressed in whatever scale the caller's targets use (the
    notebook passes scaled targets).

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.
    X_train, y_train
        Training features and targets.
    X_test, y_test
        Held-out features and the matching true targets.

    Returns
    -------
    tuple
        ``(rmse, mse, r2)`` of the predictions for ``X_test`` against ``y_test``.
    """
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Score against the y_test argument rather than a notebook-level variable,
    # so the result depends only on what the caller passed in.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    return rmse, mse, r2

In [None]:
# Fit and score every candidate model, printing a running count as progress.
results = {}
c = 0
for c, (name, model) in enumerate(models.items(), start=1):
    scores = evaluate_model(model, X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled)
    results[name] = dict(zip(("RMSE", "MSE", "R2"), scores))
    print(c)

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)