In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
from sklearn.model_selection import GridSearchCV
from math import sqrt

In [None]:
# Load the CSV, keep the requested columns and repair invalid pressure readings
def load_and_preprocess_data(file_path, selected_columns):
    """Read a CSV and return a cleaned frame restricted to ``selected_columns``.

    Processing steps:
      1. Read ``file_path`` with ``pandas.read_csv``.
      2. Keep only ``selected_columns`` and drop every row that contains a NaN.
      3. Replace every ``Pressure`` value recorded as exactly 0 with the mean
         of the remaining (non-zero) pressure readings.

    The zero rows are excluded when the mean is computed; otherwise the very
    values being replaced would pull the fill value towards zero.

    Diagnostic output is printed: the mean used for filling, and the per-column
    count of zero-pressure rows before and after the replacement.

    Args:
        file_path: Anything accepted by ``pandas.read_csv`` (path or buffer).
        selected_columns: Column labels to keep. Must contain ``'Pressure'``.

    Returns:
        pandas.DataFrame: The cleaned data. ``Pressure`` is cast to float when
        at least one zero reading was replaced.

    Raises:
        KeyError: If a selected column (or ``'Pressure'``) is not present.
    """
    data = pd.read_csv(file_path)
    data = data[selected_columns].dropna()

    zero_pressure = data['Pressure'] == 0
    # Mean of the valid readings only (NaN when every reading is zero).
    pressure_mean = data.loc[~zero_pressure, 'Pressure'].mean()
    print("Mean Pressure ", pressure_mean)
    print(data[zero_pressure].count())

    # Without any valid reading there is nothing sensible to fill with, so the
    # zeros are left untouched rather than being turned into NaN.
    if zero_pressure.any() and pd.notna(pressure_mean):
        # Cast first so a fractional mean is not written into an integer column.
        data['Pressure'] = data['Pressure'].astype(float)
        data.loc[zero_pressure, 'Pressure'] = pressure_mean

    print(data[data['Pressure'] == 0].count())
    return data

In [None]:
# Separate the predictor columns from the column to be predicted
def split_features_target(data, target_column):
    """Split a frame into a feature matrix and a target.

    Args:
        data: pandas.DataFrame holding both the features and the target.
        target_column: Label of the column to predict.

    Returns:
        tuple: ``(features, target)`` where ``features`` is ``data`` without
        ``target_column`` and ``target`` is ``data[target_column]``.
        ``data`` itself is not modified.
    """
    features = data.drop(columns=target_column)
    target = data[target_column]
    return features, target

In [None]:

def split_train_test(X, y, test_size=0.3, random_state=420):
    """Split features and target into train and test partitions.

    Thin wrapper around ``train_test_split`` that pins the notebook's default
    hold-out fraction and seed.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` (default 0.3).
        random_state: Seed forwarded to ``train_test_split`` (default 420).

    Returns:
        The result of ``train_test_split``, unpacked by the caller as
        ``X_train, X_test, y_train, y_test``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    return train_test_split(X, y, **split_options)

In [None]:
# Standardise the features and min-max scale the target
def scale_data(X_train, X_test, y_train, y_test):
    """Scale features and target using statistics from the training split only.

    Features go through a ``StandardScaler`` and the target through a
    ``MinMaxScaler``. Both scalers are fitted on the training data and then
    applied to the test data, so no test-set statistics are used for fitting.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training target; must expose ``.values`` (e.g. a pandas Series).
        y_test: Test target; must expose ``.values``.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled,
        feature_scaler, target_scaler)``. The scaled targets are flattened to
        1-D; the fitted scalers are returned so the transforms can be reused
        or inverted later.
    """
    def as_column(target):
        # The scalers work on 2-D input, so present the target as one column.
        return target.values.reshape(-1, 1)

    feature_scaler = StandardScaler()
    train_features = feature_scaler.fit_transform(X_train)
    test_features = feature_scaler.transform(X_test)

    target_scaler = MinMaxScaler()
    train_target = target_scaler.fit_transform(as_column(y_train)).flatten()
    test_target = target_scaler.transform(as_column(y_test)).flatten()

    return (
        train_features,
        test_features,
        train_target,
        test_target,
        feature_scaler,
        target_scaler,
    )

In [None]:
# Columns kept from the raw CSV: the predictors plus the 'Level' target.
selected_columns = [
    'Windspeed',
    'Humidity',
    'Temperature',
    'Dewpoint',
    'Pressure',
    'Reading',
    'Wind direction',
    'Level',
]

# Load and clean the data, then separate the target from the predictors.
merged_df = load_and_preprocess_data('./Datasets/2023-Kippure.csv', selected_columns)
X, y = split_features_target(merged_df, 'Level')

# Hold out a test set, then scale using statistics from the training split.
X_train, X_test, y_train, y_test = split_train_test(X, y)
(
    X_train_scaled,
    X_test_scaled,
    y_train_scaled,
    y_test_scaled,
    feature_scaler,
    target_scaler,
) = scale_data(X_train, X_test, y_train, y_test)

In [None]:
# Hyper-parameter search space. GridSearchCV evaluates every combination listed
# here and selects the one with the best cross-validated score.
param_grid = {
    'n_estimators': [50, 75, 100, 150, 200],
    'max_depth': [12, 18, 24],
    'min_samples_split': [15, 20],
    'min_samples_leaf': [12, 18, 22],
    # None considers every feature at each split. 'sqrt' and 'log2' are other
    # values worth trying; 'auto' is not accepted by recent scikit-learn releases.
    'max_features': [None],
}

# Base estimator; the fixed seed keeps the search reproducible.
rf_regressor = RandomForestRegressor(random_state=42)

# 5-fold cross-validation scored by negated MSE (scikit-learn maximises scores,
# so the error is negated). refit is left at its default of True, meaning the
# best configuration is retrained on the full training set after the search.
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=5, n_jobs=1, scoring='neg_mean_squared_error')

# asarray guarantees ndarray inputs without copying data that already is one.
X_train_np = np.asarray(X_train_scaled)
y_train_np = np.asarray(y_train_scaled)

# Run the search (this is the expensive step).
grid_search.fit(X_train_np, y_train_np)

# Report the winning configuration.
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# The search has already refitted the best configuration on the whole training
# set with the same seed, so reuse that model instead of training it again.
best_rf_regressor = grid_search.best_estimator_

# Test data as NumPy arrays.
X_test_np = np.asarray(X_test_scaled)
y_test_np = np.asarray(y_test_scaled)

# Predictions are in the MinMax-scaled target space.
y_pred = best_rf_regressor.predict(X_test_np)


In [None]:
# Evaluate the model on the held-out test set.
# These two metrics are computed on the MinMax-scaled target, i.e. in the space
# the model was trained in.
mse = mean_squared_error(y_test_scaled, y_pred)
r2 = r2_score(y_test_scaled, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# An error expressed in scaled units is hard to interpret, so map the
# predictions back to the original target units and report RMSE there as well.
y_pred_original = target_scaler.inverse_transform(np.asarray(y_pred).reshape(-1, 1)).ravel()
rmse_original = sqrt(mean_squared_error(y_test, y_pred_original))
print(f"Root Mean Squared Error (original target units): {rmse_original}")