<a href="https://colab.research.google.com/github/ferdouszislam/Weather-WaterLevel-Prediction-ML/blob/main/Notebooks/brri-dataset/experimentations/randomForest_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# # the imports in this cell are required when running on local device
# import os, sys
# sys.path.append(os.path.join('..', '..'))
# from utils.applyML_util import train_regression, eval_regression
# from utils.featureSelection_util import (pearson_correlation_fs, 
#                                          seleckKBest_fs, selectSequential_fs)

In [2]:
# the imports in this cell are required when running from Cloud (Colab/Kaggle)
# before running on cloud you need to upload the .py files
# from 'Notebooks/utils' directory
from applyML_util import train_regression, eval_regression
from featureSelection_util import (pearson_correlation_fs, 
                                   seleckKBest_fs, selectSequential_fs)

**Random Forest Documentation link:** https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor

In [4]:
# Global random seed, passed as `random_state` to every estimator in this
# notebook so repeated runs build the same forests.
RAND_SEED = 42

# Hyper-parameter search space handed to train_regression().
# n_estimators: every forest size from 1 to 50 trees (inclusive).
n_estimators = list(range(1, 51))
param_grid = {
    'n_estimators': n_estimators,
    # 1.0 -> consider all features at each split. This is what 'auto' meant for
    # RandomForestRegressor; the 'auto' string was deprecated in scikit-learn 1.1
    # and removed in 1.3, so it now fails parameter validation.
    # 'sqrt' -> consider sqrt(n_features) features at each split.
    'max_features': [1.0, 'sqrt']
}

## 1. Experimentation on the Weather Daily dataset

In [5]:
# Daily weather data, regression variant: training split.
# Read straight from the project's GitHub repository, so this cell needs network access.
weather_daily_train_df = pd.read_csv(
    'https://raw.githubusercontent.com/ferdouszislam/Weather-WaterLevel-Prediction-ML/main/Datasets/brri-datasets/final-dataset/train/brri-weather_train_regression.csv'
)

# Daily weather data, regression variant: held-out test split.
weather_daily_test_df = pd.read_csv(
    'https://raw.githubusercontent.com/ferdouszislam/Weather-WaterLevel-Prediction-ML/main/Datasets/brri-datasets/final-dataset/test/brri-weather_test_regression.csv'
)

In [6]:
# Hand a fresh, seeded Random Forest (all CPU cores) to the project's
# train_regression helper together with the search grid and the daily train set.
# The helper returns five values: the model, the chosen hyper-parameters and
# three train-set metrics.
# NOTE(review): presumably a search over param_grid - confirm in applyML_util.
model, selected_hyperparams, train_r2, train_mae, train_rmse = train_regression(
    RandomForestRegressor(random_state=RAND_SEED, n_jobs=-1),
    param_grid,
    weather_daily_train_df,
    cls='Rainfall (mm)',
)

# Report what was picked and how it scores on the data it was trained on.
print(f'Selected hyperparameters: {selected_hyperparams}')
print(f'Train set performance: r2-score={train_r2}, mae={train_mae}, rmse={train_rmse}')

In [7]:
# Score the model returned by train_regression on the daily test split;
# the helper's three return values are unpacked as r2, mae and rmse.
test_r2, test_mae, test_rmse = eval_regression(
    model,
    weather_daily_test_df,
    cls='Rainfall (mm)',
)

# Report the held-out metrics.
print(f'Test set performance: r2-score={test_r2}, mae={test_mae}, rmse={test_rmse}')

### 1.1 Apply Pearson Feature Selection to Daily Weather Dataset

In [8]:
# Run the project's Pearson-correlation feature selection on the daily train set
# only, with 'Rainfall (mm)' as the target. It returns the reduced train frame
# plus the columns it decided to drop.
weather_daily_fs_train_df, cols_to_drop = pearson_correlation_fs(
    weather_daily_train_df,
    'Rainfall (mm)',
)

# Mirror the same column removal on the test set so both frames share a schema.
weather_daily_fs_test_df = weather_daily_test_df.drop(labels=cols_to_drop, axis='columns')

In [9]:
# Same training procedure as before, now on the feature-selected daily train set:
# a fresh, seeded Random Forest (all CPU cores) goes to train_regression with the
# search grid. Five values come back: the model, the chosen hyper-parameters and
# three train-set metrics.
model, selected_hyperparams, train_r2, train_mae, train_rmse = train_regression(
    RandomForestRegressor(random_state=RAND_SEED, n_jobs=-1),
    param_grid,
    weather_daily_fs_train_df,
    cls='Rainfall (mm)',
)

# Report what was picked and how it scores on the data it was trained on.
print(f'Selected hyperparameters: {selected_hyperparams}')
print(f'Train set performance: r2-score={train_r2}, mae={train_mae}, rmse={train_rmse}')

In [10]:
# Score the model returned by train_regression on the feature-selected daily
# test split; the three return values are unpacked as r2, mae and rmse.
test_r2, test_mae, test_rmse = eval_regression(
    model,
    weather_daily_fs_test_df,
    cls='Rainfall (mm)',
)

# Report the held-out metrics.
print(f'Test set performance: r2-score={test_r2}, mae={test_mae}, rmse={test_rmse}')

## 2. Experimentation on the Weather Weekly dataset

In [11]:
# Weekly weather data (the "avg" CSVs), regression variant: training split.
# Read straight from the project's GitHub repository, so this cell needs network access.
weather_weekly_train_df = pd.read_csv(
    'https://raw.githubusercontent.com/ferdouszislam/Weather-WaterLevel-Prediction-ML/main/Datasets/brri-datasets/final-dataset/train/brri-weather_avg_train_regression.csv'
)

# Weekly weather data, regression variant: held-out test split.
weather_weekly_test_df = pd.read_csv(
    'https://raw.githubusercontent.com/ferdouszislam/Weather-WaterLevel-Prediction-ML/main/Datasets/brri-datasets/final-dataset/test/brri-weather_avg_test_regression.csv'
)

In [12]:
# Hand a fresh, seeded Random Forest (all CPU cores) to the project's
# train_regression helper together with the search grid and the weekly train set.
# Five values come back: the model, the chosen hyper-parameters and three
# train-set metrics.
model, selected_hyperparams, train_r2, train_mae, train_rmse = train_regression(
    RandomForestRegressor(random_state=RAND_SEED, n_jobs=-1),
    param_grid,
    weather_weekly_train_df,
    cls='Rainfall (mm)',
)

# Report what was picked and how it scores on the data it was trained on.
print(f'Selected hyperparameters: {selected_hyperparams}')
print(f'Train set performance: r2-score={train_r2}, mae={train_mae}, rmse={train_rmse}')

In [13]:
# Score the model returned by train_regression on the weekly test split;
# the helper's three return values are unpacked as r2, mae and rmse.
test_r2, test_mae, test_rmse = eval_regression(
    model,
    weather_weekly_test_df,
    cls='Rainfall (mm)',
)

# Report the held-out metrics.
print(f'Test set performance: r2-score={test_r2}, mae={test_mae}, rmse={test_rmse}')

### 2.1 Apply Pearson Feature Selection to Weekly Weather Dataset

In [14]:
# Run the project's Pearson-correlation feature selection on the weekly train set
# only, with 'Rainfall (mm)' as the target. It returns the reduced train frame
# plus the columns it decided to drop.
weather_weekly_fs_train_df, cols_to_drop = pearson_correlation_fs(
    weather_weekly_train_df,
    'Rainfall (mm)',
)

# Mirror the same column removal on the test set so both frames share a schema.
weather_weekly_fs_test_df = weather_weekly_test_df.drop(labels=cols_to_drop, axis='columns')

In [15]:
# Same training procedure, now on the feature-selected weekly train set:
# a fresh, seeded Random Forest (all CPU cores) goes to train_regression with the
# search grid. Five values come back: the model, the chosen hyper-parameters and
# three train-set metrics.
model, selected_hyperparams, train_r2, train_mae, train_rmse = train_regression(
    RandomForestRegressor(random_state=RAND_SEED, n_jobs=-1),
    param_grid,
    weather_weekly_fs_train_df,
    cls='Rainfall (mm)',
)

# Report what was picked and how it scores on the data it was trained on.
print(f'Selected hyperparameters: {selected_hyperparams}')
print(f'Train set performance: r2-score={train_r2}, mae={train_mae}, rmse={train_rmse}')

In [16]:
# Score the model returned by train_regression on the feature-selected weekly
# test split; the three return values are unpacked as r2, mae and rmse.
test_r2, test_mae, test_rmse = eval_regression(
    model,
    weather_weekly_fs_test_df,
    cls='Rainfall (mm)',
)

# Report the held-out metrics.
print(f'Test set performance: r2-score={test_r2}, mae={test_mae}, rmse={test_rmse}')