In [1]:
# the imports in this cell are required when running from Cloud (Colab/Kaggle)
# before running on cloud you need to upload the .py files
# from 'Notebooks/utils' directory
from applyML_util import train_regression, eval_regression, showEvalutationGraph_regression
from featureSelection_util import (pearson_correlation_fs, 
                                   seleckKBest_fs, selectSequential_fs)

# Zero-Inflated Regressor (XGBoost classifier + Random Forest regressor)

**Random Forest Documentation link:** https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

**XGBoost Documentation link:** https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier

**SK-Lego Documentation link:** https://scikit-lego.netlify.app/meta.html

In [None]:
# Install the package that supplies the `sklego` module imported in the next
# cell (needed on a fresh Colab/Kaggle runtime).
# NOTE(review): the SK-Lego docs linked above name the project "scikit-lego";
# confirm that the `sklego` name on PyPI resolves to that distribution, and
# consider pinning a version for reproducibility — TODO verify.
!pip install sklego

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklego.meta import ZeroInflatedRegressor #!pip install sklego
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestRegressor


In [4]:
# Single seed shared by both sub-estimators so reruns give the same results
RAND_SEED = 42

# Composite estimator from sklego.meta: an XGBoost classifier paired with a
# Random Forest regressor (see the SK-Lego documentation link for how the two
# stages are combined).
ziReg = ZeroInflatedRegressor(
    classifier=XGBClassifier(random_state=RAND_SEED, tree_method='hist'),
    regressor=RandomForestRegressor(random_state=RAND_SEED, n_jobs=-1),
)

# Starting model passed to train_regression(): seeded, with no hyper-parameters
# tuned yet
initial_model = ziReg

# ---- hyper-parameter search space ----

# Number of trees: 100, 105, ..., 200 (21 values).
# Note: this same list is used for BOTH the classifier and the regressor below.
n_estimators = list(range(100, 201, 5))
# Random Forest regressor
max_features = ['sqrt']
# XGBoost classifier
subsample = [0.5]

# Keys follow the `<sub-estimator>__<parameter>` naming reported by
# ziReg.get_params()
param_grid = {
    'classifier__subsample': subsample,
    'classifier__n_estimators': n_estimators,
    'regressor__max_features': max_features,
    'regressor__n_estimators': n_estimators,
}

# Disabled: inputs for showEvalutationGraph_regression().
# NOTE(review): 'regressor__C' does not appear among this estimator's
# parameters, so these placeholders look carried over from another notebook —
# update before re-enabling.
# MODEL_CLASS = ziReg
# x_axis_param_name = 'regressor__C'
# x_axis_vals = regressor__C

In [5]:
# Show every parameter name exposed by the composite estimator; the
# `classifier__*` / `regressor__*` entries are the key names used in param_grid.
ziReg.get_params().keys()

dict_keys(['classifier__base_score', 'classifier__booster', 'classifier__colsample_bylevel', 'classifier__colsample_bynode', 'classifier__colsample_bytree', 'classifier__gamma', 'classifier__learning_rate', 'classifier__max_delta_step', 'classifier__max_depth', 'classifier__min_child_weight', 'classifier__missing', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__nthread', 'classifier__objective', 'classifier__random_state', 'classifier__reg_alpha', 'classifier__reg_lambda', 'classifier__scale_pos_weight', 'classifier__seed', 'classifier__silent', 'classifier__subsample', 'classifier__verbosity', 'classifier__tree_method', 'classifier', 'regressor__bootstrap', 'regressor__ccp_alpha', 'regressor__criterion', 'regressor__max_depth', 'regressor__max_features', 'regressor__max_leaf_nodes', 'regressor__max_samples', 'regressor__min_impurity_decrease', 'regressor__min_samples_leaf', 'regressor__min_samples_split', 'regressor__min_weight_fraction_leaf', 'regressor__n_estimators'

## 1. Experimentation on the Weather Daily dataset

In [6]:
# Daily weather regression CSVs, fetched straight from the project's GitHub repo
weather_daily_train_url = 'https://raw.githubusercontent.com/ferdouszislam/Weather-WaterLevel-Prediction-ML/main/Datasets/brri-datasets/final-dataset/train/brri-weather_train_regression.csv'
weather_daily_test_url = 'https://raw.githubusercontent.com/ferdouszislam/Weather-WaterLevel-Prediction-ML/main/Datasets/brri-datasets/final-dataset/test/brri-weather_test_regression.csv'

# Read the train split first, then the test split
weather_daily_train_df = pd.read_csv(weather_daily_train_url)
weather_daily_test_df = pd.read_csv(weather_daily_test_url)

In [7]:
# Fit on the full-feature train set; train_regression() returns the fitted
# model, the hyper-parameters it selected from param_grid, and three scores.
(model, selected_hyperparams,
 train_r2, train_mae, train_rmse) = train_regression(
    initial_model,
    param_grid,
    weather_daily_train_df,
    cls='Rainfall (mm)',
)

# Report the chosen hyper-parameters, then the train-set performance
print(
    f'Selected hyperparameters: {selected_hyperparams}',
    f'Train set performance: r2-score={train_r2}, mae={train_mae}, rmse={train_rmse}',
    sep='\n',
)

Selected hyperparameters: {'classifier__n_estimators': 130, 'classifier__subsample': 0.5, 'regressor__max_features': 'sqrt', 'regressor__n_estimators': 100}
Train set performance: r2-score=0.2419, mae=5.5242, rmse=12.9599


In [8]:
# # r2-scores graph on the train set

# # hyper-parameters selected by GridSearchCV
# selected_model_params = selected_hyperparams
# #selected_model_params['random_state'] = RAND_SEED

# showEvalutationGraph_regression(MODEL_CLASS, weather_daily_train_df, cls='Rainfall (mm)', 
#                                 x_axis_param_name=x_axis_param_name, x_axis_param_vals=x_axis_vals, 
#                                 selected_model_params=selected_model_params)

In [9]:
# Score the model fitted in the preceding training cell on the held-out test set
(test_r2,
 test_mae,
 test_rmse) = eval_regression(
    model,
    weather_daily_test_df,
    cls='Rainfall (mm)',
)

# Report test-set performance
print(
    f'Test set performance: r2-score={test_r2}, mae={test_mae}, rmse={test_rmse}'
)

Test set performance: r2-score=0.1561, mae=5.9087, rmse=15.8841


### 1.1 Apply Pearson Feature Selection to Daily Weather Dataset

In [10]:
# Pearson-correlation feature selection is computed on the train split only;
# the helper returns the reduced train frame and the names of dropped columns.
weather_daily_fs1_train_df, cols_to_drop = pearson_correlation_fs(
    weather_daily_train_df,
    'Rainfall (mm)',
)

# Mirror the selection on the test split by removing the same columns
weather_daily_fs1_test_df = weather_daily_test_df.drop(cols_to_drop, axis=1)

dropping Sunshine (hour/day) from (Cloudy (hour/day), Sunshine (hour/day))


In [11]:
# Fit on the Pearson-filtered train set; train_regression() returns the fitted
# model, the hyper-parameters it selected from param_grid, and three scores.
(model, selected_hyperparams,
 train_r2, train_mae, train_rmse) = train_regression(
    initial_model,
    param_grid,
    weather_daily_fs1_train_df,
    cls='Rainfall (mm)',
)

# Report the chosen hyper-parameters, then the train-set performance
print(
    f'Selected hyperparameters: {selected_hyperparams}',
    f'Train set performance: r2-score={train_r2}, mae={train_mae}, rmse={train_rmse}',
    sep='\n',
)

Selected hyperparameters: {'classifier__n_estimators': 175, 'classifier__subsample': 0.5, 'regressor__max_features': 'sqrt', 'regressor__n_estimators': 180}
Train set performance: r2-score=0.246, mae=5.5346, rmse=12.936


In [12]:
# # r2-scores graph on the train set

# # hyper-parameters selected by GridSearchCV
# selected_model_params = selected_hyperparams
# #selected_model_params['random_state'] = RAND_SEED

# showEvalutationGraph_regression(MODEL_CLASS, weather_daily_fs1_train_df, cls='Rainfall (mm)', 
#                                 x_axis_param_name=x_axis_param_name, x_axis_param_vals=x_axis_vals, 
#                                 selected_model_params=selected_model_params)

In [13]:
# Score the Pearson-filtered model on the matching (column-reduced) test set
(test_r2,
 test_mae,
 test_rmse) = eval_regression(
    model,
    weather_daily_fs1_test_df,
    cls='Rainfall (mm)',
)

# Report test-set performance
print(
    f'Test set performance: r2-score={test_r2}, mae={test_mae}, rmse={test_rmse}'
)

Test set performance: r2-score=0.1692, mae=5.8022, rmse=15.7602


### 1.2 Apply SelectKBest Feature Selection to Daily Weather Dataset

In [14]:
# SelectKBest feature selection is computed on the train split only; the helper
# returns the reduced train frame and the names of dropped columns.
weather_daily_fs2_train_df, cols_to_drop = seleckKBest_fs(
    weather_daily_train_df,
    'Rainfall (mm)',
    is_regression=True,
)

print('features dropped:', cols_to_drop)

# Mirror the selection on the test split by removing the same columns
weather_daily_fs2_test_df = weather_daily_test_df.drop(cols_to_drop, axis=1)

features dropped: ['Max Temp. (degree Celcius)', 'Solar Radiation (cal/cm^2/day)']


In [15]:
# Fit on the SelectKBest-filtered train set; train_regression() returns the
# fitted model, the hyper-parameters it selected from param_grid, and three scores.
(model, selected_hyperparams,
 train_r2, train_mae, train_rmse) = train_regression(
    initial_model,
    param_grid,
    weather_daily_fs2_train_df,
    cls='Rainfall (mm)',
)

# Report the chosen hyper-parameters, then the train-set performance
print(
    f'Selected hyperparameters: {selected_hyperparams}',
    f'Train set performance: r2-score={train_r2}, mae={train_mae}, rmse={train_rmse}',
    sep='\n',
)

Selected hyperparameters: {'classifier__n_estimators': 110, 'classifier__subsample': 0.5, 'regressor__max_features': 'sqrt', 'regressor__n_estimators': 180}
Train set performance: r2-score=0.2311, mae=5.5123, rmse=13.0823


In [16]:
# # r2-scores graph on the train set

# # hyper-parameters selected by GridSearchCV
# selected_model_params = selected_hyperparams
# #selected_model_params['random_state'] = RAND_SEED

# showEvalutationGraph_regression(MODEL_CLASS, weather_daily_fs2_train_df, cls='Rainfall (mm)', 
#                                 x_axis_param_name=x_axis_param_name, x_axis_param_vals=x_axis_vals, 
#                                 selected_model_params=selected_model_params)

In [17]:
# Score the SelectKBest-filtered model on the matching (column-reduced) test set
(test_r2,
 test_mae,
 test_rmse) = eval_regression(
    model,
    weather_daily_fs2_test_df,
    cls='Rainfall (mm)',
)

# Report test-set performance
print(
    f'Test set performance: r2-score={test_r2}, mae={test_mae}, rmse={test_rmse}'
)

Test set performance: r2-score=0.1462, mae=5.9316, rmse=15.9771


### 1.3 Apply SelectSequential Feature Selection to Daily Weather Dataset

In [18]:
# Sequential feature selection is computed on the train split only; the helper
# returns the reduced train frame and the names of dropped columns.
weather_daily_fs3_train_df, cols_to_drop = selectSequential_fs(
    weather_daily_train_df,
    'Rainfall (mm)',
    is_regression=True,
)

print('features dropped:', cols_to_drop)

# Mirror the selection on the test split by removing the same columns
weather_daily_fs3_test_df = weather_daily_test_df.drop(cols_to_drop, axis=1)

features dropped: ['Actual Evaporation (mm)', 'Relative Humidity (morning, %)']


In [19]:
# Fit on the sequentially-selected train set; train_regression() returns the
# fitted model, the hyper-parameters it selected from param_grid, and three scores.
(model, selected_hyperparams,
 train_r2, train_mae, train_rmse) = train_regression(
    initial_model,
    param_grid,
    weather_daily_fs3_train_df,
    cls='Rainfall (mm)',
)

# Report the chosen hyper-parameters, then the train-set performance
print(
    f'Selected hyperparameters: {selected_hyperparams}',
    f'Train set performance: r2-score={train_r2}, mae={train_mae}, rmse={train_rmse}',
    sep='\n',
)

Selected hyperparameters: {'classifier__n_estimators': 120, 'classifier__subsample': 0.5, 'regressor__max_features': 'sqrt', 'regressor__n_estimators': 165}
Train set performance: r2-score=0.1882, mae=5.8003, rmse=13.4055


In [20]:
# # r2-scores graph on the train set

# # hyper-parameters selected by GridSearchCV
# selected_model_params = selected_hyperparams
# #selected_model_params['random_state'] = RAND_SEED

# showEvalutationGraph_regression(MODEL_CLASS, weather_daily_fs3_train_df, cls='Rainfall (mm)', 
#                                 x_axis_param_name=x_axis_param_name, x_axis_param_vals=x_axis_vals, 
#                                 selected_model_params=selected_model_params)

In [21]:
# Score the sequentially-selected model on the matching (column-reduced) test set
(test_r2,
 test_mae,
 test_rmse) = eval_regression(
    model,
    weather_daily_fs3_test_df,
    cls='Rainfall (mm)',
)

# Report test-set performance
print(
    f'Test set performance: r2-score={test_r2}, mae={test_mae}, rmse={test_rmse}'
)

Test set performance: r2-score=0.0694, mae=6.4426, rmse=16.6803
