In [1]:
import os.path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# for Q-Q plots
import scipy.stats as stats
# from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

from sklearn import metrics

from sklearn.datasets import make_blobs
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV



from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.simplefilter('ignore')

In [2]:
ds = pd.read_csv('D:/3Kurs/1Sem/SS/Practice/rgr/data/weatherHistory_clean.csv')
ds.head()

Unnamed: 0.1,Unnamed: 0,Temperature,Humidity,Wind_Speed,Wind_Bearing,Visibility,Pressure,Partly Cloudy,Humid,Rain,...,Overcast,Windy,Dangerously Windy,Foggy,Drizzle,Mostly Cloudy,Snow,Year,Month,Hour
0,0,0.506975,0.89,0.360609,0.699164,15.8263,0.491705,1,0,1,...,0,0,0,0,0,0,0,2006,3,22
1,1,0.505085,0.86,0.364309,0.721448,15.8263,0.499837,1,0,1,...,0,0,0,0,0,0,0,2006,3,23
2,2,0.505445,0.89,0.100329,0.568245,14.9569,0.50488,0,0,1,...,0,0,0,0,0,1,0,2006,4,0
3,3,0.487805,0.83,0.360197,0.749304,15.8263,0.512524,1,0,1,...,0,0,0,0,0,0,0,2006,4,1
4,4,0.495365,0.83,0.282072,0.721448,15.8263,0.514151,0,0,1,...,0,0,0,0,0,1,0,2006,4,2


In [3]:
X = ds.drop(['Humidity'], axis=1)
y = ds['Humidity']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
rf = RandomForestRegressor(random_state = 42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # або np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2):", r2)

Mean Absolute Error (MAE): 0.04881749530956847
Mean Squared Error (MSE): 0.004553599637794453
Root Mean Squared Error (RMSE): 0.06748036483151564
R-squared (R2): 0.8810384661321047


### I will determine which parameters I will look for.

In [6]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

### Search for the best hyperparameters

In [7]:
rf_model = RandomForestRegressor(random_state = 42)

param_distributions = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': ['sqrt', 'log2', None]
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_distributions,
    n_iter=50,  # Number of parameter settings to sample
    scoring='neg_mean_squared_error',  # Scoring metric
    cv=5,  # Cross-validation splits
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_params_random = random_search.best_params_
best_score_random = -random_search.best_score_

print("Best parameters (random reserch):", best_params_random)
print("Best mean squared error (MSE) (random reserch):", best_score_random)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters (random reserch): {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': None}
Best mean squared error (MSE) (random reserch): 0.00478613919477963


#### I decided to use a random search to find the superparems, because since I have a large dataset, the search by grid would take a very long time.
### So in the end, the best hyperparameters are {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': None}