# ML Model Development - Grid search
Grid-search: Exhaustively search all parameter combinations in a given grid to determine the best model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Define the path to the dataset
dataset_path = "../DataPreprocessing/Dataset/SriLanka_Weather_Dataset new.csv"

# Load the dataset into a Pandas DataFrame
weather_df = pd.read_csv(dataset_path)

print("Dataset Information:")
print(weather_df.info())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147480 entries, 0 to 147479
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   time                        147480 non-null  object 
 1   weathercode                 147480 non-null  int64  
 2   temperature_2m_max          147480 non-null  float64
 3   temperature_2m_min          147480 non-null  float64
 4   temperature_2m_mean         147480 non-null  float64
 5   apparent_temperature_max    147480 non-null  float64
 6   apparent_temperature_min    147480 non-null  float64
 7   apparent_temperature_mean   147480 non-null  float64
 8   sunrise                     147480 non-null  object 
 9   sunset                      147480 non-null  object 
 10  shortwave_radiation_sum     147480 non-null  float64
 11  precipitation_sum           147480 non-null  float64
 12  rain_sum                    147480 non-null  float6

## Dataset Preprocessing

In [2]:
# Convert 'time' column to datetime format and extract year, month, and day
weather_df['time'] = pd.to_datetime(weather_df['time'])
weather_df['year'] = weather_df['time'].dt.year
weather_df['month'] = weather_df['time'].dt.month
weather_df['day'] = weather_df['time'].dt.day

# Convert 'sunrise' and 'sunset' columns to datetime format and extract only time
weather_df['sunrise'] = pd.to_datetime(weather_df['sunrise']).dt.time
weather_df['sunset'] = pd.to_datetime(weather_df['sunset']).dt.time
weather_df['sunrise'] = pd.to_datetime(weather_df['sunrise'], format='%H:%M:%S').dt.hour * 60 + pd.to_datetime(weather_df['sunrise'], format='%H:%M:%S').dt.minute
weather_df['sunset'] = pd.to_datetime(weather_df['sunset'], format='%H:%M:%S').dt.hour * 60 + pd.to_datetime(weather_df['sunset'], format='%H:%M:%S').dt.minute

# Rename 'precipitation_hours' to 'rain_hours'
weather_df.rename(columns={'precipitation_hours': 'rain_hours'}, inplace=True)

# Remove unnecessary columns
columns_to_remove = ['precipitation_sum', 'snowfall_sum', 'latitude', 'longitude', 'elevation', 'country', 'time']
weather_df.drop(columns=columns_to_remove, inplace=True)

# Reorder columns
column_order = ['city', 'year', 'month', 'day', 'weathercode', 'temperature_2m_max', 'temperature_2m_min',
                'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min',
                'apparent_temperature_mean', 'sunrise', 'sunset', 'shortwave_radiation_sum',
                'rain_sum', 'rain_hours', 'windspeed_10m_max', 'windgusts_10m_max',
                'winddirection_10m_dominant', 'et0_fao_evapotranspiration']

# Rearrange columns
weather_df = weather_df[column_order]

# Display the first few rows of the modified dataset
weather_df.head()

Unnamed: 0,city,year,month,day,weathercode,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,shortwave_radiation_sum,rain_sum,rain_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,et0_fao_evapotranspiration
0,Colombo,2010,1,1,2,30.0,22.7,26.1,34.4,25.2,29.2,52,755,20.92,0.0,0,11.7,27.4,20,4.58
1,Colombo,2010,1,2,51,29.9,23.5,26.2,33.8,26.2,29.8,52,756,17.71,0.1,1,13.0,27.0,24,3.84
2,Colombo,2010,1,3,51,29.5,23.2,26.0,34.3,26.3,29.9,53,756,17.76,0.6,3,12.3,27.4,16,3.65
3,Colombo,2010,1,4,2,28.9,21.9,25.3,31.6,23.4,27.8,53,757,16.5,0.0,0,17.0,34.6,356,3.79
4,Colombo,2010,1,5,1,28.1,21.3,24.5,30.1,23.1,26.1,53,757,23.61,0.0,0,18.7,37.1,355,4.97


### Define independent variables (features) and dependent variable (target)

In [3]:
# Define independent variables (features)
X = weather_df[['city', 'year', 'month', 'day']]

# Define dependent variable (target)
y = weather_df[['weathercode', 'temperature_2m_max', 'temperature_2m_min',
                    'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min',
                    'apparent_temperature_mean', 'sunrise', 'sunset', 'shortwave_radiation_sum',
                    'rain_sum', 'rain_hours', 'windspeed_10m_max', 'windgusts_10m_max',
                    'winddirection_10m_dominant', 'et0_fao_evapotranspiration']]

# Display the first few rows of independent variables
X.head()

Unnamed: 0,city,year,month,day
0,Colombo,2010,1,1
1,Colombo,2010,1,2
2,Colombo,2010,1,3
3,Colombo,2010,1,4
4,Colombo,2010,1,5


In [4]:
# Display the first few rows of dependent variables
y.head()

Unnamed: 0,weathercode,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,shortwave_radiation_sum,rain_sum,rain_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,et0_fao_evapotranspiration
0,2,30.0,22.7,26.1,34.4,25.2,29.2,52,755,20.92,0.0,0,11.7,27.4,20,4.58
1,51,29.9,23.5,26.2,33.8,26.2,29.8,52,756,17.71,0.1,1,13.0,27.0,24,3.84
2,51,29.5,23.2,26.0,34.3,26.3,29.9,53,756,17.76,0.6,3,12.3,27.4,16,3.65
3,2,28.9,21.9,25.3,31.6,23.4,27.8,53,757,16.5,0.0,0,17.0,34.6,356,3.79
4,1,28.1,21.3,24.5,30.1,23.1,26.1,53,757,23.61,0.0,0,18.7,37.1,355,4.97


In [5]:
import pickle

# Load previously saved encoding files
with open('../DataPreprocessing/X_encoded.pkl', 'rb') as file:
    X_encoded = pickle.load(file)

with open('../DataPreprocessing/y_encoded.pkl', 'rb') as file:
    y_encoded = pickle.load(file)

# Display the first few rows of the encoded X
X_encoded.head()

Unnamed: 0,city_Badulla,city_Bentota,city_Colombo,city_Galle,city_Gampaha,city_Hambantota,city_Hatton,city_Jaffna,city_Kalmunai,city_Kalutara,...,day_29,day_3,day_30,day_31,day_4,day_5,day_6,day_7,day_8,day_9
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [6]:
# Display the first few rows of the encoded y
y_encoded.head()

Unnamed: 0,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,shortwave_radiation_sum,rain_sum,...,et0_fao_evapotranspiration,weathercode_1,weathercode_2,weathercode_3,weathercode_51,weathercode_53,weathercode_55,weathercode_61,weathercode_63,weathercode_65
0,30.0,22.7,26.1,34.4,25.2,29.2,52,755,20.92,0.0,...,4.58,False,True,False,False,False,False,False,False,False
1,29.9,23.5,26.2,33.8,26.2,29.8,52,756,17.71,0.1,...,3.84,False,False,False,True,False,False,False,False,False
2,29.5,23.2,26.0,34.3,26.3,29.9,53,756,17.76,0.6,...,3.65,False,False,False,True,False,False,False,False,False
3,28.9,21.9,25.3,31.6,23.4,27.8,53,757,16.5,0.0,...,3.79,False,True,False,False,False,False,False,False,False
4,28.1,21.3,24.5,30.1,23.1,26.1,53,757,23.61,0.0,...,4.97,True,False,False,False,False,False,False,False,False


## RF Model Development

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [9]:
from sklearn.ensemble import RandomForestRegressor

def train_RF(n_est, depth, X_train, y_train, X_test, y_test):
    rf = RandomForestRegressor(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    
    # Assuming your y_test is a DataFrame with the same structure as y_train
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print('Estimators: {} / Depth: {} ---- MSE: {} / R2: {}'.format(
        n_est, depth, round(mse, 3), round(r2, 3)))
    
    return mse, r2

best_params = {'n_estimators': None, 'max_depth': None}
best_metrics = {'mse': float('inf'), 'r2': -float('inf')}

for n_est in [10, 50, 100, 150, 200]:
    for depth in [10, 20, 30, 50, None]:
        mse, r2 = train_RF(n_est, depth, X_train, y_train, X_test, y_test)
        
        # Update best metrics and parameters
        if mse + r2 < best_metrics['mse'] + best_metrics['r2']:
            best_metrics['mse'] = mse
            best_metrics['r2'] = r2
            best_params['n_estimators'] = n_est
            best_params['max_depth'] = depth

print('\nBest Parameters:')
print('Estimators: {}, Depth: {} ---- MSE: {} / R2: {}'.format(
    best_params['n_estimators'], best_params['max_depth'],
    round(best_metrics['mse'], 3), round(best_metrics['r2'], 3)))

Estimators: 10 / Depth: 10 ---- MSE: 242.85 / R2: 0.216
Estimators: 10 / Depth: 20 ---- MSE: 197.794 / R2: 0.356
Estimators: 10 / Depth: 30 ---- MSE: 185.231 / R2: 0.408
Estimators: 10 / Depth: 50 ---- MSE: 178.94 / R2: 0.458
Estimators: 10 / Depth: None ---- MSE: 185.994 / R2: 0.476
Estimators: 50 / Depth: 10 ---- MSE: 242.742 / R2: 0.215
Estimators: 50 / Depth: 20 ---- MSE: 195.463 / R2: 0.361
Estimators: 50 / Depth: 30 ---- MSE: 178.97 / R2: 0.419
Estimators: 50 / Depth: 50 ---- MSE: 179.005 / R2: 0.475
Estimators: 50 / Depth: None ---- MSE: 182.409 / R2: 0.496
Estimators: 100 / Depth: 10 ---- MSE: 241.854 / R2: 0.217
Estimators: 100 / Depth: 20 ---- MSE: 195.509 / R2: 0.362
Estimators: 100 / Depth: 30 ---- MSE: 177.774 / R2: 0.421
Estimators: 100 / Depth: 50 ---- MSE: 176.17 / R2: 0.479
Estimators: 100 / Depth: None ---- MSE: 179.658 / R2: 0.5
Estimators: 150 / Depth: 10 ---- MSE: 242.628 / R2: 0.217
Estimators: 150 / Depth: 20 ---- MSE: 195.113 / R2: 0.362
Estimators: 150 / Depth:

In [10]:
from sklearn.ensemble import RandomForestRegressor

def train_RF(n_est, depth, X_train, y_train, X_test, y_test):
    rf = RandomForestRegressor(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    
    # Assuming your y_test is a DataFrame with the same structure as y_train
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print('Estimators: {} / Depth: {} ---- MSE: {} / R2: {}'.format(
        n_est, depth, round(mse, 3), round(r2, 3)))
    
    return mse, r2

best_params = {'n_estimators': None, 'max_depth': None}
best_metrics = {'mse': float('inf'), 'r2': -float('inf')}

for n_est in [200, 250, 300, 400, 500, 600, 800, 1000]:
    for depth in [None]:
        mse, r2 = train_RF(n_est, depth, X_train, y_train, X_test, y_test)
        
        # Update best metrics and parameters
        if mse + r2 < best_metrics['mse'] + best_metrics['r2']:
            best_metrics['mse'] = mse
            best_metrics['r2'] = r2
            best_params['n_estimators'] = n_est
            best_params['max_depth'] = depth

print('\nBest Parameters:')
print('Estimators: {}, Depth: {} ---- MSE: {} / R2: {}'.format(
    best_params['n_estimators'], best_params['max_depth'],
    round(best_metrics['mse'], 3), round(best_metrics['r2'], 3)))

Estimators: 200 / Depth: None ---- MSE: 178.267 / R2: 0.502
Estimators: 250 / Depth: None ---- MSE: 178.086 / R2: 0.502
Estimators: 300 / Depth: None ---- MSE: 178.009 / R2: 0.504
Estimators: 400 / Depth: None ---- MSE: 178.3 / R2: 0.503
Estimators: 500 / Depth: None ---- MSE: 178.383 / R2: 0.503


KeyboardInterrupt: 