# ML Model Development

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Define the path to the dataset
dataset_path = "../DataPreprocessing/Dataset/SriLanka_Weather_Dataset new.csv"

# Load the dataset into a Pandas DataFrame with 'time' column as index
weather_df = pd.read_csv(dataset_path, parse_dates=['time'], index_col='time')

print("Dataset Information:")
print(weather_df.info())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 147480 entries, 2010-01-01 to 2023-06-17
Data columns (total 23 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   weathercode                 147480 non-null  int64  
 1   temperature_2m_max          147480 non-null  float64
 2   temperature_2m_min          147480 non-null  float64
 3   temperature_2m_mean         147480 non-null  float64
 4   apparent_temperature_max    147480 non-null  float64
 5   apparent_temperature_min    147480 non-null  float64
 6   apparent_temperature_mean   147480 non-null  float64
 7   sunrise                     147480 non-null  object 
 8   sunset                      147480 non-null  object 
 9   shortwave_radiation_sum     147480 non-null  float64
 10  precipitation_sum           147480 non-null  float64
 11  rain_sum                    147480 non-null  float64
 12  snowfall_sum                147480 

## Dataset Preprocessing

In [2]:
# Extract year, month, and day
weather_df['year'] = weather_df.index.year
weather_df['month'] = weather_df.index.month
weather_df['day'] = weather_df.index.day

# Convert 'sunrise' and 'sunset' columns to datetime format and extract only time
weather_df['sunrise'] = pd.to_datetime(weather_df['sunrise']).dt.time
weather_df['sunset'] = pd.to_datetime(weather_df['sunset']).dt.time
weather_df['sunrise'] = pd.to_datetime(weather_df['sunrise'], format='%H:%M:%S').dt.hour * 60 + pd.to_datetime(weather_df['sunrise'], format='%H:%M:%S').dt.minute
weather_df['sunset'] = pd.to_datetime(weather_df['sunset'], format='%H:%M:%S').dt.hour * 60 + pd.to_datetime(weather_df['sunset'], format='%H:%M:%S').dt.minute

# Rename 'precipitation_hours' to 'rain_hours'
weather_df.rename(columns={'precipitation_hours': 'rain_hours'}, inplace=True)

# Remove unnecessary columns
columns_to_remove = ['precipitation_sum', 'snowfall_sum', 'latitude', 'longitude', 'elevation', 'country']
weather_df.drop(columns=columns_to_remove, inplace=True)

# Reorder columns
column_order = ['city', 'year', 'month', 'day', 'weathercode', 'temperature_2m_max', 'temperature_2m_min',
                'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min',
                'apparent_temperature_mean', 'sunrise', 'sunset', 'shortwave_radiation_sum',
                'rain_sum', 'rain_hours', 'windspeed_10m_max', 'windgusts_10m_max',
                'winddirection_10m_dominant', 'et0_fao_evapotranspiration']

# Rearrange columns
weather_df = weather_df[column_order]

# Display the first few rows of the modified dataset
weather_df.head()

Unnamed: 0_level_0,city,year,month,day,weathercode,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,shortwave_radiation_sum,rain_sum,rain_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,et0_fao_evapotranspiration
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2010-01-01,Colombo,2010,1,1,2,30.0,22.7,26.1,34.4,25.2,29.2,52,755,20.92,0.0,0,11.7,27.4,20,4.58
2010-01-02,Colombo,2010,1,2,51,29.9,23.5,26.2,33.8,26.2,29.8,52,756,17.71,0.1,1,13.0,27.0,24,3.84
2010-01-03,Colombo,2010,1,3,51,29.5,23.2,26.0,34.3,26.3,29.9,53,756,17.76,0.6,3,12.3,27.4,16,3.65
2010-01-04,Colombo,2010,1,4,2,28.9,21.9,25.3,31.6,23.4,27.8,53,757,16.5,0.0,0,17.0,34.6,356,3.79
2010-01-05,Colombo,2010,1,5,1,28.1,21.3,24.5,30.1,23.1,26.1,53,757,23.61,0.0,0,18.7,37.1,355,4.97


### Define independent variables (features) and dependent variable (target)

In [3]:
# Define independent variables (features)
X = weather_df[['city', 'year', 'month', 'day']]

# Define dependent variable (target)
y = weather_df[['weathercode', 'temperature_2m_max', 'temperature_2m_min',
                    'temperature_2m_mean', 'apparent_temperature_max', 'apparent_temperature_min',
                    'apparent_temperature_mean', 'sunrise', 'sunset', 'shortwave_radiation_sum',
                    'rain_sum', 'rain_hours', 'windspeed_10m_max', 'windgusts_10m_max',
                    'winddirection_10m_dominant', 'et0_fao_evapotranspiration']]

# Display the first few rows of independent variables
X.head()

Unnamed: 0_level_0,city,year,month,day
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,Colombo,2010,1,1
2010-01-02,Colombo,2010,1,2
2010-01-03,Colombo,2010,1,3
2010-01-04,Colombo,2010,1,4
2010-01-05,Colombo,2010,1,5


In [4]:
# Display the first few rows of dependent variables
y.head()

Unnamed: 0_level_0,weathercode,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,shortwave_radiation_sum,rain_sum,rain_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,et0_fao_evapotranspiration
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2010-01-01,2,30.0,22.7,26.1,34.4,25.2,29.2,52,755,20.92,0.0,0,11.7,27.4,20,4.58
2010-01-02,51,29.9,23.5,26.2,33.8,26.2,29.8,52,756,17.71,0.1,1,13.0,27.0,24,3.84
2010-01-03,51,29.5,23.2,26.0,34.3,26.3,29.9,53,756,17.76,0.6,3,12.3,27.4,16,3.65
2010-01-04,2,28.9,21.9,25.3,31.6,23.4,27.8,53,757,16.5,0.0,0,17.0,34.6,356,3.79
2010-01-05,1,28.1,21.3,24.5,30.1,23.1,26.1,53,757,23.61,0.0,0,18.7,37.1,355,4.97


In [5]:
import pickle

# Load previously saved encoding files
with open('../DataPreprocessing/X_encoded.pkl', 'rb') as file:
    X_encoded = pickle.load(file)

with open('../DataPreprocessing/y_encoded.pkl', 'rb') as file:
    y_encoded = pickle.load(file)

# Display the first few rows of the encoded X
X_encoded.head()

Unnamed: 0_level_0,city_Badulla,city_Bentota,city_Colombo,city_Galle,city_Gampaha,city_Hambantota,city_Hatton,city_Jaffna,city_Kalmunai,city_Kalutara,...,day_29,day_3,day_30,day_31,day_4,day_5,day_6,day_7,day_8,day_9
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2010-01-02,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2010-01-03,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2010-01-04,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2010-01-05,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [11]:
# Display the first few rows of the encoded y
y_encoded.head()

Unnamed: 0_level_0,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,shortwave_radiation_sum,rain_sum,...,et0_fao_evapotranspiration,weathercode_1,weathercode_2,weathercode_3,weathercode_51,weathercode_53,weathercode_55,weathercode_61,weathercode_63,weathercode_65
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01,30.0,22.7,26.1,34.4,25.2,29.2,52,755,20.92,0.0,...,4.58,False,True,False,False,False,False,False,False,False
2010-01-02,29.9,23.5,26.2,33.8,26.2,29.8,52,756,17.71,0.1,...,3.84,False,False,False,True,False,False,False,False,False
2010-01-03,29.5,23.2,26.0,34.3,26.3,29.9,53,756,17.76,0.6,...,3.65,False,False,False,True,False,False,False,False,False
2010-01-04,28.9,21.9,25.3,31.6,23.4,27.8,53,757,16.5,0.0,...,3.79,False,True,False,False,False,False,False,False,False
2010-01-05,28.1,21.3,24.5,30.1,23.1,26.1,53,757,23.61,0.0,...,4.97,True,False,False,False,False,False,False,False,False


In [13]:
y_encoded.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 147480 entries, 2010-01-01 to 2023-06-17
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   temperature_2m_max          147480 non-null  float64
 1   temperature_2m_min          147480 non-null  float64
 2   temperature_2m_mean         147480 non-null  float64
 3   apparent_temperature_max    147480 non-null  float64
 4   apparent_temperature_min    147480 non-null  float64
 5   apparent_temperature_mean   147480 non-null  float64
 6   sunrise                     147480 non-null  int32  
 7   sunset                      147480 non-null  int32  
 8   shortwave_radiation_sum     147480 non-null  float64
 9   rain_sum                    147480 non-null  float64
 10  rain_hours                  147480 non-null  int64  
 11  windspeed_10m_max           147480 non-null  float64
 12  windgusts_10m_max           147480 non-null  float64
 13

## RF Model Development

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.1, random_state=42)

# Define Random Forest model with hyperparameters
rf_model = RandomForestRegressor(n_estimators=200, bootstrap=True,
                                    max_depth=None, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Predictions on training and testing data
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

# Calculate RMSE for training and testing data
train_rmse_rf = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))

# Calculate R-squared for training and testing data
train_r2_rf = r2_score(y_train, y_train_pred_rf)
test_r2_rf = r2_score(y_test, y_test_pred_rf)

# Print the metrics
print("\nRF Model - Train RMSE: {:.2f}".format(train_rmse_rf))
print("RF Model - Test RMSE: {:.2f}".format(test_rmse_rf))
print("RF Model - Train R2: {:.2f}".format(train_r2_rf))
print("RF Model - Test R: {:.2f}".format(test_r2_rf))


RF Model - Train RMSE: 4.88
RF Model - Test RMSE: 13.12
RF Model - Train R2: 0.93
RF Model - Test R: 0.50


In [8]:
# Calculate Accuracy in Percentages
def calculate_accuracy(r2_score):
    return (r2_score * 100).round(2)  # Convert R-squared to percentage

# Calculate accuracy for Random Forest Model
train_accuracy_rf = calculate_accuracy(train_r2_rf)
test_accuracy_rf = calculate_accuracy(test_r2_rf)

print("\nRF Model - Train Accuracy: {:.2f}%".format(train_accuracy_rf))
print("RF Model - Test Accuracy: {:.2f}%".format(test_accuracy_rf))


RF Model - Train Accuracy: 93.25%
RF Model - Test Accuracy: 50.12%
