In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR
from xgboost import XGBRegressor, XGBClassifier
print('Libraries imported.')

In [None]:
# Load the weather observations from CSV into the working DataFrame
csv_path = "mohali_5_year_weather.csv"  # change file name as needed
df = pd.read_csv(csv_path)
# Preview the first rows (rendered as the cell output)
df.head()

In [None]:
# Basic info: print the column / dtype / non-null overview first,
# then show the descriptive statistics table as the cell output.
df.info()
summary_stats = df.describe()
summary_stats

In [None]:
# Standardize columns & date.
# Column labels are normalised (trimmed, lower-cased) BEFORE the date column is
# accessed, so headers such as "Date" or " date " in the CSV do not raise a
# KeyError on the df['date'] lookup below.
df = df.rename(columns=lambda col: col.strip().lower())

# Parse the date strings into datetime64 values so they plot on a real time axis.
df['date'] = pd.to_datetime(df['date'])
df.head()

In [None]:
# 5-year trend plot: daily average temperature over time
plt.figure(figsize=(14, 4))
# The degree sign is written as an escape so the legend label cannot be
# corrupted by a file-encoding round trip (it previously rendered as "Â°C").
plt.plot(df['date'], df['avgtemp_c'], label='Temperature (\u00b0C)')
plt.title("5-Year Trend of Temperature")
plt.xlabel("Date")
plt.ylabel("Temperature")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Rainfall trend over the full date range, drawn on an explicit Axes object
rain_fig, rain_ax = plt.subplots(figsize=(12, 4))
rain_ax.plot(df['date'], df['rainfall_mm'], label='Rainfall (mm)')
rain_ax.set_title('5-Year Rainfall Trend')
rain_ax.set_xlabel('Date')
rain_ax.set_ylabel('Rainfall (mm)')
rain_ax.grid(True)
rain_ax.legend()
plt.show()

In [None]:
# Temperature distribution: histogram with KDE (left) and boxplot (right)
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
hist_ax, box_ax = ax
temperature = df['avgtemp_c']

sns.histplot(temperature, kde=True, ax=hist_ax)
hist_ax.set_title("Temperature Distribution")

sns.boxplot(y=temperature, ax=box_ax)
box_ax.set_title("Temperature Boxplot")

plt.show()

In [None]:
# Rainfall distribution: 30-bin histogram (left) and boxplot (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
hist_ax, box_ax = axes
rainfall = df['rainfall_mm']

hist_ax.hist(rainfall, bins=30)
hist_ax.set(title='Rainfall Histogram', xlabel='Rainfall (mm)', ylabel='Frequency')

box_ax.boxplot(rainfall)
box_ax.set(title='Rainfall Boxplot', ylabel='Rainfall (mm)')

plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# numeric_only=True keeps the datetime 'date' column (and any text columns) out
# of the correlation; without it, pandas >= 2.0 raises instead of silently
# skipping non-numeric columns.
corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Binary rain indicator: 1 when the day's rainfall is strictly positive, else 0.
is_rainy = df['rainfall_mm'].gt(0)
df['rainfall_mm_binary'] = is_rainy.astype(int)


In [None]:
# Feature engineering for the temperature regression.
# Every feature is derived from PREVIOUS rows only, so the value being
# predicted (the same row's avgtemp_c) never leaks into its own inputs.

# shift()/rolling() operate on row order, so make the rows chronological first.
df = df.sort_values('date', kind='stable').reset_index(drop=True)

# Lag feature: temperature from previous day
df['temp_lag1'] = df['avgtemp_c'].shift(1)

# Lag feature: humidity from previous day. The train/test split cell selects
# 'humidity_lag1', so it has to exist before that cell runs.
if 'humidity_lag1' not in df.columns:
    # NOTE(review): the source humidity column is located by name because its
    # exact label is not visible in this notebook - confirm against the CSV.
    humidity_col = next((c for c in df.columns if 'humid' in c), None)
    if humidity_col is None:
        raise KeyError("No humidity column found to build 'humidity_lag1'")
    df['humidity_lag1'] = df[humidity_col].shift(1)

# 3-day rolling average of the three PREVIOUS days' temperature.
# Shifting before rolling excludes the current day, which is the target.
df['temp_roll3'] = df['avgtemp_c'].shift(1).rolling(window=3).mean()

# Remove rows with NaN after lag/rolling
df = df.dropna()

In [None]:
# Train 80% / test 20% split of the regression data.
# shuffle=False keeps row order, so the test set is the last 20% of the rows.
feature_cols = ['temp_lag1', 'humidity_lag1', 'temp_roll3']

# Fail with an explicit message if an engineered feature is absent, instead of
# the generic pandas indexing error.
missing_cols = [col for col in feature_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Feature columns missing from df: {missing_cols}")

X = df[feature_cols]
y = df['avgtemp_c']

# train_test_split is already imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print("Train/Test shapes (Reg):", X_train.shape, X_test.shape)

In [None]:
# Training regression models.
# A fixed seed makes the stochastic learners (random forest bootstrapping and
# feature sampling, XGBoost) give the same result on every Restart & Run All.
RANDOM_STATE = 42

rf = RandomForestRegressor(random_state=RANDOM_STATE)
svr = SVR()  # SVR takes no random_state parameter
xgb = XGBRegressor(random_state=RANDOM_STATE)

rf.fit(X_train, y_train)
svr.fit(X_train, y_train)
xgb.fit(X_train, y_train)

In [None]:
# Regression evaluation on the held-out test set.
# mean_squared_error, r2_score and numpy are already imported in the
# notebook's import cell, so they are not re-imported here.


def _rmse(y_true, y_pred):
    """Root-mean-squared error, taken as sqrt(MSE) so it also works on older sklearn versions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


pred_rf = rf.predict(X_test)
pred_svr = svr.predict(X_test)
pred_xgb = xgb.predict(X_test)

rmse_rf = _rmse(y_test, pred_rf)
rmse_svr = _rmse(y_test, pred_svr)
rmse_xgb = _rmse(y_test, pred_xgb)

# One line per model: label, RMSE, then R2
for label, rmse, preds in (
    ("RF", rmse_rf, pred_rf),
    ("SVR", rmse_svr, pred_svr),
    ("XGB", rmse_xgb, pred_xgb),
):
    print(f"{label} RMSE & R2:", rmse, r2_score(y_test, preds))
