In [None]:
#import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    precision_recall_curve,
    r2_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from xgboost import XGBClassifier, XGBRegressor
print('Libraries imported.')

In [None]:
# Load the raw weather observations into a DataFrame.
DATA_FILE = "mohali_5_year_weather.csv"  # change file name as needed
df = pd.read_csv(DATA_FILE)
# Show the first rows as the cell output.
df.head()

In [None]:
# Basic info: structural overview of the frame, printed by info().
df.info()
# Descriptive statistics, rendered as the cell's output value.
summary_stats = df.describe()
summary_stats

In [None]:
# Standardize columns & date
# Normalise the column names FIRST (lower-case, surrounding whitespace removed)
# so that the 'date' lookup below also succeeds when the CSV header is spelled
# e.g. 'Date' or ' date '.
df = df.rename(columns=lambda col: col.lower().strip())
df['date'] = pd.to_datetime(df['date'])
# The lag/rolling features and the unshuffled train/test split further down
# treat row order as time order, so make the chronological order explicit.
df = df.sort_values('date').reset_index(drop=True)
df.head()

In [None]:
# 5-year temperature trend, drawn through Matplotlib's object-oriented API.
temp_fig, temp_ax = plt.subplots(figsize=(14, 4))
temp_ax.plot(df['date'], df['avgtemp_c'], label='Temperature (°C)')
temp_ax.set(
    title="5-Year Trend of Temperature",
    xlabel="Date",
    ylabel="Temperature",
)
temp_ax.grid(True)
temp_ax.legend()
plt.show()

In [None]:
# Rainfall trend over the full date range of the dataset.
rain_fig, rain_ax = plt.subplots(figsize=(12, 4))
rain_ax.plot(df['date'], df['rainfall_mm'], label='Rainfall (mm)')
rain_ax.set_title('5-Year Rainfall Trend')
rain_ax.set_xlabel('Date')
rain_ax.set_ylabel('Rainfall (mm)')
rain_ax.grid(True)
rain_ax.legend()
plt.show()

In [None]:
# Temperature distribution: histogram with KDE overlay (left) and boxplot (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
hist_ax, box_ax = ax

sns.histplot(data=df, x='avgtemp_c', kde=True, ax=hist_ax)
hist_ax.set_title("Temperature Distribution")

sns.boxplot(data=df, y='avgtemp_c', ax=box_ax)
box_ax.set_title("Temperature Boxplot")

plt.show()

In [None]:
# Rainfall distribution: 30-bin histogram (left) and boxplot (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
rain_mm = df['rainfall_mm']

axes[0].hist(rain_mm, bins=30)
axes[0].set(title='Rainfall Histogram', xlabel='Rainfall (mm)', ylabel='Frequency')

axes[1].boxplot(rain_mm)
axes[1].set(title='Rainfall Boxplot', ylabel='Rainfall (mm)')

plt.show()

In [None]:
# Correlation heatmap
# Correlate numeric columns only. By this point df also carries the parsed
# 'date' column (and possibly other non-numeric columns from the CSV);
# pandas >= 2.0 no longer drops those silently in DataFrame.corr() and raises.
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize=(12,8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Binary rain indicator: 1 when the day's rainfall is above 0 mm, otherwise 0.
is_rainy = df['rainfall_mm'].gt(0)
df['rainfall_mm_binary'] = is_rainy.astype(int)


In [None]:
# Lag / rolling features.
# Every feature for day t is built only from days strictly before t (shift(1)),
# so no feature contains the value that the models are asked to predict.

# Temperature of the previous day
df['temp_lag1'] = df['avgtemp_c'].shift(1)

# Mean temperature of the previous 3 days. The shift keeps day t's own
# temperature (the regression target) out of its rolling window.
df['temp_roll3'] = df['avgtemp_c'].shift(1).rolling(window=3).mean()

# Columns that the modelling cells select but that were never created.
# NOTE(review): definitions are inferred from the column names — confirm that
# rain_lag1 / rain_roll3 should be based on the rainfall amount in mm.
df['humidity_lag1'] = df['avghumidity'].shift(1)
df['rain_lag1'] = df['rainfall_mm'].shift(1)
df['rain_roll3'] = df['rainfall_mm'].shift(1).rolling(window=3).mean()

# Remove rows with NaN after lag/rolling (the first rows have no history).
# Like the original call, this drops a row when ANY column is missing.
df = df.dropna()

In [None]:
# Train 80% / test 20% split for the temperature regression.
reg_features = ['temp_lag1', 'humidity_lag1', 'temp_roll3']
X = df[reg_features]
y = df['avgtemp_c']

# shuffle=False keeps the row order: the first 80% of rows train the models
# and the last 20% are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print("Train/Test shapes (Reg):", X_train.shape, X_test.shape)

In [None]:
# Training regression models.
# Fixed seeds so that Restart & Run All reproduces the same fitted models
# and therefore the same metrics and forecast.
rf = RandomForestRegressor(random_state=42)
# SVR takes no random_state.
# NOTE(review): SVR is sensitive to feature scale; the inputs are passed
# unscaled here — consider standardising them before fitting.
svr = SVR()
xgb = XGBRegressor(random_state=42)

rf.fit(X_train, y_train)
svr.fit(X_train, y_train)
xgb.fit(X_train, y_train)

In [None]:
# Regression evaluation on the held-out test rows.
pred_rf = rf.predict(X_test)
pred_svr = svr.predict(X_test)
pred_xgb = xgb.predict(X_test)

# RMSE taken as sqrt(MSE) so it also works on older sklearn versions
rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
rmse_svr = np.sqrt(mean_squared_error(y_test, pred_svr))
rmse_xgb = np.sqrt(mean_squared_error(y_test, pred_xgb))

print("RF RMSE & R2:", rmse_rf, r2_score(y_test, pred_rf))
print("SVR RMSE & R2:", rmse_svr, r2_score(y_test, pred_svr))
print("XGB RMSE & R2:", rmse_xgb, r2_score(y_test, pred_xgb))


In [None]:
# R2 comparison bar chart for the three regressors.
models = ['RF','SVR','XGB']
scores = [r2_score(y_test, pred_rf), r2_score(y_test, pred_svr), r2_score(y_test, pred_xgb)]
plt.figure(figsize=(5,4))
sns.barplot(x=models, y=scores)
for i, v in enumerate(scores):
    # Value label just past the end of the bar: above positive bars,
    # below negative ones.
    offset, valign = (0.01, 'bottom') if v >= 0 else (-0.01, 'top')
    plt.text(i, v + offset, f'{v:.3f}', ha='center', va=valign)
plt.title("R2 score Comparison")
plt.xlabel('Models'); plt.ylabel('R2 score')
# R2 has no lower bound of 0: a model worse than predicting the mean scores
# negative. Extend the axis downwards in that case so the bar stays visible.
plt.ylim(min(0, min(scores) - 0.1), 1)
plt.show()

In [None]:
# Training classification models (rain vs no-rain).

# Feature & target columns; the lag/rolling columns are expected to exist on df.
Xc = df[['rain_lag1', 'humidity_lag1', 'rain_roll3']]
yc = df['rainfall_mm_binary']

# Train-test split: unshuffled, so the last 20% of rows form the test set
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.2, shuffle=False)

# Models, seeded so that a re-run reproduces the same fitted classifiers
rf_cls = RandomForestClassifier(random_state=42)
xgb_cls = XGBClassifier(eval_metric='logloss', random_state=42)  # eval_metric required for older XGB versions

# Train
rf_cls.fit(Xc_train, yc_train)
xgb_cls.fit(Xc_train, yc_train)

# Predict hard 0/1 labels for the test rows
pred_rf_cls = rf_cls.predict(Xc_test)
pred_xgb_cls = xgb_cls.predict(Xc_test)

In [None]:
# Classification results: accuracy, confusion matrix and per-class report
# for each classifier, printed in the same order and format for both.
cls_results = [
    ("Random Forest Accuracy:", pred_rf_cls),
    ("\nXGBoost Accuracy:", pred_xgb_cls),
]
for heading, preds in cls_results:
    print(heading, accuracy_score(yc_test, preds))
    print(confusion_matrix(yc_test, preds))
    print(classification_report(yc_test, preds))

In [None]:
# ROC & Precision-Recall curves (classification).
# Column 1 of predict_proba is used as the score for the positive class (rain = 1).
rf_probs = rf_cls.predict_proba(Xc_test)[:,1]
xgb_probs = xgb_cls.predict_proba(Xc_test)[:,1]

for name, probs in [('RF',rf_probs),('XGB',xgb_probs)]:
    fpr, tpr, _ = roc_curve(yc_test, probs)
    pr, rc, _ = precision_recall_curve(yc_test, probs)
    roc_auc = auc(fpr, tpr)

    # ROC curve with the chance diagonal for reference
    plt.figure(figsize=(6,4))
    plt.plot(fpr, tpr, label=f'{name} AUC={roc_auc:.3f}')
    plt.plot([0,1],[0,1],'k--')
    plt.title(f'{name} ROC Curve'); plt.xlabel('FPR'); plt.ylabel('TPR')
    plt.legend(); plt.grid(True); plt.show()

    # Precision-recall curve (recall on x, precision on y)
    plt.figure(figsize=(6,4))
    plt.plot(rc, pr, label=name)
    plt.title(f'{name} Precision-Recall Curve'); plt.xlabel('Recall'); plt.ylabel('Precision')
    plt.legend(); plt.grid(True); plt.show()

In [None]:
# Side-by-side confusion-matrix heatmaps for the rain vs no-rain classifiers.
labels = ["No Rain (0) ","Rain (1)"]

cm_rf = confusion_matrix(yc_test, pred_rf_cls)
cm_xgb = confusion_matrix(yc_test, pred_xgb_cls)

fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# One panel per model: (matrix, colour map, panel title)
panels = [
    (cm_rf, "Blues", "Random Forest"),
    (cm_xgb, "Greens", "XGBoost"),
]
for panel_ax, (matrix, palette, panel_title) in zip(ax, panels):
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap=palette,
        xticklabels=labels,
        yticklabels=labels,
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Predicted")
    panel_ax.set_ylabel("Actual")

plt.show()

In [None]:
# ---- Recursive Forecast for Next 30 Days ----
# Each predicted temperature is fed back in as the lag / rolling input of the
# following step.
future_preds = []
future_temp_vals = []

# Seed the recursion with the last rows of the historical frame
last_temp = df['avgtemp_c'].iloc[-1]
last_humidity = df['avghumidity'].iloc[-1]  # not updated inside the loop
recent_temps = list(df['avgtemp_c'].tail(3))

for _ in range(30):
    # One-row frame carrying the three feature columns the regressor was fit on
    future_input = pd.DataFrame([{
        'temp_lag1': last_temp,
        'humidity_lag1': last_humidity,
        'temp_roll3': np.mean(recent_temps),
    }])

    # Next-day temperature from the random-forest regressor
    next_temp = rf.predict(future_input)[0]

    future_preds.append(next_temp)
    future_temp_vals.append(next_temp)

    # Slide the state forward: the prediction becomes the newest "observation"
    last_temp = next_temp
    recent_temps.append(next_temp)
    del recent_temps[:-3]  # keep only the three most recent values

# ---- Create dates for future forecast ----
last_date = df['date'].iloc[-1]
first_forecast_day = last_date + pd.Timedelta(days=1)
forecast_dates = pd.date_range(start=first_forecast_day, periods=30)

# ---- Plot ----
full_fig, full_ax = plt.subplots(figsize=(12, 5))
full_ax.plot(df['date'], df['avgtemp_c'], label='Historical Temp', linewidth=2)
full_ax.plot(forecast_dates, future_preds, marker='o', linewidth=3, label='Recursive RandomForecast')
full_ax.set_title("30-Day Recursive Temperature Forecast")
full_ax.set_ylabel("Temperature (°C)")
full_ax.set_xlabel("Date")
full_ax.legend()
full_ax.grid(True)
plt.show()

# ---- Zoomed ----
zoom_fig, zoom_ax = plt.subplots(figsize=(10, 4))
zoom_ax.plot(forecast_dates, future_preds, marker='o', linewidth=3, label="Recursive Random Forecast", color='orange')
zoom_ax.set_title("Next 30-Day Recursive Temperature Forecast (Zoomed)")
zoom_ax.set_ylabel("Temperature (°C)")
zoom_ax.set_xlabel("Date")
zoom_ax.legend()
zoom_ax.grid(True)
plt.show()

In [None]:
# ---- Save Forecast to CSV ----
# Two-column table: forecast date and the predicted temperature for that date.
forecast_df = pd.DataFrame({"Date": forecast_dates}).assign(Predicted_Temp=future_preds)

# Written without the index column
forecast_df.to_csv("30_day_temperature_forecast.csv", index=False)

print("✅ 30-day forecast saved to '30_day_temperature_forecast.csv'")

In [None]:
# Actual vs Predicted Temperature (Scatter)
plt.figure(figsize=(6,6))
plt.scatter(y_test, pred_rf, alpha=0.5, label='RF')
plt.scatter(y_test, pred_xgb, alpha=0.5, label='XGB')

# Perfect-prediction reference line (y = x), drawn from two endpoints.
# Plotting every test point in data order retraces the diagonal many times,
# so the overlapping dashes smear together; it also stops at the range of
# y_test. The endpoints here span both the actual and the predicted values.
diag_lo = min(y_test.min(), pred_rf.min(), pred_xgb.min())
diag_hi = max(y_test.max(), pred_rf.max(), pred_xgb.max())
plt.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'r--', label='Perfect prediction')

plt.xlabel('Actual Temp'); plt.ylabel('Predicted Temp')
plt.title('Actual vs Predicted Temperature')
plt.legend(); plt.grid(True); plt.show()