In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Load the daily weather table and build the binary "did it rain" target.
#
# Names produced here and used by the later sections of this cell:
#   dataset        - raw frame plus the derived 'Year' and 'Rain_Today' columns
#   dataset_clean  - the three feature columns + 'Rain_Today', incomplete rows removed
#   X, y           - feature matrix / label vector taken from dataset_clean
# ---------------------------------------------------------------------------
dataset = pd.read_csv("/content/chennai_rainfall_2019_2023_mm.csv")

print("="*50)
print("RAIN CLASSIFICATION WITH BASIC FEATURES")
print("="*50)

# Parse the date column so the calendar year can drive the time-based split
dataset['Date'] = pd.to_datetime(dataset['Date'])
dataset['Year'] = dataset['Date'].dt.year

# Binary target: 1 when any rainfall was recorded, otherwise 0.
# A missing rainfall reading compares as False and would be labelled 0 ("dry"),
# and dropna() cannot catch that afterwards because the label itself is never
# NaN. Remember which rows have a real reading so they can be filtered below.
dataset['Rain_Today'] = (dataset['Rainfall (mm)'] > 0).astype(int)
rainfall_known = dataset['Rainfall (mm)'].notna()

# Use only the three specified features (single list, reused after cleaning)
feature_cols = ['Temperature (°C)', 'Humidity (%)', 'Wind Speed (km/h)']
X = dataset[feature_cols]
y = dataset['Rain_Today']

# Keep rows whose rainfall is known AND whose features are all present
dataset_clean = X.join(y).loc[rainfall_known].dropna()
X = dataset_clean[feature_cols]
y = dataset_clean['Rain_Today']

# Time-based split (train on 2019-2022, test on 2023).
# X / y come from the NaN-filtered frame and may hold fewer rows than `dataset`,
# so the year is looked up on X's own index. This avoids depending on pandas
# implicitly re-aligning a mask built on the full frame (any warning about that
# would be hidden by the filterwarnings('ignore') call at the top of the cell).
year = dataset.loc[X.index, 'Year']
train_mask = year <= 2022
test_mask = year == 2023

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

# Split sizes and class balance; y is 0/1, so sum = rainy days, mean = rainy share
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Rainy days in train: {y_train.sum()} ({y_train.mean()*100:.1f}%)")
print(f"Rainy days in test: {y_test.sum()} ({y_test.mean()*100:.1f}%)")

# Fit a random forest on the training years. Hyper-parameters are gathered in
# one mapping so they can be read (and tuned) at a glance; the fixed
# random_state keeps repeated runs reproducible.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Hard 0/1 predictions for each split
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Probability of the second class reported by the model for each split.
# Column 1 is taken to be "rain" (label 1) — assumes both labels occur in y_train.
y_train_proba = model.predict_proba(X_train)[:, 1]
y_test_proba = model.predict_proba(X_test)[:, 1]

# Score the classifier and print the results under the regression-style labels
# this report uses. The labels are kept verbatim, so note what each one holds:
#   "MSE" -> Brier score (mean squared error of the predicted rain probability)
#   "MAE" -> mean absolute error of the predicted rain probability
#   "R²"  -> plain classification accuracy (NOT a coefficient of determination)

# Brier score: squared gap between predicted probability and the 0/1 label
train_mse = ((y_train_proba - y_train) ** 2).mean()
test_mse = ((y_test_proba - y_test) ** 2).mean()

# Mean absolute gap between predicted probability and the 0/1 label
train_mae = np.abs(y_train_proba - y_train).mean()
test_mae = np.abs(y_test_proba - y_test).mean()

# Accuracy of the hard predictions, reported under the "R²" label
train_r2 = accuracy_score(y_train, y_train_pred)
test_r2 = accuracy_score(y_test, y_test_pred)

# ROC AUC from the predicted probabilities
train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)

# Reference point for the accuracy figures: always predict whichever class is
# more common in the training labels and score that guess on the test labels.
# With a heavily skewed target, accuracy alone can look good while adding
# nothing over this constant guess.
majority_class = int(y_train.mean() >= 0.5)
baseline_test_acc = (y_test == majority_class).mean()

# Print results in the established format (these six lines are unchanged)
print("Training MSE:", train_mse)
print("Testing MSE:", test_mse)
print("Training MAE:", train_mae)
print("Testing MAE:", test_mae)
print("Training R²:", train_r2)
print("Testing R²:", test_r2)

# Additional context appended after the established lines
print("Training AUC:", train_auc)
print("Testing AUC:", test_auc)
print("Majority-class baseline accuracy (test):", baseline_test_acc)

RAIN CLASSIFICATION WITH BASIC FEATURES
Training set: 1461 samples
Test set: 365 samples
Rainy days in train: 1198 (82.0%)
Rainy days in test: 317 (86.8%)
Training MSE: 0.10507552488505054
Testing MSE: 0.12460337493250284
Training MAE: 0.2428485619585973
Testing MAE: 0.26538414819671285
Training R²: 0.836413415468857
Testing R²: 0.8602739726027397
