In [1]:
# Cell 1: imports and paths
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Root logger: timestamped INFO-level messages for every cell below.
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO,
)

# Project root: one level up when the kernel was started inside notebooks/,
# otherwise the current working directory itself.
_cwd = Path.cwd()
if _cwd.name == 'notebooks':
    BASE = _cwd.parent
else:
    BASE = _cwd

# Input/output locations derived from the project root.
PROCESSED_DIR = BASE.joinpath("data", "processed")
MODELS_DIR = BASE.joinpath("models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)  # make sure the model folder exists before anything is saved
in_csv = PROCESSED_DIR.joinpath("delhi_weather_processed.csv")
model_out = MODELS_DIR.joinpath("rf_model.pkl")


In [2]:
# Cell 2: load processed data
# Fail early with the resolved path so a wrong working directory is obvious.
if not in_csv.exists():
    raise FileNotFoundError(f"Processed CSV not found: {in_csv}")

# Only ask pandas to parse 'datetime' when the header actually contains it.
# read_csv raises on a missing parse_dates column, which would prevent the
# later cells from applying their own missing-datetime handling.
header_cols = pd.read_csv(in_csv, nrows=0).columns
date_cols = ['datetime'] if 'datetime' in header_cols else []
df = pd.read_csv(in_csv, parse_dates=date_cols or False)
logging.info("Loaded processed rows=%d", len(df))

# An empty table makes every later aggregation empty too; say so up front.
if df.empty:
    logging.warning("%s contains no data rows; downstream results will be empty.", in_csv)


2026-01-22 11:38:03,383 INFO:Loaded processed rows=0


In [5]:
# Aggregate the loaded readings to daily means, grouped by date and by
# whichever of the 'lat' / 'lon' columns are present in df.

# Ensure datetime exists
if 'datetime' not in df.columns:
    raise KeyError("No datetime column found. Cannot perform daily aggregation.")

df['date'] = pd.to_datetime(df['datetime']).dt.date

# Group keys: date plus every coordinate column that exists.
coord_cols = ('lat', 'lon')
group_cols = ['date'] + [c for c in coord_cols if c in df.columns]
missing_coords = [c for c in coord_cols if c not in df.columns]
if missing_coords:
    # Report the keys actually used; the other coordinate may still be present.
    logging.warning("Column(s) %s not found. Aggregating by %s.", missing_coords, group_cols)

# Only aggregate columns that exist
agg_cols = {c: (c, 'mean') for c in ['temp', 'RH', 'heat_index'] if c in df.columns}
if not agg_cols:
    # .agg() with no aggregations raises an opaque TypeError; name the real problem.
    raise KeyError("None of ['temp', 'RH', 'heat_index'] found in df. Nothing to aggregate.")
daily = df.groupby(group_cols).agg(**agg_cols).reset_index()
logging.info("Daily aggregated rows=%d", len(daily))


2026-01-22 11:40:04,285 INFO:Daily aggregated rows=0




In [6]:
# Cell 4: baseline rule-based risk_label (0 low, 1 moderate, 2 high)
# Heat-index cut-offs, in the same units as daily['heat_index'] (tune here):
#   heat_index <  MODERATE_HI_THRESHOLD                      -> 0 (low)
#   MODERATE_HI_THRESHOLD <= heat_index < HIGH_HI_THRESHOLD  -> 1 (moderate)
#   heat_index >= HIGH_HI_THRESHOLD                          -> 2 (high)
MODERATE_HI_THRESHOLD = 38
HIGH_HI_THRESHOLD = 45

if 'heat_index' not in daily.columns:
    raise KeyError("'heat_index' column not found in daily. Cannot derive risk_label.")

# A missing heat index matches neither comparison below and therefore keeps
# the default label 0; report the count so those rows are not mistaken for
# genuinely low-risk days.
n_missing_hi = int(daily['heat_index'].isna().sum())
if n_missing_hi:
    logging.warning("%d row(s) have no heat_index and keep risk_label=0.", n_missing_hi)

daily['risk_label'] = 0
is_high = daily['heat_index'] >= HIGH_HI_THRESHOLD
is_moderate = (daily['heat_index'] >= MODERATE_HI_THRESHOLD) & ~is_high
daily.loc[is_moderate, 'risk_label'] = 1
daily.loc[is_high, 'risk_label'] = 2
logging.info("Risk label distribution:\n%s", daily['risk_label'].value_counts().to_string())

# Save daily CSV
if daily.empty:
    logging.warning("daily is empty; the saved CSV will contain only a header row.")
daily.to_csv(PROCESSED_DIR / "delhi_daily_grid.csv", index=False)


2026-01-22 11:40:09,053 INFO:Risk label distribution:
Series([], )


In [9]:
# --- Ensure lat/lon columns exist ---
# Rename the first recognised alias to the canonical name, but only when the
# canonical column is absent, so a duplicate 'lat' / 'lon' column is never created.
coord_aliases = (
    ('lat', ['latitude', 'Latitude', 'y']),
    ('lon', ['longitude', 'Longitude', 'x']),
)
for canonical, aliases in coord_aliases:
    if canonical not in df.columns:
        alias = next((a for a in aliases if a in df.columns), None)
        if alias is not None:
            df = df.rename(columns={alias: canonical})
    if canonical not in df.columns:
        print(f"Warning: '{canonical}' not found. Aggregation will skip {canonical} dimension.")

# --- Ensure datetime exists ---
if 'datetime' not in df.columns:
    if 'time' in df.columns:
        df['datetime'] = pd.to_datetime(df['time'])
    elif isinstance(df.index, pd.DatetimeIndex):
        # The index really carries timestamps, so it is a valid fallback.
        print("Warning: No datetime column. Using index as date fallback.")
        df['datetime'] = df.index
    else:
        # Converting row numbers with to_datetime would read them as nanoseconds
        # after 1970-01-01 and collapse every row into one bogus day; stop instead.
        raise KeyError("No datetime column found. Cannot perform daily aggregation.")

# --- Create date column ---
parsed_dt = pd.to_datetime(df['datetime'], errors='coerce')
n_bad_dt = int(parsed_dt.isna().sum())
if n_bad_dt:
    # groupby drops rows whose key is missing, so make the loss visible.
    print(f"Warning: {n_bad_dt} row(s) have a missing or unparseable datetime and are excluded from the daily groups.")
df['date'] = parsed_dt.dt.date

# --- Daily aggregation ---
group_cols = ['date'] + [c for c in ('lat', 'lon') if c in df.columns]

agg_cols = {c: (c, 'mean') for c in ['temp', 'RH', 'heat_index'] if c in df.columns}
if not agg_cols:
    # .agg() with no aggregations raises an opaque TypeError; name the real problem.
    raise KeyError("None of ['temp', 'RH', 'heat_index'] found in df. Nothing to aggregate.")
daily = df.groupby(group_cols).agg(**agg_cols).reset_index()

print("Rows after daily aggregation:", len(daily))
print(daily.head())


Rows after daily aggregation: 0
Empty DataFrame
Columns: [date, lon, temp, RH, heat_index]
Index: []


In [11]:
import matplotlib.pyplot as plt
import logging
from pathlib import Path

# Plot and save the feature importances of the trained model `clf`.
# Expects `clf` (exposing feature_importances_) and `features` (the matching
# list of feature names) to have been created by an earlier training cell.

# Resolve the project root the same way the setup cell does, so the plot is
# written under <project>/outputs even when the kernel runs inside notebooks/.
BASE = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()

# Check if clf exists
if 'clf' not in globals():
    raise NameError("RandomForest model 'clf' not found. Train the model first before plotting feature importances.")

# Check if features exist
if 'features' not in globals() or len(features) == 0:
    raise ValueError("Feature list 'features' not defined. Cannot plot importances.")

importances = clf.feature_importances_
if len(importances) != len(features):
    # A mismatch means `features` is stale relative to the fitted model.
    raise ValueError(
        f"'features' has {len(features)} names but the model reports "
        f"{len(importances)} importances."
    )

# Plot feature importances
fig, ax = plt.subplots(figsize=(4, 3))
ax.bar(list(features), importances)
ax.set_title("Feature importances")
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
fig.tight_layout()

# Ensure output directory exists
plot_path = BASE / "outputs" / "plots" / "rf_feature_importance.png"
plot_path.parent.mkdir(parents=True, exist_ok=True)

fig.savefig(plot_path)
logging.info("Saved feature importance plot to %s", plot_path)
plt.show()


NameError: RandomForest model 'clf' not found. Train the model first before plotting feature importances.