In [1]:
# Cell 1: imports and paths
import pandas as pd
import numpy as np
from pathlib import Path
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')

BASE = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
PROCESSED_DIR = BASE / "data" / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
in_csv = PROCESSED_DIR / "delhi_weather_combined.csv"
out_csv = PROCESSED_DIR / "delhi_weather_processed.csv"


In [7]:
import pandas as pd
import numpy as np

def load_csv_to_df(file_path: str) -> pd.DataFrame:
    """
    Load a CSV file into a pandas DataFrame.
    Automatically detects a datetime column if present.
    Renames latitude and longitude columns if needed.
    
    Parameters:
        file_path (str): Path to the CSV file.
    
    Returns:
        pd.DataFrame: Processed DataFrame
    """
    # Load CSV
    df = pd.read_csv(file_path)
    print(f"Loaded {len(df)} rows and columns: {df.columns.tolist()}")

    # Ensure latitude and longitude column names
    if 'lat' not in df.columns:
        if 'latitude' in df.columns:
            df = df.rename(columns={'latitude': 'lat'})
    if 'lon' not in df.columns:
        if 'longitude' in df.columns:
            df = df.rename(columns={'longitude': 'lon'})

    # Detect a datetime column automatically
    datetime_col = None
    for col in df.columns:
        # Try to convert each column to datetime, if succeeds assume it is a datetime
        try:
            df[col] = pd.to_datetime(df[col])
            datetime_col = col
            print(f"Detected datetime column: {col}")
            break
        except (ValueError, TypeError):
            continue

    # If a datetime column was detected, rename to 'datetime'
    if datetime_col is not None:
        df = df.rename(columns={datetime_col: 'datetime'})
    else:
        print("No datetime-like column found. Continuing without datetime.")

    return df

# Example usage
if __name__ == "__main__":
    file_path = "/Users/panavdawar/Documents/resilienceAI /delhi_weather_combined.csv"  # replace with your CSV path
    df = load_csv_to_df(file_path)
    print(df.head())


Loaded 473256 rows and columns: ['lat', 'lon', 'temp', 'dewpoint', 'year', 'month']
Detected datetime column: lat
                       datetime    lon       temp   dewpoint  year  month
0 1970-01-01 00:00:00.000000028  76.80  279.80692  279.00247  2018      1
1 1970-01-01 00:00:00.000000028  77.05  280.09027  279.36990  2018      1
2 1970-01-01 00:00:00.000000028  77.30  280.31174  279.76862  2018      1
3 1970-01-01 00:00:00.000000028  76.80  279.84137  278.79138  2018      1
4 1970-01-01 00:00:00.000000028  77.05  280.15378  279.24002  2018      1


In [8]:
# Cell 3: add time features
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour


In [None]:

#  Vectorized RH + Heat Index 
def compute_rh(temp_c, dewpoint_c):
    # temp_c, dewpoint_c are Pandas Series or numpy arrays in Â°C
    # Magnus-Tetens formula (approx)
    es = 6.11 * np.exp((17.27 * temp_c) / (237.3 + temp_c))
    e = 6.11 * np.exp((17.27 * dewpoint_c) / (237.3 + dewpoint_c))
    rh = 100.0 * (e / es)
    return np.clip(rh, 0, 100)

df['RH'] = compute_rh(df['temp'], df['dewpoint'])


In [None]:

# converted from F-formula
def heat_index_celsius(T, RH):


    T_F = T * 9/5 + 32
    HI_F = (-42.379
            + 2.04901523 * T_F
            + 10.14333127 * RH
            - 0.22475541 * T_F * RH
            - 0.00683783 * T_F * T_F
            - 0.05481717 * RH * RH
            + 0.00122874 * T_F * T_F * RH
            + 0.00085282 * T_F * RH * RH
            - 0.00000199 * T_F * T_F * RH * RH)
    HI_C = (HI_F - 32) * 5/9
    return HI_C

df['heat_index'] = heat_index_celsius(df['temp'], df['RH'])


In [None]:

for col in ['latitude', 'Latitude']:
    if col in df.columns:
        df = df.rename(columns={col: 'lat'})
        break

for col in ['longitude', 'Longitude', 'long']:
    if col in df.columns:
        df = df.rename(columns={col: 'lon'})
        break

if 'heat_index' not in df.columns:
    print("Warning: 'heat_index' column missing. Consider computing it.")
if 'RH' not in df.columns:
    print("Warning: 'RH' column missing. Consider computing it.")

core_cols = [c for c in ['temp','dewpoint','lat','lon','heat_index'] if c in df.columns]
df = df.dropna(subset=core_cols)

if 'temp' in df.columns:
    df = df[(df['temp'] > -50) & (df['temp'] < 60)]
if 'RH' in df.columns:
    df = df[(df['RH'] >= 0) & (df['RH'] <= 100)]

logging.info("After cleaning rows=%d", len(df))
df.to_csv(out_csv, index=False)
logging.info("Saved processed CSV to %s", out_csv)


2026-01-22 11:37:40,621 INFO:After cleaning rows=0
2026-01-22 11:37:40,623 INFO:Saved processed CSV to /Users/panavdawar/Documents/resilienceAI /data/processed/delhi_weather_processed.csv
