In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, accuracy_score
import matplotlib.pyplot as plt

In [None]:
data = pd.read_csv('./dataset/delhi_weather.csv')
data

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),precipitation (mm),rain (mm),weather_code (wmo code),cloud_cover (%),surface_pressure (hPa),wind_speed_10m (km/h),wind_direction_10m (°),is_day ()
0,2000-01-01T00:00,9.4,84,0.0,0.0,0,0,990.4,9.3,106,0
1,2000-01-01T01:00,8.7,86,0.0,0.0,0,0,990.0,8.0,95,0
2,2000-01-01T02:00,8.3,86,0.0,0.0,0,0,989.7,7.6,82,0
3,2000-01-01T03:00,8.0,87,0.0,0.0,0,0,990.0,6.9,84,0
4,2000-01-01T04:00,7.9,87,0.0,0.0,0,0,989.7,6.0,73,0
...,...,...,...,...,...,...,...,...,...,...,...
221347,2025-04-01T19:00,29.4,15,0.0,0.0,0,0,982.7,6.2,260,0
221348,2025-04-01T20:00,28.0,17,0.0,0.0,0,0,983.2,5.2,239,0
221349,2025-04-01T21:00,26.9,18,0.0,0.0,0,0,983.6,5.0,257,0
221350,2025-04-01T22:00,26.2,19,0.0,0.0,0,0,983.5,5.2,292,0


In [None]:
df = data[['time','temperature_2m (°C)', 'relative_humidity_2m (%)', 'wind_direction_10m (°)',
    'cloud_cover (%)', 'wind_speed_10m (km/h)', 'surface_pressure (hPa)', 'precipitation (mm)', 'weather_code (wmo code)']].copy()
df.head()

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),wind_direction_10m (°),cloud_cover (%),wind_speed_10m (km/h),surface_pressure (hPa),precipitation (mm),weather_code (wmo code)
0,2000-01-01T00:00,9.4,84,106,0,9.3,990.4,0.0,0
1,2000-01-01T01:00,8.7,86,95,0,8.0,990.0,0.0,0
2,2000-01-01T02:00,8.3,86,82,0,7.6,989.7,0.0,0
3,2000-01-01T03:00,8.0,87,84,0,6.9,990.0,0.0,0
4,2000-01-01T04:00,7.9,87,73,0,6.0,989.7,0.0,0


In [None]:
# Global variables
regression_targets = [
    'temperature_2m (°C)', 'relative_humidity_2m (%)', 'wind_direction_10m (°)',
    'cloud_cover (%)', 'wind_speed_10m (km/h)', 'surface_pressure (hPa)'
]


In [None]:
def create_lag_features(df, columns, max_lag):
    lags = [df]
    for col in columns:
        for lag in range(1, max_lag + 1):
            lags.append(df[col].shift(lag).rename(f'{col}_lag{lag}'))
    return pd.concat(lags, axis=1)

def assign_weights(df, center_year=2025, decay=0.5):
    df = df.copy()
    df['year'] = df['time'].dt.year
    df['sample_weight'] = np.exp(-decay * (center_year - df['year']))
    return df

def extract_time_features(df):
    # df['time'] = pd.to_datetime(df['time'])
    df['hour'] = df['time'].dt.hour
    df['dayofyear'] = df['time'].dt.dayofyear
    df['month'] = df['time'].dt.month
    df['dayofweek'] = df['time'].dt.dayofweek
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['doy_sin'] = np.sin(2 * np.pi * df['dayofyear'] / 365.25)
    df['doy_cos'] = np.cos(2 * np.pi * df['dayofyear'] / 365.25)
    return df


In [None]:
def train_model(df, curr_year = 2025):
    df['time'] = pd.to_datetime(df['time'])
    df = df.sort_values('time').reset_index(drop=True)
    df['time_numeric'] = (df['time'] - df['time'].min()).dt.total_seconds() / 3600
    df = extract_time_features(df)
    df = create_lag_features(df, regression_targets + ['precipitation (mm)'], max_lag=1)
    df.dropna(inplace=True)
    df = assign_weights(df, center_year=2025, decay=0.5)

    train_df = df[df['time'].dt.year <= curr_year-5]
    test_df = df[df['time'].dt.year > curr_year-5]
    feature_cols = [col for col in df.columns if ('lag' in col or '_sin' in col or '_cos' in col)] #+ ['time_numeric', 'hour', 'dayofyear', 'month', 'dayofweek']

    # Train regression models
    regressors = {}
    for target in regression_targets:
        reg = XGBRegressor()
        reg.fit(train_df[feature_cols], train_df[target], sample_weight=train_df['sample_weight'])
        regressors[target] = reg
        preds = reg.predict(test_df[feature_cols])
        print(f"\nTraining regressor for: {target}")
        print(f"MSE: {mean_squared_error(test_df[target], preds):.4f}")
        print(f"MAE: {mean_absolute_error(test_df[target], preds):.4f}")
        print(f"R^2: {r2_score(test_df[target], preds):.4f}")

    # Weather classifier
    regression_targets
    X_cls = df[df['time'].dt.year <= curr_year-5][feature_cols + regression_targets]
    y_cls = df[df['time'].dt.year <= curr_year-5]['weather_code (wmo code)']
    X_test_cls = df[df['time'].dt.year > curr_year-5][feature_cols + regression_targets]
    y_test_cls = df[df['time'].dt.year > curr_year-5]['weather_code (wmo code)']
    clf = RandomForestClassifier(random_state=42, class_weight='balanced')
    clf.fit(X_cls, y_cls)

    y_pred_cls = clf.predict(X_test_cls)
    print(f"\nWeather Classification Accuracy: {accuracy_score(y_test_cls, y_pred_cls):.4f}")
    print("\nWeather Code Classification Report:")
    print(classification_report(y_test_cls, y_pred_cls))

    # Precipitation regression
    precip_reg = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
    precip_reg.fit(train_df[feature_cols], train_df['precipitation (mm)'])
    precip1_test = test_df
    precip1_test['precip_pred'] = precip_reg.predict(precip1_test[feature_cols])
    print(f"\nRainy Hour Precipitation - MSE: {mean_squared_error(precip1_test['precipitation (mm)'], precip1_test['precip_pred']):.4f}")
    print(f"MAE: {mean_absolute_error(precip1_test['precipitation (mm)'], precip1_test['precip_pred']):.4f}")
    print(f"R^2: {r2_score(precip1_test['precipitation (mm)'], precip1_test['precip_pred']):.4f}")

    return regressors, clf, precip_reg


In [None]:
reg , clf, precip_reg = train_model(df.copy())


Training regressor for: temperature_2m (°C)
MSE: 0.4937
MAE: 0.4631
R^2: 0.9927

Training regressor for: relative_humidity_2m (%)
MSE: 11.8990
MAE: 2.2834
R^2: 0.9801

Training regressor for: wind_direction_10m (°)
MSE: 3561.2148
MAE: 30.3436
R^2: 0.6801

Training regressor for: cloud_cover (%)
MSE: 345.8774
MAE: 10.5823
R^2: 0.7837

Training regressor for: wind_speed_10m (km/h)
MSE: 4.9487
MAE: 1.5079
R^2: 0.7817

Training regressor for: surface_pressure (hPa)
MSE: 0.1082
MAE: 0.2357
R^2: 0.9974

Weather Classification Accuracy: 0.9235

Weather Code Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22237
           1       0.91      0.98      0.94      3523
           2       0.89      0.95      0.92      1834
           3       0.87      0.95      0.91      6201
          51       0.52      0.40      0.45      2190
          53       0.15      0.05      0.07       423
          55       0.14      0.01      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Rainy Hour Precipitation - MSE: 0.4640
MAE: 0.1034
R^2: 0.2363


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  precip1_test['precip_pred'] = precip_reg.predict(precip1_test[feature_cols])


In [None]:
def predict(date_time, df=df.copy()):

    date_time = pd.to_datetime(date_time)
    latest_time = pd.to_datetime(df['time'].max())

    # BASE CASE: If date_time already exists in df, skip
    if date_time > latest_time + pd.Timedelta(hours=48):
        print('error: date_time exceeds 2 days limit from latest time')
        return None, df
    if date_time in df['time'].values:
        return df[df['time'] == date_time], df

    # STEP 1: Predict all previous hours recursively first
    prev_time = date_time - pd.Timedelta(hours=1)
    if prev_time > latest_time:
        _, df = predict(prev_time, df)

    # STEP 2: Now predict for current time (date_time)
    new_df = pd.DataFrame({'time': [date_time]})
    df['time'] = pd.to_datetime(df['time'])
    new_df['time'] = pd.to_datetime(new_df['time'])
    new_df['time_numeric'] = (new_df['time'].iloc[0] - df['time'].min()).total_seconds() / 3600

    new_df = extract_time_features(new_df)

    # Add lag features
    columns = regression_targets + ['precipitation (mm)']
    for col in columns:
        for lag in range(1, 1+1):
            lag_time = date_time - pd.Timedelta(hours=lag)
            val = df.loc[df['time'] == lag_time, col]
            new_df[f'{col}_lag{lag}'] = val.values[0] if not val.empty else np.nan

    # Select features
    feature_cols = [col for col in new_df.columns if ('lag' in col or '_sin' in col or '_cos' in col)] #+ ['time_numeric', 'hour', 'dayofyear', 'month', 'dayofweek']

    if new_df[feature_cols].isnull().any().any():
        print(f"[WARNING] Missing lag values at {date_time}, prediction may be less accurate.")

    # Predict core variables
    for target in regression_targets:
        new_df[f'{target}'] = reg[target].predict(new_df[feature_cols])

    # Weather code
    new_df['weather_code (wmo code)'] = clf.predict(new_df[feature_cols + regression_targets])

    # Precipitation
    new_df['precipitation (mm)'] = precip_reg.predict(new_df[feature_cols])

    # Merge into df
    df = pd.concat([df, new_df], ignore_index=True).drop_duplicates('time')
    df = df.sort_values('time').reset_index(drop=True)

    return new_df, df


In [52]:
new_df, updated_df = predict(pd.to_datetime('2025-04-03 00:00:00'), df.copy())

In [53]:
new_df[regression_targets + ['precipitation (mm)'] + ['weather_code (wmo code)']]

Unnamed: 0,temperature_2m (°C),relative_humidity_2m (%),wind_direction_10m (°),cloud_cover (%),wind_speed_10m (km/h),surface_pressure (hPa),precipitation (mm),weather_code (wmo code)
0,22.468004,34.55909,270.752991,16.077505,9.610062,982.234802,0.003503,0


In [54]:
pred_df = updated_df[updated_df['time'] >= pd.to_datetime('2025-04-01 23:00:00')]
pred_df[['time'] + regression_targets + ['precipitation (mm)'] + ['weather_code (wmo code)']]

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),wind_direction_10m (°),cloud_cover (%),wind_speed_10m (km/h),surface_pressure (hPa),precipitation (mm),weather_code (wmo code)
221352,2025-04-02 00:00:00,23.195787,28.97159,292.5401,-1.662227,7.45253,983.177551,-0.002841,0
221353,2025-04-02 01:00:00,22.09223,32.494118,282.840454,0.522884,9.014236,982.777771,-0.002841,0
221354,2025-04-02 02:00:00,21.337292,35.005455,277.692352,-0.111521,9.473309,982.25708,-0.002841,0
221355,2025-04-02 03:00:00,21.00396,35.242645,274.945679,1.341067,9.615222,982.080078,-0.002841,0
221356,2025-04-02 04:00:00,20.784292,34.580212,269.591217,4.990577,9.700864,982.063293,-0.0025,0
221357,2025-04-02 05:00:00,20.382717,40.81736,262.86731,13.043246,9.883069,982.380737,-0.002829,0
221358,2025-04-02 06:00:00,20.317953,40.705769,266.450623,19.942247,10.389713,982.997375,-0.00011,1
221359,2025-04-02 07:00:00,22.876438,33.812969,270.370911,23.662216,11.045959,983.879211,0.003734,1
221360,2025-04-02 08:00:00,26.088509,27.776693,273.249023,27.746376,11.784018,984.784241,0.004358,1
221361,2025-04-02 09:00:00,28.788862,23.091658,278.478699,31.477345,12.628861,985.252197,0.005487,1


In [None]:
pred_df.to_csv('./predictions/prediction_df.csv', index=False)

OSError: Cannot save file into a non-existent directory: 'predictions'