In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm

# ⚙️ Configuration: lag offsets handed to create_lagged_features and the
# exogenous feature columns used alongside the lags.
LAGS = [1, 24, 48]
external_features = ['spv', 'temp', 'holiday', 'hour', 'day', 'month']

# 📁 Historical ES metering data. Whatever the first CSV column is called, it
# is renamed to DATETIME, parsed as timestamps and promoted to the index.
df_es = pd.read_csv("/kaggle/input/spaindata/historical_metering_data_ES.csv")
df_es = df_es.rename(columns={df_es.columns[0]: "DATETIME"})
df_es = df_es.assign(DATETIME=pd.to_datetime(df_es["DATETIME"])).set_index("DATETIME")

# 📁 Weather file, indexed by its DATETIME column
df_weather = pd.read_excel(
    "/kaggle/input/tempppp/spv_ec00_forecasts_es_it.xlsx",
    parse_dates=["DATETIME"],
).set_index("DATETIME")

# 🔁 Left-merge spv/temp onto the metering rows by index, then replace every
# remaining missing value (metering or weather) with 0.
df_es = df_es.merge(
    df_weather[["spv", "temp"]],
    how="left",
    left_index=True,
    right_index=True,
).fillna(0)

# 🇪🇸 Spain holidays, converted to an array of datetime.date objects so they
# can be matched against the calendar date of each timestamp.
holiday_dates = pd.to_datetime([
    "2022-01-01", "2022-01-06", "2022-04-15", "2022-08-15", "2022-10-12", "2022-11-01", "2022-12-06", "2022-12-08",
    "2023-01-06", "2023-04-06", "2023-04-07", "2023-05-01", "2023-08-15", "2023-10-12", "2023-11-01", "2023-12-06", "2023-12-08", "2023-12-25",
    "2024-01-01", "2024-01-06", "2024-03-28", "2024-03-29", "2024-05-01", "2024-08-15",
]).date

# 📅 Future frame: the weather rows (spv, temp) for August 2024
future_df_es = df_weather.loc["2024-08-01":"2024-08-31 23:00:00", ["spv", "temp"]].copy()

# 🕓 Calendar and holiday columns, derived the same way for the historical
# frame and the future frame (hour, day, month, then the 0/1 holiday flag).
for frame in (df_es, future_df_es):
    stamps = frame.index
    for part in ("hour", "day", "month"):
        frame[part] = getattr(stamps, part)
    calendar_days = pd.Series(stamps.date, index=stamps)
    frame["holiday"] = calendar_days.isin(holiday_dates).astype(int)

# 🔍 Customer columns: every df_es column carrying the ES metering prefix
customer_prefix = "VALUEMWHMETERINGDATA_customerES_"
customer_cols_es = [name for name in df_es.columns if name.startswith(customer_prefix)]

# 🔮 Forecast results for Spain, keyed by customer column name
forecast_august_es = {}

def create_lagged_features(series, lags):
    """Build one shifted copy of *series* per requested lag.

    Returns a DataFrame with a column named ``lag_<l>`` for each ``l`` in
    *lags*, holding ``series.shift(l)``. Rows for which the shifted value
    does not exist are left missing.
    """
    shifted = {}
    for offset in lags:
        shifted[f'lag_{offset}'] = series.shift(offset)
    return pd.concat(shifted, axis=1)

# Per-customer forecast: fit one XGBoost regressor on lagged consumption plus
# the external features, then forecast the rows of future_df_es recursively.
# A customer whose fit/forecast raises is reported and skipped (best effort).
for cust in tqdm(customer_cols_es, desc="🔮 Predicting August 2024 (Spain)"):
    try:
        data = df_es[[cust] + external_features].copy()
        data[cust] = data[cust].fillna(0)

        # Add lag features to training data; rows without a full set of lags
        # are dropped.
        lagged = create_lagged_features(data[cust], LAGS)
        train_df = pd.concat([lagged, data[external_features], data[cust]], axis=1).dropna()

        X_train = train_df.drop(columns=[cust])
        y_train = train_df[cust]

        model = xgb.XGBRegressor(n_estimators=300, learning_rate=0.05, n_jobs=-1)
        model.fit(X_train, y_train)

        # Future design matrix: real external features plus one column per
        # entry of LAGS (derived from LAGS rather than hard-coded), in the
        # exact column order used for training. Lags start out as NaN and are
        # filled in step by step below.
        X_future = future_df_es[external_features].copy()
        for lag in LAGS:
            X_future[f"lag_{lag}"] = np.nan
        X_future = X_future[X_train.columns].copy()
        lag_positions = {lag: X_future.columns.get_loc(f"lag_{lag}") for lag in LAGS}

        n_future = len(X_future)
        if n_future:
            # Only observations strictly before the forecast window seed the lags.
            history = data.loc[data.index < X_future.index[0], cust].to_numpy(dtype=float)
        else:
            history = np.empty(0, dtype=float)
        n_history = len(history)

        # timeline = observed history followed by the forecasts made so far.
        # Lags are positional, like Series.shift in training.
        # NOTE(review): assumes the future rows continue the history at the
        # same regular frequency with no gap — confirm against the data.
        timeline = np.concatenate([history, np.full(n_future, np.nan)])
        preds = np.empty(n_future, dtype=float)

        # Recursive one-step-ahead forecast: each prediction is fed back so
        # that later steps see lag values consistent with what the model was
        # trained on, instead of constant zeros. This costs one predict call
        # per future row because lag values depend on earlier predictions.
        for step in range(n_future):
            pos = n_history + step
            for lag, col in lag_positions.items():
                src = pos - lag
                if src >= 0:
                    # May still be NaN (e.g. non-positive lag); XGBoost treats
                    # NaN as a missing value.
                    X_future.iat[step, col] = timeline[src]
            preds[step] = model.predict(X_future.iloc[[step]])[0]
            timeline[pos] = preds[step]

        forecast_august_es[cust] = preds

    except Exception as e:
        print(f"⚠️ Skipping {cust}: {e}")
