In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# ---------------- config ----------------
DATA_PATH = r"./../Data Given for Challenge/data/normalized_features.csv"
OUT_PATH  = "tree_weather_forecast.csv"

LAGS = [1, 2, 3, 6, 12, 24]     # hours back
ROLLS = [3, 6, 12, 24]          # rolling means
FORECAST_HORIZON = 24           # next 24h

# ---------------- load data ----------------
df = pd.read_csv(DATA_PATH)
df["timestamp"] = pd.to_datetime(df["Timestamp"])
df = df.drop(columns=["Timestamp"])

# ensure proper dtypes
df["Location"] = df["Location"].astype(int)

# ---------------- feature engineering ----------------
df = df.sort_values(["Location", "timestamp"])
feature_cols = [c for c in df.columns if c not in ["timestamp", "Location", "out"]]

# create lag features
for lag in LAGS:
    df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
        df.groupby("Location")[feature_cols + ["out"]].shift(lag)
    )

# create rolling means
for win in ROLLS:
    df[[f"{col}_roll{win}" for col in feature_cols + ["out"]]] = (
        df.groupby("Location")[feature_cols + ["out"]].rolling(win).mean().reset_index(level=0, drop=True)
    )

# drop rows with NaN from lags/rolls
df = df.dropna().reset_index(drop=True)

# ---------------- train / val split ----------------
cutoff = df["timestamp"].max() - pd.Timedelta(hours=FORECAST_HORIZON)
train = df[df["timestamp"] <= cutoff]
test  = df[df["timestamp"] > cutoff]

X_train = train.drop(columns=["timestamp", "Location", "out"])
y_train = train["out"]

X_test = test.drop(columns=["timestamp", "Location", "out"])
y_test = test["out"]

# ---------------- model ----------------
# LightGBM
lgbm = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
lgbm.fit(X_train, y_train)

# XGBoost
xgb = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method="hist"   # fast and works on CPU/GPU
)
xgb.fit(X_train, y_train)


# ---------------- evaluate ----------------
preds_lgbm = lgbm.predict(X_test)
preds_xgb  = xgb.predict(X_test)

rmse_lgbm = np.sqrt(mean_squared_error(y_test, preds_lgbm))
rmse_xgb  = np.sqrt(mean_squared_error(y_test, preds_xgb))

print(f"✅ LightGBM RMSE: {rmse_lgbm:.4f}")
print(f"✅ XGBoost RMSE: {rmse_xgb:.4f}")

# ---------------- save predictions ----------------
out_df = test[["timestamp", "Location", "out"]].copy()
out_df["pred_lgbm"] = preds_lgbm
out_df["pred_xgb"] = preds_xgb
out_df.to_csv(OUT_PATH, index=False)

print(f"📂 Saved predictions to {OUT_PATH}")
print(out_df.head())


  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in featu

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.403640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 225642
[LightGBM] [Info] Number of data points in the train set: 177323, number of used features: 1022
[LightGBM] [Info] Start training from score 45.345855
✅ LightGBM RMSE: 27.1308
✅ XGBoost RMSE: 25.8646


PermissionError: [Errno 13] Permission denied: 'tree_weather_forecast.csv'

In [2]:
out_df = test[["timestamp", "Location", "out"]].copy()
out_df["pred_lgbm"] = preds_lgbm
out_df["pred_xgb"] = preds_xgb
out_df.to_csv(OUT_PATH, index=False)

print(f"📂 Saved predictions to {OUT_PATH}")
print(out_df.head())

📂 Saved predictions to tree_weather_forecast.csv
                 timestamp  Location  out  pred_lgbm  pred_xgb
175210 2023-06-29 01:00:00         0  0.0  -6.973624 -5.824666
175211 2023-06-29 01:00:00         0  0.0   3.044404 -2.630587
175212 2023-06-29 01:00:00         0  1.0   1.592150  1.219545
175213 2023-06-29 01:00:00         0  0.0  -0.089828  0.002258
175214 2023-06-29 01:00:00         0  0.0   0.175897  0.745867
