In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV

# ---------------- config ----------------
DATA_PATH = r"./../Data Given for Challenge/data/normalized_features.csv"
OUT_PATH  = "tree_weather_forecast.csv"

LAGS = [1, 2, 3, 6, 12, 24]     # hours back
ROLLS = [3, 6, 12, 24]          # rolling means
FORECAST_HORIZON = 24           # next 24h

# ---------------- load data ----------------
df = pd.read_csv(DATA_PATH)
df["timestamp"] = pd.to_datetime(df["Timestamp"])
df = df.drop(columns=["Timestamp"])

# ensure proper dtypes
df["Location"] = df["Location"].astype(int)

# ---------------- feature engineering ----------------
df = df.sort_values(["Location", "timestamp"])
feature_cols = [c for c in df.columns if c not in ["timestamp", "Location", "out"]]

# create lag features
for lag in LAGS:
    df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
        df.groupby("Location")[feature_cols + ["out"]].shift(lag)
    )

# create rolling means
for win in ROLLS:
    df[[f"{col}_roll{win}" for col in feature_cols + ["out"]]] = (
        df.groupby("Location")[feature_cols + ["out"]].rolling(win).mean().reset_index(level=0, drop=True)
    )

# drop rows with NaN from lags/rolls
df = df.dropna().reset_index(drop=True)

# ---------------- train / val split ----------------
cutoff = df["timestamp"].max() - pd.Timedelta(hours=FORECAST_HORIZON)
train = df[df["timestamp"] <= cutoff]
test  = df[df["timestamp"] > cutoff]

X_train = train.drop(columns=["timestamp", "Location", "out"])
y_train = train["out"]

X_test = test.drop(columns=["timestamp", "Location", "out"])
y_test = test["out"]

# ---------------- define RMSE scorer ----------------
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), greater_is_better=False)

# ---------------- LightGBM hyperparameter tuning ----------------
param_dist_lgbm = {
    'num_leaves': [32, 64, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [1000, 1500, 2000],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_samples': [20, 30, 50]
}

lgbm = LGBMRegressor(random_state=42, device="gpu")

rs_lgbm = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist_lgbm,
    n_iter=30,
    scoring=rmse_scorer,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

rs_lgbm.fit(X_train, y_train)
print("Best LightGBM params:", rs_lgbm.best_params_)
print("Best LightGBM RMSE (CV):", -rs_lgbm.best_score_)

# ---------------- XGBoost hyperparameter tuning ----------------
param_dist_xgb = {
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [500, 1000, 1500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

xgb = XGBRegressor(random_state=42, tree_method="gpu_hist")

rs_xgb = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist_xgb,
    n_iter=30,
    scoring=rmse_scorer,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

rs_xgb.fit(X_train, y_train)
print("Best XGBoost params:", rs_xgb.best_params_)
print("Best XGBoost RMSE (CV):", -rs_xgb.best_score_)

# ---------------- final models with best parameters ----------------
best_lgbm = rs_lgbm.best_estimator_
best_xgb = rs_xgb.best_estimator_

preds_lgbm = best_lgbm.predict(X_test)
preds_xgb  = best_xgb.predict(X_test)

rmse_lgbm = np.sqrt(mean_squared_error(y_test, preds_lgbm))
rmse_xgb  = np.sqrt(mean_squared_error(y_test, preds_xgb))

print(f"✅ Optimized LightGBM RMSE: {rmse_lgbm:.4f}")
print(f"✅ Optimized XGBoost RMSE: {rmse_xgb:.4f}")

# ---------------- save predictions ----------------
out_df = test[["timestamp", "Location", "out"]].copy()
out_df["pred_lgbm"] = preds_lgbm
out_df["pred_xgb"] = preds_xgb
out_df.to_csv(OUT_PATH, index=False)

print(f"📂 Saved predictions to {OUT_PATH}")
print(out_df.head())


  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in feature_cols + ["out"]]] = (
  df[[f"{col}_lag{lag}" for col in featu

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 225071
[LightGBM] [Info] Number of data points in the train set: 175379, number of used features: 1015
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 770 dense feature groups (129.12 MB) transferred to GPU in 0.066308 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 43.895535
Best LightGBM params: {'subsample': 1.0, 'num_leaves': 64, 'n_estimators': 1000, 'min_child_samples': 30, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Best LightGBM RMSE (CV): 114.19083541211084
Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [2]:
out_df = test[["timestamp", "Location", "out"]].copy()
out_df["pred_lgbm"] = preds_lgbm
out_df["pred_xgb"] = preds_xgb
out_df.to_csv(OUT_PATH, index=False)

print(f"📂 Saved predictions to {OUT_PATH}")
print(out_df.head())

📂 Saved predictions to tree_weather_forecast.csv
                 timestamp  Location  out  pred_lgbm  pred_xgb
175210 2023-06-29 01:00:00         0  0.0  -6.973624 -5.824666
175211 2023-06-29 01:00:00         0  0.0   3.044404 -2.630587
175212 2023-06-29 01:00:00         0  1.0   1.592150  1.219545
175213 2023-06-29 01:00:00         0  0.0  -0.089828  0.002258
175214 2023-06-29 01:00:00         0  0.0   0.175897  0.745867
