In [1]:
import numpy as np
import dask
import dask.array as da
import dask.dataframe as dd
from dask.distributed import Client
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

import ray
from ray import tune
from ray.tune.search.optuna import OptunaSearch
from ray.air import session
from ray.air.integrations.wandb import WandbLoggerCallback
import shap

import wandb
import os

  from .autonotebook import tqdm as notebook_tqdm
2025-01-31 20:50:18,680	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-01-31 20:50:19,516	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# Log in to Weights & Biases up front so the logger callback configured later
# in the notebook has credentials available.
wandb.login()
# Only start a local Ray runtime when this kernel does not already have one,
# which keeps the cell safe to re-run.
if not ray.is_initialized():
    ray.init(ignore_reinit_error=True)

# NOTE(review): a distributed Client is created and then the scheduler is set
# to "threads" right after. Presumably the threaded scheduler takes precedence
# and the Client's workers stay idle -- confirm which one is intended and drop
# the other (check later cells for uses of `client` first).
client = Client()
dask.config.set(scheduler="threads")
# Dask dataframe options applied globally for the rest of the notebook.
dask.config.set({"dataframe.convert-string": True,
                 "dataframe.shuffle.method": "tasks"})

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/kuba/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
2025-01-31 20:50:34,713	INFO worker.py:1841 -- Started a local Ray instance.


<dask.config.set at 0x7f5003fd07f0>

In [3]:
# Glob pattern matching every CSV partition of the dataset.
data_path = "csv_partitions/*.csv"

# Read the partitions with dask and keep a reproducible 2% row sample
# (fixed random_state) so that every tuning trial sees the same subset.
df = dd.read_csv(data_path, assume_missing=True).sample(frac=0.02, random_state=42)

# Distinct `date_id` values, materialised and sorted ascending; the fold
# builder slices this array positionally.
unique_dates = np.sort(df["date_id"].unique().compute())

def make_time_series_folds(unique_dates, n_folds=3, min_train_size=1):
    """
    Split an ordered sequence of dates into expanding-window CV folds.

    The sequence is cut into ``n_folds + 1`` equally sized chunks (integer
    division). Fold ``i`` trains on the first ``i`` chunks and validates on
    the chunk that follows; the last fold's validation window also absorbs
    any remainder left over by the integer division.

    Parameters
    ----------
    unique_dates : sliceable sequence
        Dates in the order they should be split (caller is expected to sort).
    n_folds : int
        Number of (train, validation) pairs to produce.
    min_train_size : int
        Minimum number of dates required in every training window.

    Returns
    -------
    list of (train_dates, val_dates) tuples, each a slice of ``unique_dates``.

    Raises
    ------
    ValueError
        If any training window holds fewer than ``min_train_size`` dates.
    """
    total = len(unique_dates)
    chunk = total // (n_folds + 1)

    splits = []
    for i in range(1, n_folds + 1):
        boundary = i * chunk
        # The final fold validates on everything that is left.
        stop = total if i >= n_folds else boundary + chunk

        train_part = unique_dates[:boundary]
        val_part = unique_dates[boundary:stop]

        if len(train_part) < min_train_size:
            raise ValueError(f"Fold {i} has insufficient train data!")

        splits.append((train_part, val_part))
    return splits

# Two expanding-window folds over the sorted dates. `folds` is a module-level
# name that the Ray Tune trainable defined later in the notebook iterates over.
folds = make_time_series_folds(unique_dates, n_folds=2)



In [4]:
def train_lgbm_model(X_train, y_train, w_train,
                     X_val,   y_val,   w_val,
                     model_params):
    """
    Fit one LightGBM regressor on CPU with early stopping on the validation set.

    Parameters
    ----------
    X_train, y_train, w_train : features, targets and sample weights used for
        fitting.
    X_val, y_val, w_val : features, targets and sample weights used as the
        only evaluation set (RMSE).
    model_params : dict
        Must contain "learning_rate", "num_leaves", "feature_fraction" and
        "extra_trees"; they are forwarded to LightGBM unchanged.

    Returns
    -------
    The booster returned by ``lgb.train`` (up to 3000 rounds, early stopping
    after 100 rounds, evaluation logged every 100 rounds).
    """
    tuned_keys = ("learning_rate", "num_leaves", "feature_fraction", "extra_trees")

    booster_params = {"objective": "regression", "metric": "rmse"}
    booster_params.update({key: model_params[key] for key in tuned_keys})
    booster_params["device_type"] = "cpu"

    train_set = lgb.Dataset(X_train, label=y_train, weight=w_train)
    valid_set = lgb.Dataset(X_val, label=y_val, weight=w_val)

    return lgb.train(
        params=booster_params,
        train_set=train_set,
        num_boost_round=3000,
        valid_sets=[valid_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )

def train_xgb_model(X_train, y_train, w_train,
                    X_val,   y_val,   w_val,
                    model_params):
    """
    Fit one XGBoost regressor (hist tree method, CUDA device) with early
    stopping on the validation set.

    Parameters
    ----------
    X_train, y_train, w_train : features, targets and sample weights used for
        fitting.
    X_val, y_val, w_val : features, targets and sample weights of the single
        evaluation set, registered under the name "valid" (RMSE).
    model_params : dict
        Must contain "learning_rate" (passed as ``eta``), "max_depth",
        "subsample" and "colsample_bytree".

    Returns
    -------
    The booster returned by ``xgb.train`` (up to 3000 rounds, early stopping
    after 100 rounds, evaluation printed every 100 rounds).
    """
    booster_params = dict(
        objective="reg:squarederror",
        eval_metric="rmse",
        eta=model_params["learning_rate"],
        max_depth=model_params["max_depth"],
        subsample=model_params["subsample"],
        colsample_bytree=model_params["colsample_bytree"],
        tree_method="hist",
        device="cuda",  # or "cpu" if no GPU
    )

    train_matrix = xgb.DMatrix(X_train, label=y_train, weight=w_train)
    valid_matrix = xgb.DMatrix(X_val, label=y_val, weight=w_val)

    return xgb.train(
        params=booster_params,
        dtrain=train_matrix,
        num_boost_round=3000,
        evals=[(valid_matrix, "valid")],
        early_stopping_rounds=100,
        verbose_eval=100,
    )

In [5]:
def weighted_ensemble(base_train_preds_list, base_val_preds_list, y_train, w_train, y_val, w_val, weights):
    """
    Score a direct weighted average of base-model predictions (no meta-model).

    Parameters
    ----------
    base_train_preds_list, y_train, w_train :
        Accepted for interface compatibility only. The returned score depends
        solely on the validation inputs, so these are not read (the previous
        implementation stacked and averaged the training predictions and then
        discarded the result).
    base_val_preds_list : sequence of 1-D arrays
        One array of validation predictions per base model.
    y_val, w_val : validation targets and sample weights.
    weights : array-like of shape (n_base_models,), expected >= 0
        Blend weight per base model, in the same order as
        ``base_val_preds_list``. If some model is "unused", pass weight=0
        for it.

    Returns
    -------
    float
        Sample-weighted R2 of the blended validation prediction.

    Raises
    ------
    ValueError
        If ``weights`` does not contain exactly one entry per base model.
    """
    weights = np.asarray(weights, dtype=float)
    base_val_preds = np.column_stack(base_val_preds_list)

    n_models = base_val_preds.shape[1]
    if weights.shape != (n_models,):
        raise ValueError(
            f"weights has shape {weights.shape}, expected ({n_models},) "
            "to match the number of base models"
        )

    # np.average cannot normalise by a zero weight sum; fall back to a plain
    # (equal-weight) mean in that case.
    if np.sum(weights) < 1e-12:
        weights = np.ones_like(weights)

    val_pred = np.average(base_val_preds, axis=1, weights=weights)
    return r2_score(y_val, val_pred, sample_weight=w_val)

In [6]:
def train_with_time_series_cv(config):
    """
    Ray Tune trainable: time-series CV of a weighted LightGBM + XGBoost blend.

    For every ``(train_dates, val_dates)`` pair in the module-level ``folds``
    the matching rows of the module-level dask frame ``df`` are materialised,
    imputed, and used to fit ``n_lgb`` LightGBM and ``n_xgb`` XGBoost models.
    Their validation predictions are blended with the per-model alphas and
    scored with weighted R2. The mean R2 over folds is reported to Ray Tune
    as ``{"r2": ...}``.

    Parameters
    ----------
    config : dict
        Reads:
        - "n_lgb_xgb": pair ``(n_lgb, n_xgb)`` of how many models to train;
        - "imputer_strategy", plus "imputer_fill_value" when the strategy is
          "constant";
        - "lgb_{i}__learning_rate|num_leaves|feature_fraction|extra_trees"
          for i in 1..n_lgb;
        - "xgb_{i}__learning_rate|max_depth|subsample|colsample_bytree"
          for i in 1..n_xgb;
        - "alpha_lgb_{i}" / "alpha_xgb_{i}" blend weights for the trained
          models only.

    Raises
    ------
    ValueError
        If ``n_lgb + n_xgb`` is zero (nothing to blend).
    """
    n_lgb, n_xgb = config["n_lgb_xgb"]
    if n_lgb + n_xgb < 1:
        raise ValueError("config['n_lgb_xgb'] must enable at least one base model")

    feature_cols = [
        f"feature_{i:02d}" for i in range(79)
    ] + [f"responder_{i}_lag_1" for i in range(9)]
    target_col = "responder_6"
    weight_col = "weight"
    needed_cols = [*feature_cols, target_col, weight_col]

    imputer_strategy = config["imputer_strategy"]
    if imputer_strategy == "constant":
        fill_val = config["imputer_fill_value"]
        imputer = SimpleImputer(strategy="constant", fill_value=fill_val)
    else:
        imputer = SimpleImputer(strategy=imputer_strategy)

    # One blend weight per *trained* model, in the same order in which the
    # predictions are appended below (all LGB models first, then all XGB).
    # Previously all ten alphas were paired positionally with
    # [lgb..., xgb..., zero padding], so XGB models received LGB alphas and the
    # all-zero padding columns received non-zero weights that shrank the blend
    # toward 0.
    weights = (
        [config[f"alpha_lgb_{i}"] for i in range(1, n_lgb + 1)]
        + [config[f"alpha_xgb_{i}"] for i in range(1, n_xgb + 1)]
    )

    fold_r2_scores = []

    for fold_i, (train_dates, val_dates) in enumerate(folds):
        df_train_dd = df[df["date_id"].isin(train_dates)]
        df_val_dd   = df[df["date_id"].isin(val_dates)]

        df_train = df_train_dd[needed_cols].compute()
        df_val   = df_val_dd[needed_cols].compute()

        X_train = df_train[feature_cols].values
        y_train = df_train[target_col].values
        w_train = df_train[weight_col].values

        X_val   = df_val[feature_cols].values
        y_val   = df_val[target_col].values
        w_val   = df_val[weight_col].values

        # Imputation statistics come from the training window only.
        X_train = imputer.fit_transform(X_train)
        X_val   = imputer.transform(X_val)

        base_train_preds_list = []
        base_val_preds_list   = []

        for i in range(1, n_lgb + 1):
            lgb_params = {
                "learning_rate":    config[f"lgb_{i}__learning_rate"],
                "num_leaves":       config[f"lgb_{i}__num_leaves"],
                "feature_fraction": config[f"lgb_{i}__feature_fraction"],
                "extra_trees":      config[f"lgb_{i}__extra_trees"],
            }
            model = train_lgbm_model(X_train, y_train, w_train,
                                     X_val,   y_val,   w_val,
                                     lgb_params)
            base_train_preds_list.append(model.predict(X_train))
            base_val_preds_list.append(model.predict(X_val))

        for i in range(1, n_xgb + 1):
            xgb_params = {
                "learning_rate":     config[f"xgb_{i}__learning_rate"],
                "max_depth":         config[f"xgb_{i}__max_depth"],
                "subsample":         config[f"xgb_{i}__subsample"],
                "colsample_bytree":  config[f"xgb_{i}__colsample_bytree"],
            }
            xgb_model = train_xgb_model(X_train, y_train, w_train,
                                        X_val,   y_val,   w_val,
                                        xgb_params)
            # The native Booster.predict uses every trained round, including
            # the rounds added after the best score while waiting for early
            # stopping to trigger. Restrict prediction to the best iteration.
            best_rounds = (0, xgb_model.best_iteration + 1)
            base_train_preds_list.append(
                xgb_model.predict(xgb.DMatrix(X_train), iteration_range=best_rounds)
            )
            base_val_preds_list.append(
                xgb_model.predict(xgb.DMatrix(X_val), iteration_range=best_rounds)
            )

        fold_r2 = weighted_ensemble(
            base_train_preds_list, base_val_preds_list,
            y_train, w_train, y_val, w_val,
            weights
        )
        fold_r2_scores.append(fold_r2)

        # Release the fold's arrays before the next fold is materialised.
        del df_train, df_val, X_train, X_val
        del y_train, y_val, w_train, w_val
        del base_train_preds_list, base_val_preds_list

    mean_r2 = np.mean(fold_r2_scores)
    session.report({"r2": mean_r2})

In [7]:
def create_weighted_ensemble_search_space():
    """
    Build the Ray Tune parameter space for the weighted LGB/XGB ensemble.

    The returned dict contains, in this order:
    - "n_lgb_xgb": one of the six (n_lgb, n_xgb) pairs that sum to 5;
    - hyper-parameters for five LightGBM slots ("lgb_1".."lgb_5");
    - hyper-parameters for five XGBoost slots ("xgb_1".."xgb_5");
    - blend weights "alpha_lgb_1..5" then "alpha_xgb_1..5", uniform on [0, 3];
    - "imputer_strategy" and "imputer_fill_value".
    """
    slot_ids = range(1, 6)

    space = {"n_lgb_xgb": tune.choice([(i, 5 - i) for i in range(6)])}

    for slot in slot_ids:
        prefix = f"lgb_{slot}"
        space.update({
            f"{prefix}__learning_rate":    tune.loguniform(0.001, 0.1),
            f"{prefix}__num_leaves":       tune.randint(31, 128),
            f"{prefix}__feature_fraction": tune.uniform(0.8, 1.0),
            f"{prefix}__extra_trees":      tune.choice([False, True]),
        })

    for slot in slot_ids:
        prefix = f"xgb_{slot}"
        space.update({
            f"{prefix}__learning_rate":    tune.loguniform(0.001, 0.1),
            f"{prefix}__max_depth":        tune.randint(3, 15),
            f"{prefix}__subsample":        tune.uniform(0.8, 1.0),
            f"{prefix}__colsample_bytree": tune.choice([0.8, 1.0]),
        })

    # Blend weights: all LGB alphas first, then all XGB alphas.
    space.update({f"alpha_lgb_{i}": tune.uniform(0.0, 3.0) for i in slot_ids})
    space.update({f"alpha_xgb_{i}": tune.uniform(0.0, 3.0) for i in slot_ids})

    space["imputer_strategy"] = tune.choice(["mean", "median", "constant"])
    space["imputer_fill_value"] = tune.choice([0, 3, 9999])

    return space

In [8]:
# Set to True to launch a fresh hyper-parameter search; when False the results
# of a previous search are loaded from disk instead.
RUN_HPO = False
# NOTE(review): absolute, machine-specific path -- consider making this
# configurable before sharing the notebook.
base_dir = "/mnt/h/Studia/magisterskie/1_sem/ProjektSemestralny"
result_dir = os.path.join(base_dir, "ray_optuna_results")
os.makedirs(result_dir, exist_ok=True)

PROJECT_NAME = "jane-street-HPO-weighted-ensv3"
# RunConfig(name=PROJECT_NAME, storage_path=result_dir) below stores the
# experiment under this directory, so this is also where it is restored from.
experiment_dir = os.path.join(result_dir, PROJECT_NAME)

if RUN_HPO:
    search_space = create_weighted_ensemble_search_space()

    optuna_search = OptunaSearch(metric="r2", mode="max")

    wandb_callback = WandbLoggerCallback(
        project=PROJECT_NAME,
        entity="kubston20004",
        log_config=True
    )

    # Give each trial ~80% of the logical CPUs; os.cpu_count() may return None
    # and the scaled value may round down to 0, so clamp to at least 1.
    cpus_per_trial = max(1, int((os.cpu_count() or 1) * 0.8))

    tuner = tune.Tuner(
        tune.with_resources(
            train_with_time_series_cv,
            resources={"cpu": cpus_per_trial, "gpu": 1}
        ),
        param_space=search_space,
        tune_config=tune.TuneConfig(
            search_alg=optuna_search,
            num_samples=6,
            # NOTE(review): each trial requests most of the CPUs and a whole
            # GPU, so two trials presumably cannot actually run side by side
            # on a single-GPU machine -- confirm.
            max_concurrent_trials=2,
        ),
        run_config=ray.air.RunConfig(
            name=PROJECT_NAME,
            storage_path=result_dir,
            callbacks=[wandb_callback],
        ),
    )

    results = tuner.fit()
else:
    # Tuner.restore needs the experiment directory; without `path` the call
    # cannot locate (or even accept) the saved run.
    restored_tuner = tune.Tuner.restore(
        path=experiment_dir,
        trainable=train_with_time_series_cv
    )
    results = restored_tuner.get_results()

best_result = results.get_best_result(metric="r2", mode="max")
print("Best config:", best_result.config)
print("Best R2 Score:", best_result.metrics["r2"])


Best config: {'n_lgb_xgb': [3, 2], 'lgb_1__learning_rate': 0.023155522539548737, 'lgb_1__num_leaves': 90, 'lgb_1__feature_fraction': 0.8253141645722654, 'lgb_1__extra_trees': False, 'lgb_2__learning_rate': 0.05942631254908533, 'lgb_2__num_leaves': 67, 'lgb_2__feature_fraction': 0.976443782221655, 'lgb_2__extra_trees': False, 'lgb_3__learning_rate': 0.011781883031179077, 'lgb_3__num_leaves': 36, 'lgb_3__feature_fraction': 0.8637813863872678, 'lgb_3__extra_trees': True, 'lgb_4__learning_rate': 0.006550833574950993, 'lgb_4__num_leaves': 31, 'lgb_4__feature_fraction': 0.8843708965935843, 'lgb_4__extra_trees': True, 'lgb_5__learning_rate': 0.032224080277769934, 'lgb_5__num_leaves': 126, 'lgb_5__feature_fraction': 0.8154497206585487, 'lgb_5__extra_trees': True, 'xgb_1__learning_rate': 0.020350451121469607, 'xgb_1__max_depth': 6, 'xgb_1__subsample': 0.8230163049324326, 'xgb_1__colsample_bytree': 1.0, 'xgb_2__learning_rate': 0.0011881006804406657, 'xgb_2__max_depth': 14, 'xgb_2__subsample': 0.