## Code skeleton example for PriorLabs Fine-Tuning Program

In [1]:
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Callable, Dict, List, Tuple
import warnings

import numpy as np
import xgboost as xgb

In [2]:
from src.utils import  benchmark_datasets

#### 🧪  Minimal Example Implementation (synthetic data)

In [3]:
from src.data_processing.base_datamodule import BaseDataModule

class ExampleDataModule(BaseDataModule):
    """Synthetic placeholder data module so the skeleton runs out-of-the-box.

    Each generated dataset is a binary-classification problem: the features
    are standard-normal draws and the label is 1 when the first feature plus
    Gaussian noise (scale 0.5) is positive, else 0.

    Training and test collections are drawn from *different* random streams
    derived from the same seed, so the test sets are reproducible but never
    identical to the training sets.

    Args:
        n_datasets: Number of independent datasets per split.
        n_samples: Number of rows in every dataset.
        n_features: Number of feature columns in every dataset.
        random_seed: Seed forwarded to ``BaseDataModule``; this class reads
            it back as ``self.random_seed`` (assumed to be stored by the
            base class -- confirm against ``BaseDataModule``).
    """

    # Identifiers of the independent random streams used per split.
    # Stream 0 reproduces the original seeding exactly.
    _TRAIN_STREAM = 0
    _TEST_STREAM = 1

    def __init__(
        self,
        n_datasets: int = 3,
        n_samples: int = 5000,
        n_features: int = 100,
        random_seed: int = 42,
    ) -> None:
        super().__init__(random_seed)
        self.n_datasets = n_datasets
        self.n_samples = n_samples
        self.n_features = n_features

    # 🚧  Replace `_make_dataset` + the *_datasets methods with real logic
    def _make_dataset(self, rng: np.random.Generator) -> Tuple[np.ndarray, np.ndarray]:
        """Draw one ``(X, y)`` pair from ``rng``.

        Returns:
            ``X`` of shape ``(n_samples, n_features)`` and an integer label
            vector ``y`` of length ``n_samples`` containing 0/1 values.
        """
        X = rng.normal(size=(self.n_samples, self.n_features))
        # Only the first feature carries signal; the noise keeps the task
        # from being perfectly separable.
        y = (X[:, 0] + rng.normal(scale=0.5, size=self.n_samples) > 0).astype(int)
        return X, y

    def _generate(self, stream: int = 0) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Generate ``n_datasets`` synthetic datasets from one random stream.

        Args:
            stream: Which independent random stream to use. ``0`` (default)
                seeds the generator directly with ``self.random_seed``; any
                other value selects a separate, reproducible stream derived
                from the same seed.

        Returns:
            Two parallel lists ``(X_list, y_list)``; both are empty when
            ``n_datasets`` is 0.
        """
        if stream == 0:
            rng = np.random.default_rng(self.random_seed)
        else:
            # A distinct spawn key yields a stream that is statistically
            # independent of stream 0 while staying deterministic.
            seed_seq = np.random.SeedSequence(self.random_seed, spawn_key=(stream,))
            rng = np.random.default_rng(seed_seq)

        datasets = [self._make_dataset(rng) for _ in range(self.n_datasets)]
        # Build the lists explicitly: unpacking ``zip(*datasets)`` into two
        # names fails when there are no datasets at all.
        X_list = [X for X, _ in datasets]
        y_list = [y for _, y in datasets]
        return X_list, y_list

    def train_datasets(self) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Return the synthetic training datasets as ``(X_list, y_list)``."""
        return self._generate(self._TRAIN_STREAM)

    def val_datasets(self) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Return explicit validation datasets; none are supplied here."""
        # Optional: supply explicit validation sets or just return empty lists.
        # When none are provided, validation data is sampled from the
        # training datasets instead.
        return [], []

    def test_datasets(self) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Return synthetic test datasets drawn independently of training."""
        return self._generate(self._TEST_STREAM)

#### 🚀  Quick sanity check: classification

In [4]:
# Build the synthetic data module and fetch its training datasets.
dm = ExampleDataModule()
X_train_list, y_train_list = dm.train_datasets()

# Baseline gradient-boosted classifier with a fixed seed for repeatability.
model = xgb.XGBClassifier(
    n_estimators=200,
    random_state=42,
    eval_metric="logloss",
)

# Benchmark the model on every dataset; `scores` maps metric -> mean value.
scores = benchmark_datasets(model, X_train_list, y_train_list)

print("Mean scores across synthetic datasets:")
for metric, mean_value in scores.items():
    print(f"  {metric:8s}: {mean_value:.4f}")

Datasets: 100%|██████████| 3/3 [00:02<00:00,  1.47it/s]

Mean scores across synthetic datasets:
  MetricType.ACCURACY: 0.8467
  MetricType.ROC_AUC: 0.9276
  MetricType.F1: 0.8466
  MetricType.LOG_LOSS: 0.5026





#### 🚀  Quick sanity check: regression

In [5]:
# Build the synthetic data module and fetch its training datasets.
# NOTE: ExampleDataModule produces 0/1 integer targets, so this cell only
# checks that the regression code path runs end to end.
dm = ExampleDataModule()
X_train_list, y_train_list = dm.train_datasets()

# Use a regression metric: "logloss" is a classification metric and is not
# meaningful for a regressor's unbounded predictions.
model = xgb.XGBRFRegressor(
    eval_metric="rmse", random_state=42, n_estimators=200,
)

# Benchmark the model on every dataset; `scores` maps metric -> mean value.
scores = benchmark_datasets(model, X_train_list, y_train_list)
print("Mean scores across synthetic datasets:")
for name, val in scores.items():
    print(f"  {name:8s}: {val:.4f}")

Datasets:   0%|          | 0/3 [00:00<?, ?it/s]

Datasets: 100%|██████████| 3/3 [00:02<00:00,  1.11it/s]

Mean scores across synthetic datasets:
  MetricType.RMSE: 0.3192
  MetricType.MSE: 0.1019
  MetricType.MAE: 0.2056



