In [1]:
import os, sys, json, time, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Paths (adjust if your layout differs)
ROOT = Path('/home/pduce/ICAIF_2025_Cryptocurrency_Forecasting_Starter_Kit')
DATA = ROOT / "data"
SRC  = ROOT / "src"
SUBM = ROOT / "sample_submission"

# Ensure src is importable
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

# Create sample_submission dir if missing
SUBM.mkdir(parents=True, exist_ok=True)

SEED = 1337
np.random.seed(SEED)
torch.manual_seed(SEED)

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cpu'

In [3]:
# Load dataset files
info_path = DATA / "dataset_info.json"
if info_path.exists():
    info = json.loads(info_path.read_text(encoding="utf-8"))
    print("dataset_info.json loaded. Keys:", list(info.keys()))
    print(json.dumps({k: info[k] for k in ['features','input_len','horizon_len','outputs']}, indent=2))
else:
    print("dataset_info.json not found at", info_path)

# Peek train / x_test
train_path = DATA / "train.pkl"
x_test_path  = DATA / "x_test.pkl"
y_local_path = DATA / "y_test_local.pkl"

train = pd.read_pickle(train_path)
train['event_time'] = pd.to_datetime('2024-01-01') + train['time_step']*pd.Timedelta(minutes=1) 
x_test  = pd.read_pickle(x_test_path)
y_test_local = pd.read_pickle(y_local_path)

print("train shape:", train.shape, "| columns:", train.columns.tolist())
print("x_test  shape:", x_test.shape,  "| columns:", x_test.columns.tolist())
print("y_test_local shape:", y_test_local.shape, "| columns:", y_test_local.columns.tolist())

display(train.head(3))
display(x_test.head(3))
display(y_test_local.head(3))

dataset_info.json loaded. Keys: ['freq', 'features', 'input_len', 'horizon_len', 'dtypes', 'outputs', 'sha256']
{
  "features": [
    "close",
    "volume"
  ],
  "input_len": 60,
  "horizon_len": 10,
  "outputs": {
    "train": {
      "columns": [
        "series_id",
        "time_step",
        "close",
        "volume"
      ]
    },
    "x_test": {
      "columns": [
        "window_id",
        "time_step",
        "close",
        "volume"
      ]
    },
    "y_test_local": {
      "columns": [
        "window_id",
        "time_step",
        "close"
      ]
    }
  }
}
train shape: (18331224, 5) | columns: ['series_id', 'time_step', 'close', 'volume', 'event_time']
x_test  shape: (3000000, 4) | columns: ['window_id', 'time_step', 'close', 'volume']
y_test_local shape: (20, 3) | columns: ['window_id', 'time_step', 'close']


Unnamed: 0,series_id,time_step,close,volume,event_time
0,1,0,0.137,171985.703125,2024-01-01 00:00:00
1,1,1,0.13656,85451.398438,2024-01-01 00:01:00
2,1,2,0.13647,121151.898438,2024-01-01 00:02:00


Unnamed: 0,window_id,time_step,close,volume
0,1,0,0.1126,24976.0
1,1,1,0.1126,0.0
2,1,2,0.1125,2299.0


Unnamed: 0,window_id,time_step,close
0,1,0,0.1131
1,1,1,0.1131
2,1,2,0.113


In [4]:
from keras_sig import jax_gpu_signature

2025-10-27 11:14:48.575368: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-27 11:14:51.101651: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [8]:
# NEW / UPDATED CODE

import copy
import numpy as np
import torch
from torch.utils.data import Dataset
import pandas as pd
from typing import Dict, Any, List, Optional, Iterable, Callable, Tuple
from concurrent.futures import ProcessPoolExecutor, as_completed

# --- your existing imports ---
from icaif.dataset import TrainWindowSampler, TrainWindowSamplerVect
from athenea.stats.regressions import Ridge
from icaif.metrics import evaluate_all_metrics

SEED = 42  # ensure you define this somewhere

class WindowsDataset(Dataset):
    """
    Wrap TrainWindowSampler into a PyTorch Dataset.
    Returns:
      X: (60, 2) float32 -> [close, volume]
      y: (10,)  float32 -> future close
    """
    def __init__(self, df: pd.DataFrame, train_path: str = None, rolling: bool = True, step_size: int = 1, max_samples: int = None):
        self.sampler = TrainWindowSampler(
            df=df,
            train_path=train_path,
            window=70,
            input_len=60,
            horizon_len=10,
            rolling=rolling,
            step_size=step_size,
            seed=SEED,
        )
        # Materialize (optionally capped) for stable batching
        xs, ys = [], []
        for i, (X, y) in enumerate(self.sampler.iter_windows()):
            xs.append(X.astype(np.float32))
            ys.append(y.astype(np.float32))
            if max_samples is not None and (i + 1) >= max_samples:
                break
        self.X = np.stack(xs, axis=0) if xs else np.zeros((0,60,2), dtype=np.float32)
        self.y = np.stack(ys, axis=0) if ys else np.zeros((0,10), dtype=np.float32)

    def __len__(self):  return len(self.X)
    def __getitem__(self, i):
        return torch.from_numpy(self.X[i]), torch.from_numpy(self.y[i])

class WindowsDatasetVect(Dataset):
    """
    Vectorized implementation of the WindowsDataset.

    Returns:
      X: (60, 2) float32 -> [close, volume]
      y: (10,)  float32 -> future close
      time_X: (60,) datetime64[ns] -> event timestamps for X window
      time_y: (10,) datetime64[ns] -> event timestamps for y window
    """
    def __init__(self, df: pd.DataFrame, train_path: str = None, rolling: bool = True, step_size: int = 1, max_samples: int = None):
        self.sampler = TrainWindowSamplerVect(
            df=df,
            train_path=train_path,
            window=70,
            input_len=60,
            horizon_len=10,
            rolling=rolling,
            step_size=step_size,
            seed=SEED,
        )

        all_X_groups = []
        all_y_groups = []
        all_timeX_groups = []
        all_timey_groups = []

        for sid, group_df in self.sampler.groups.items():
            n = len(group_df)
            if n < self.sampler.window:
                continue

            # --- values array (close, volume) ---
            arr = group_df[['close', 'volume']].to_numpy(dtype=np.float32)

            # --- event_time as UTC nanoseconds (int64) for safe vectorization ---
            # works for tz-aware or naive inputs
            event_ns = pd.to_datetime(group_df['event_time'], utc=True).view('int64').to_numpy()

            num_windows = (n - self.sampler.window) // self.sampler.step_size + 1
            if num_windows <= 0:
                continue

            window_len = self.sampler.window
            step = self.sampler.step_size

            # windows for values (shape: num_windows x 70 x 2)
            strides_vals = (step * arr.strides[0], arr.strides[0], arr.strides[1])
            shape_vals = (num_windows, window_len, 2)
            all_windows_view = np.lib.stride_tricks.as_strided(arr, shape=shape_vals, strides=strides_vals)

            # windows for times (shape: num_windows x 70)
            strides_time = (step * event_ns.strides[0], event_ns.strides[0])
            shape_time = (num_windows, window_len)
            time_windows_view_ns = np.lib.stride_tricks.as_strided(event_ns, shape=shape_time, strides=strides_time)

            # split into X / y
            X_group_view = all_windows_view[:, :self.sampler.input_len, :]
            y_group_view = all_windows_view[:, self.sampler.input_len:, 0]

            timeX_group_view_ns = time_windows_view_ns[:, :self.sampler.input_len]
            timey_group_view_ns = time_windows_view_ns[:, self.sampler.input_len:]

            # materialize copies and cast times back to datetime64[ns]
            all_X_groups.append(X_group_view.copy())
            all_y_groups.append(y_group_view.copy())

            all_timeX_groups.append(timeX_group_view_ns.copy().view('datetime64[ns]'))
            all_timey_groups.append(timey_group_view_ns.copy().view('datetime64[ns]'))

        # concat
        if all_X_groups:
            self.X = np.concatenate(all_X_groups, axis=0)
            self.y = np.concatenate(all_y_groups, axis=0)
            self.time_X = np.concatenate(all_timeX_groups, axis=0)
            self.time_y = np.concatenate(all_timey_groups, axis=0)

            if max_samples is not None:
                self.X = self.X[:max_samples]
                self.y = self.y[:max_samples]
                self.time_X = self.time_X[:max_samples]
                self.time_y = self.time_y[:max_samples]
        else:
            self.X = np.zeros((0, self.sampler.input_len, 2), dtype=np.float32)
            self.y = np.zeros((0, self.sampler.horizon_len), dtype=np.float32)
            self.time_X = np.empty((0, self.sampler.input_len), dtype='datetime64[ns]')
            self.time_y = np.empty((0, self.sampler.horizon_len), dtype='datetime64[ns]')

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        # Keep dataloader tensors unchanged; times stay on the object
        return torch.from_numpy(self.X[i]), torch.from_numpy(self.y[i])



def transform(X):
    """
    X: array-like of shape (n_samples, 60, 2)
       [:, :, 0] = prices (close); [:, :, 1] = volumes
    Returns: list[pd.DataFrame], each of shape (n_samples, 1)
    """
    EPS = 1e-12

    # Arrange as (time x samples)
    X_prices  = pd.DataFrame(X[:, :, 0]).T
    X_volumes = pd.DataFrame(X[:, :, 1]).T

    logp    = np.log(X_prices)
    logrets = logp.diff()  # 1-min log returns, time on rows

    # ----- Your original features (named) -----
    avg_lr        = logrets.mean().to_frame()
    sign_change_share = np.sign(logrets).diff().ne(0).sum().to_frame()
    avg_vol_lr = logrets.mul(X_volumes,axis=0).mean().to_frame()
    vd = np.sign(logrets).mul(X_volumes,axis=0).mean().to_frame()
    rv = logrets.pow(2).mean().to_frame()


    features = [
        avg_lr,
        avg_vol_lr,
        vd,
        rv,
    ]

    #sig = jax_gpu_signature(X, depth=3)
    #df_sig = pd.DataFrame(sig)
    #print(df_sig.head())
    return features

from features_compute import build_features_np
def transform_nick(X):
    return [pd.DataFrame(f) for f in build_features_np(X)]

def evaluate(y_pred, X, y_true, time_y):
    df_prices = pd.DataFrame(X[:, :, 0]).T
    y_pred[0] = 0
    y_pred = y_pred.cumsum(axis=1)
    y_pred = y_pred.mul(df_prices.iloc[-1],axis=0)
    submission_df = y_pred.stack().reset_index().rename(columns={0:'pred_close','level_0':'window_id','level_1':'time_step'})

    df_x = pd.DataFrame(
    {
        "window_id": np.repeat(np.arange(X.shape[0]), X.shape[1]),
        "time_step": np.tile(np.arange(X.shape[1]), X.shape[0]),
        "close": X[:,:,0].flatten(),
        "volume": X[:,:,1].flatten()
    })

    df_y = pd.DataFrame(
        {
            "window_id": np.repeat(np.arange(y_true.shape[0]), y_true.shape[1]),
            "time_step": np.tile(np.arange(y_true.shape[1]), y_true.shape[0]),
            "close": y_true.flatten(),
            "event_datetime": time_y.flatten()
        }
    )

    y_true_val = df_y[["window_id", "time_step","close", "event_datetime"]].copy()
    x_val = (
        df_x[df_x["time_step"] == 59]
        [["window_id", "time_step", "close"]]
        .copy()
    )
    
    results = evaluate_all_metrics(
        y_true=y_true_val,
        y_pred=submission_df,
        x_test=x_val,
    )
    return results


# --------------- NEW: worker function ---------------
def _run_one_fold(
    idx: int,
    train_ids: np.ndarray,
    val_ids: np.ndarray,
    df_train: pd.DataFrame,
    model_or_factory: Any,
) -> Tuple[int, str, Dict[str, Any]]:
    """
    Execute a single fold end-to-end and return (idx, fold_name, metrics).
    Runs in a separate process when parallelized.
    """
    # Rebuild model per process
    model = model_or_factory() if callable(model_or_factory) else copy.deepcopy(model_or_factory)

    # Slice data for this fold
    df_tr = df_train[df_train['series_id'].isin(train_ids)].copy()
    df_va = df_train[df_train['series_id'].isin(val_ids)].copy()

    # Defensive: skip degenerate folds
    if df_tr.empty or df_va.empty:
        return idx, f"val_{'-'.join(map(str, val_ids))}", {}

    # Build datasets
    train_ds = WindowsDatasetVect(df_tr, rolling=True, step_size=1)
    val_ds   = WindowsDatasetVect(df_va, rolling=True, step_size=1)

    # Your original training target construction
    df_y_train = np.log(pd.DataFrame(train_ds.y)).diff(axis=1)

    # Fit / predict / evaluate
    train_features = model.transform(train_ds.X)
    val_features = model.transform(val_ds.X)

    preds = []
    for i in range(1,10):
        model.fit(df_y_train[i].to_frame(0),train_features, )
        y_pred = model.predict(val_features).rename(columns={0:i})
        preds.append(y_pred)

    y_pred = pd.concat(preds, axis=1)

    metrics: Dict[str, Any] = model.evaluate(y_pred, val_ds.X, val_ds.y, val_ds.time_y)

    fold_name = f"val_{'-'.join(map(str, val_ids))}"
    return idx, fold_name, metrics


# --------------- UPDATED: run_cv with parallelism ---------------
def run_cv(
    df_train: pd.DataFrame,
    n_train: int = 4,
    n_val: int = 1,
    model=None,
    *,
    include_last_fold: bool = False,   # set True to include the final possible window
    n_jobs: int = 1,                   # NEW: #processes; 1 keeps it sequential
    model_factory: Optional[Callable[[], Any]] = None,  # NEW: pass to rebuild model per fold
) -> pd.DataFrame:
    """
    Rolling group-based CV, optionally parallel across folds (process-based).
    Assumptions about `model` / `model_factory`:
      - Either:
          model_factory() -> fresh model with fit/predict/evaluate/transform
        Or:
          model is a picklable object with those methods (we deep-copy it per fold).
    """
    if (model is None) and (model_factory is None):
        raise ValueError("Provide either `model` or a `model_factory` (callable returning a fresh model).")

    series_ids = df_train['series_id'].unique()
    n_series   = len(series_ids)
    n_total    = n_train + n_val

    if n_total > n_series:
        raise ValueError(f"n_train + n_val must be ≤ number of groups ({n_series}).")
    if n_train < 1:
        raise ValueError(f"n_train must be ≥ 1 (got {n_train}).")
    if n_val < 1:
        raise ValueError(f"n_val must be ≥ 1 (got {n_val}).")
    if n_train < n_val:
        raise ValueError(f"n_train must be ≥ n_val (got {n_train} < {n_val}).")

    start = n_total
    stop  = n_series + (1 if include_last_fold else 0)

    # Build fold definitions once
    fold_specs = []
    for i in range(start, stop):
        train_ids = series_ids[i - n_total : i - n_val]
        val_ids   = series_ids[i - n_val   : i]
        fold_specs.append((i, train_ids, val_ids))

    if not fold_specs:
        raise RuntimeError("No folds were produced. Check your data and parameters.")

    # Choose what to pass to workers for model creation
    model_or_factory = model_factory if model_factory is not None else model

    # Sequential path (n_jobs == 1)
    if n_jobs == 1:
        results = [
            _run_one_fold(idx, train_ids, val_ids, df_train, model_or_factory)
            for (idx, train_ids, val_ids) in fold_specs
        ]
    else:
        # Parallel path (processes)
        # Tip: to avoid CPU over-subscription with BLAS, consider setting env vars:
        # OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 NUMEXPR_NUM_THREADS=1
        results = [None] * len(fold_specs)
        with ProcessPoolExecutor(max_workers=n_jobs) as ex:
            futures = {
                ex.submit(_run_one_fold, idx, train_ids, val_ids, df_train, model_or_factory): pos
                for pos, (idx, train_ids, val_ids) in enumerate(fold_specs)
            }
            for fut in as_completed(futures):
                pos = futures[fut]
                results[pos] = fut.result()

    # results: list of (idx, fold_name, metrics)
    # Keep chronological order by fold index
    results.sort(key=lambda t: t[0])
    fold_names = [name for _, name, _ in results]
    metric_rows = [metrics for _, _, metrics in results]

    # Assemble DataFrame: rows = metric names, cols = folds
    metrics_df = pd.DataFrame(metric_rows, index=fold_names).T
    return metrics_df


In [16]:
from athenea.stats.regressions import Ridge
def make_ridge(l2=0.1):
    model = Ridge(l2=l2)
    model.transform = transform_nick
    model.evaluate = evaluate
    return model


In [20]:

metrics = run_cv(
    df_train=train[train['series_id']<=2],
    n_train=1,
    n_val=1,
    include_last_fold=True,
    n_jobs=1,                 # ← parallel across 8 processes
    model_factory=make_ridge(l2=1), # ← safe across processes
)

  event_ns = pd.to_datetime(group_df['event_time'], utc=True).view('int64').to_numpy()
  event_ns = pd.to_datetime(group_df['event_time'], utc=True).view('int64').to_numpy()


In [21]:
metrics

Unnamed: 0,val_2
MSE,0.306488
MAE,0.533383
IC,0.0
IR,0.0
SharpeRatio,0.0
MDD,0.0
VaR,0.0
ES,0.0


In [14]:
metrics

Unnamed: 0,val_2
MSE,0.306488
MAE,0.533383
IC,0.0
IR,0.0
SharpeRatio,0.0
MDD,0.0
VaR,0.0
ES,0.0


In [12]:
model = make_ridge()
train_ds = WindowsDatasetVect(train, rolling=True, step_size=1)
train_features = model.transform(train_ds.X)

        0              1             2            3            4   \
0 -0.00143  -99103.968750  1.022435e-06  1136.216064  -994.498352   
1 -0.00094   28114.046875  4.417954e-07  1072.388184 -1098.815308   
2 -0.00129   56385.750000  8.320607e-07  1004.861084 -1077.599243   
3 -0.00176  -16793.250000  1.548811e-06   912.215576  -882.659302   
4 -0.00181 -232696.343750  1.638049e-06  1269.294556  -848.114197   

             5             6         7         8            9         10  \
0  4.910777e+09 -4.873573e-10 -1.679209  1.733641 -542296640.0 -0.155759   
1  3.952394e+08 -1.384283e-10 -1.167127  1.326214 -497052896.0 -0.146667   
2  1.589812e+09 -3.577887e-10 -1.033422  0.770564 -430420672.0  0.309774   
3  1.410806e+08 -9.086391e-10 -1.409842  1.214179 -378141312.0  0.169654   
4  2.707372e+10 -9.882888e-10 -1.942987  1.588552 -580836096.0 -0.026733   

             11           12            13  
0  9.719899e+08 -436715488.0 -1.618831e+14  
1  1.024255e+09 -527573632.0  3.724743

In [13]:
test_features = model.transform(x_test[['close','volume']].values.reshape(50000,60,2))

        0              1             2             3            4   \
0  0.00050   73841.000000  1.250005e-07     22.550888    14.369687   
1 -0.00096   89725.554688  4.608034e-07     -4.543301   -81.593529   
2  0.00019 -460473.000000  1.805080e-08  -7080.743164  6993.251465   
3  0.01489  450706.750000  1.108563e-04  13484.589844 -6773.553711   
4  0.00370   -4099.729980  6.844832e-06    180.764969  -195.933838   

             5             6           7          8             9          10  \
0  2.726247e+09  2.083346e-11   -0.000733   0.012742  2.346698e+06  -0.002778   
1  4.025340e+09 -1.474576e-10   -0.000956   0.006275 -4.293116e+06   0.036028   
2  1.060110e+11  1.143594e-12   -6.427595  11.509821  2.708342e+10  -5.090534   
3  1.015686e+11  5.502174e-07  123.076736 -45.367538 -3.529163e+09 -27.745413   
4  8.403826e+06  8.441710e-09   -1.853161   4.375144 -2.732600e+06  -2.550046   

             11            12            13  
0 -3.028219e+06  2.044646e+06  6.710408e+13  


In [14]:
df_y_train = np.log(pd.concat([
    pd.Series(train_ds.X[:,-1,0]),
    pd.DataFrame(train_ds.y),
],axis=1)).diff(axis=1).dropna(how='all',axis=1)

preds = []
all_models = []
for i in range(0,10):
    model = make_ridge()
    model.fit(df_y_train[i].to_frame(0),train_features)
    all_models.append(model)
    y_pred = model.predict(test_features).rename(columns={0:i})
    preds.append(y_pred)

y_pred = pd.concat(preds, axis=1)
y_pred_prices = np.exp(y_pred.cumsum(axis=1)).mul(x_test[x_test['time_step']==59]['close'].reset_index(drop=True),axis=0)

In [15]:
import pickle
with open('/home/pduce/ICAIF_2025_Cryptocurrency_Forecasting_Starter_Kit/submissions/ridge/model_weights.pkl', 'wb') as f:
    f.write(pickle.dumps(all_models[0]))

In [16]:
submission_df = y_pred_prices.stack().reset_index().rename(columns={'level_0':'window_id','level_1':'time_step',0:'pred_close'})
submission_df['window_id'] += 1 


In [17]:
import pickle
with open('/home/pduce/ICAIF_2025_Cryptocurrency_Forecasting_Starter_Kit/submissions/ridge/submission.pkl', 'wb') as f:
    pickle.dump(submission_df, f)

In [18]:
metrics.median(axis=1)

MSE            1.169836e+02
MAE            3.896745e+00
IC             1.022314e-03
IR             1.558027e-01
SharpeRatio    1.455284e+07
MDD            0.000000e+00
VaR            1.455284e-05
ES             1.455284e-05
dtype: float64