In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, json, time, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Paths (adjust if your layout differs)
ROOT = Path('/home/pduce/ICAIF_2025_Cryptocurrency_Forecasting_Starter_Kit')
DATA = ROOT / "data"
SRC  = ROOT / "src"
SUBM = ROOT / "sample_submission"

# Ensure src is importable
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

# Create sample_submission dir if missing
SUBM.mkdir(parents=True, exist_ok=True)

SEED = 1337
np.random.seed(SEED)
torch.manual_seed(SEED)

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cpu'

In [3]:
# Load dataset files
info_path = DATA / "dataset_info.json"
if info_path.exists():
    info = json.loads(info_path.read_text(encoding="utf-8"))
    print("dataset_info.json loaded. Keys:", list(info.keys()))
    print(json.dumps({k: info[k] for k in ['features','input_len','horizon_len','outputs']}, indent=2))
else:
    print("dataset_info.json not found at", info_path)

# Peek train / x_test
train_path = DATA / "train.pkl"
x_test_path  = DATA / "x_test.pkl"
y_local_path = DATA / "y_test_local.pkl"

train = pd.read_pickle(train_path)
train['event_datetime'] = pd.to_datetime('2024-01-01') + train['time_step']*pd.Timedelta(minutes=1) 
x_test  = pd.read_pickle(x_test_path)
y_test_local = pd.read_pickle(y_local_path)

print("train shape:", train.shape, "| columns:", train.columns.tolist())
print("x_test  shape:", x_test.shape,  "| columns:", x_test.columns.tolist())
print("y_test_local shape:", y_test_local.shape, "| columns:", y_test_local.columns.tolist())

display(train.head(3))
display(x_test.head(3))
display(y_test_local.head(3))

dataset_info.json loaded. Keys: ['freq', 'features', 'input_len', 'horizon_len', 'dtypes', 'outputs', 'sha256']
{
  "features": [
    "close",
    "volume"
  ],
  "input_len": 60,
  "horizon_len": 10,
  "outputs": {
    "train": {
      "columns": [
        "series_id",
        "time_step",
        "close",
        "volume"
      ]
    },
    "x_test": {
      "columns": [
        "window_id",
        "time_step",
        "close",
        "volume"
      ]
    },
    "y_test_local": {
      "columns": [
        "window_id",
        "time_step",
        "close"
      ]
    }
  }
}
train shape: (18331224, 5) | columns: ['series_id', 'time_step', 'close', 'volume', 'event_datetime']
x_test  shape: (3000000, 4) | columns: ['window_id', 'time_step', 'close', 'volume']
y_test_local shape: (20, 3) | columns: ['window_id', 'time_step', 'close']


Unnamed: 0,series_id,time_step,close,volume,event_datetime
0,1,0,0.137,171985.703125,2024-01-01 00:00:00
1,1,1,0.13656,85451.398438,2024-01-01 00:01:00
2,1,2,0.13647,121151.898438,2024-01-01 00:02:00


Unnamed: 0,window_id,time_step,close,volume
0,1,0,0.1126,24976.0
1,1,1,0.1126,0.0
2,1,2,0.1125,2299.0


Unnamed: 0,window_id,time_step,close
0,1,0,0.1131
1,1,1,0.1131
2,1,2,0.113


In [89]:
from numpy.lib.stride_tricks import sliding_window_view as swv

X = []
y = []
y_time = []
for i in train['series_id'].unique()[:2]:
    srs_df = train[train['series_id']==i]

    wdw = swv(srs_df[['close','volume']], 70,axis=0).swapaxes(1,2)
    wdw_time = swv(srs_df[['event_datetime']], 70,axis=0).swapaxes(1,2)

    X.append(wdw[:,:60,:])
    y.append(wdw[:,60:,0])
    y_time.append(wdw_time[:,60:,0])
X = np.concatenate(X,axis=0)
y = np.concatenate(y,axis=0)
y_time = np.concatenate(y_time,axis=0)

In [85]:
features = np.diff(np.log(X[:,:,0]),axis=1).mean(axis=1)

In [108]:
pred_prices = np.exp(pd.DataFrame(np.repeat(features[:,np.newaxis],10,axis=1)).cumsum())

In [109]:
pred_prices

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.999822,0.999822,0.999822,0.999822,0.999822,0.999822,0.999822,0.999822,0.999822,0.999822
1,0.999705,0.999705,0.999705,0.999705,0.999705,0.999705,0.999705,0.999705,0.999705,0.999705
2,0.999544,0.999544,0.999544,0.999544,0.999544,0.999544,0.999544,0.999544,0.999544,0.999544
3,0.999325,0.999325,0.999325,0.999325,0.999325,0.999325,0.999325,0.999325,0.999325,0.999325
4,0.999100,0.999100,0.999100,0.999100,0.999100,0.999100,0.999100,0.999100,0.999100,0.999100
...,...,...,...,...,...,...,...,...,...,...
788977,0.336330,0.336330,0.336330,0.336330,0.336330,0.336330,0.336330,0.336330,0.336330,0.336330
788978,0.336277,0.336277,0.336277,0.336277,0.336277,0.336277,0.336277,0.336277,0.336277,0.336277
788979,0.336220,0.336220,0.336220,0.336220,0.336220,0.336220,0.336220,0.336220,0.336220,0.336220
788980,0.336160,0.336160,0.336160,0.336160,0.336160,0.336160,0.336160,0.336160,0.336160,0.336160


In [94]:
pd.DataFrame(y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.13562,0.13518,0.13517,0.13534,0.13540,0.13569,0.13592,0.13587,0.13578,0.13546
1,0.13518,0.13517,0.13534,0.13540,0.13569,0.13592,0.13587,0.13578,0.13546,0.13547
2,0.13517,0.13534,0.13540,0.13569,0.13592,0.13587,0.13578,0.13546,0.13547,0.13555
3,0.13534,0.13540,0.13569,0.13592,0.13587,0.13578,0.13546,0.13547,0.13555,0.13473
4,0.13540,0.13569,0.13592,0.13587,0.13578,0.13546,0.13547,0.13555,0.13473,0.13457
...,...,...,...,...,...,...,...,...,...,...
788977,0.40740,0.40710,0.40700,0.40730,0.40800,0.40800,0.40780,0.40770,0.40830,0.40860
788978,0.40710,0.40700,0.40730,0.40800,0.40800,0.40780,0.40770,0.40830,0.40860,0.40880
788979,0.40700,0.40730,0.40800,0.40800,0.40780,0.40770,0.40830,0.40860,0.40880,0.40840
788980,0.40730,0.40800,0.40800,0.40780,0.40770,0.40830,0.40860,0.40880,0.40840,0.40840


In [11]:
import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view as swv

class WindowsDatasetVect:
    """
    Vectorized window builder that returns two DataFrames:

    X columns:
      - window_id: integer id of each window
      - time_step: 0..(input_len-1) within the input segment
      - close
      - volume
      - event_datetime

    y columns:
      - window_id: same ids as in X
      - time_step: 0..(horizon_len-1) within the future horizon
      - close: future close
      - prev_close: the last input close (index input_len-1, e.g. 59 when input_len=60)
      - event_datetime: timestamps of the future horizon

    Notes:
      * Requires df to have columns:
          ['series_id','time_step','close','volume','event_datetime']
      * Windows are created per series_id after sorting by time_step.
    """
    def __init__(
        self,
        df : pd.DataFrame = None,
        train_path: str = None,
        window: int = 70,
        input_len: int = 60,
        horizon_len: int = 10,
        rolling: bool = True,
        step_size: int | None = None,
    ) -> None:
        assert input_len + horizon_len == window, "window must equal input_len + horizon_len"
        # default stepping: 1 if rolling, else full non-overlapping windows
        if step_size is None:
            step_size = 1 if rolling else window

        if train_path is not None:
            df = pd.read_pickle(train_path)
        if df is None:
            raise ValueError("Provide either df or train_path")

        required = {'series_id','time_step','close','volume','event_datetime'}
        if not required.issubset(df.columns):
            raise ValueError(f"df missing required columns {required}, found {list(df.columns)}")

        # group per series, sorted by time_step
        groups = {sid: g.sort_values('time_step').reset_index(drop=True)
                  for sid, g in df.groupby('series_id')}

        X_parts: list[pd.DataFrame] = []
        Y_parts: list[pd.DataFrame] = []

        next_win_id = 0

        for _, g in groups.items():
            n = len(g)
            if n < window:
                continue

            close  = g['close' ].to_numpy(np.float32)
            volume = g['volume'].to_numpy(np.float32)
            dt     = g['event_datetime'].to_numpy('datetime64[ns]')

            # sliding windows (shape: (n - window + 1, window))
            w_close  = swv(close,  window_shape=window)[::step_size]
            w_volume = swv(volume, window_shape=window)[::step_size]
            w_dt     = swv(dt,     window_shape=window)[::step_size]

            num_win = w_close.shape[0]
            if num_win == 0:
                continue

            # split into input and horizon
            x_close  = w_close[:,  :input_len]                      # (num_win, input_len)
            x_volume = w_volume[:, :input_len]
            x_dt     = w_dt[:,     :input_len]

            y_close = w_close[:,  input_len:]                       # (num_win, horizon_len)
            y_dt    = w_dt[:,     input_len:]
            prev_c  = x_close[:, -1]                                # (num_win,)

            # window ids for this group
            win_ids = np.arange(next_win_id, next_win_id + num_win, dtype=np.int64)

            # X dataframe chunk
            X_parts.append(pd.DataFrame({
                "window_id":     np.repeat(win_ids, input_len)+1,
                "time_step":     np.tile(np.arange(input_len, dtype=np.int32), num_win),
                "close":         x_close.reshape(-1),
                "volume":        x_volume.reshape(-1),
                "event_datetime": x_dt.reshape(-1),
            }))

            # y dataframe chunk
            Y_parts.append(pd.DataFrame({
                "window_id":     np.repeat(win_ids, horizon_len)+1,
                "time_step":     np.tile(np.arange(horizon_len, dtype=np.int32), num_win),
                "close":         y_close.reshape(-1),
                "prev_close":    np.repeat(prev_c, horizon_len),
                "event_datetime": y_dt.reshape(-1),
            }))

            next_win_id += num_win

        # Public attributes
        if X_parts:
            self.X = pd.concat(X_parts, ignore_index=True)
        else:
            self.X = pd.DataFrame(columns=["window_id","time_step","close","volume","event_datetime"])

        if Y_parts:
            self.y = pd.concat(Y_parts, ignore_index=True)
        else:
            self.y = pd.DataFrame(columns=["window_id","time_step","close","prev_close","event_datetime"])

        self.num_windows = int(self.X["window_id"].max() + 1) if len(self.X) else 0
        self.input_len = input_len
        self.horizon_len = horizon_len
        self.window = window
        self.step_size = step_size

    def __len__(self) -> int:
        return self.num_windows

    def windows(self, window_id: int) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Convenience: return (X_rows, y_rows) for a given window_id."""
        Xw = self.X[self.X["window_id"] == window_id].sort_values("time_step")
        Yw = self.y[self.y["window_id"] == window_id].sort_values("time_step")
        return Xw, Yw


In [17]:
train_ds = WindowsDatasetVect(train[train['series_id']<=2])

X = train_ds.X
X['log_ret'] = np.log(train_ds.X['close']).groupby(train_ds.X['window_id']).diff()
features = X.groupby('window_id')['log_ret'].mean()


In [18]:
y_pred = train_ds.y.copy(deep=True)
y_pred['pred_close'] = np.exp(features.loc[features.index.repeat(10)].groupby(level=0).cumsum().to_frame()).reset_index(drop=True)
y_pred['pred_close'] *= y_pred['prev_close']

In [21]:
y_pred

Unnamed: 0,window_id,time_step,close,prev_close,event_datetime,pred_close
0,1,0,0.13562,0.13557,2024-01-01 01:00:00,0.135546
1,1,1,0.13518,0.13557,2024-01-01 01:01:00,0.135522
2,1,2,0.13517,0.13557,2024-01-01 01:02:00,0.135498
3,1,3,0.13534,0.13557,2024-01-01 01:03:00,0.135474
4,1,4,0.13540,0.13557,2024-01-01 01:04:00,0.135450
...,...,...,...,...,...,...
7889815,788982,5,0.40860,0.40730,2024-09-30 23:55:00,0.406926
7889816,788982,6,0.40880,0.40730,2024-09-30 23:56:00,0.406863
7889817,788982,7,0.40840,0.40730,2024-09-30 23:57:00,0.406801
7889818,788982,8,0.40840,0.40730,2024-09-30 23:58:00,0.406739


In [20]:
from icaif.metrics_np import evaluate_all_metrics_vectorized

evaluate_all_metrics_vectorized(
    y_true=train_ds.y,
    y_pred=y_pred,
    profile=True,
)

({'MSE': 2.378299231550189e-06,
  'MAE': 0.0007472836824129219,
  'IC': -0.13848047255423093,
  'IR': -0.2756082495605737,
  'SharpeRatio': -0.15349564657980103,
  'MDD': 0.999999999999022,
  'VaR': -0.005786984941798949,
  'ES': -0.008736429875581464},
 {'merge+MSE/MAE': 0.570541538996622,
  'prev_price_from_y_true': 0.0003291298635303974,
  'returns': 0.15428444813005626,
  'group_sizes': 0.22473601181991398,
  'ranks': 4.524651050101966,
  'IC/IR': 0.18926621600985527,
  'strategies': 0.23632220295257866,
  'risk/aggregation': 0.04131495812907815,
  'total': 5.941447178833187})

In [61]:
from icaif.metrics_np import evaluate_all_metrics_vectorized

evaluate_all_metrics_vectorized(
    y_true=y_true,
    y_pred=y_pred,
    x_test=df_x,
    profile=True
)

TypeError: evaluate_all_metrics_vectorized() got an unexpected keyword argument 'x_test'

In [23]:
import numba