# Regression Embedding Prep Smoke Test

Quick notebook to verify `experiments.regression.training_data_prep`:
- load a MegaMedical task
- encode it to embeddings
- sanity-check the embedding shapes
- optional: batch-size timing
- optional: regression smoke test on a single embedding feature


In [26]:
import os
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd

# Repository checkout that provides the `experiments` package. The default is
# the original cluster location; set the MVSEG_ORDERING_ROOT environment
# variable to run this notebook from a different checkout without editing it.
REPO_ROOT = Path(
    os.environ.get('MVSEG_ORDERING_ROOT', '/data/ddmg/mvseg-ordering')
).expanduser()
# Append only when missing so re-running the cell does not add duplicate entries.
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

from experiments.regression.training_data_prep import (
    build_embedding_training_data,
    load_megamedical_dataset,
)

from experiments.regression.regression_model import RegressionModel


In [27]:
# --- Config ---
# Arguments forwarded to load_megamedical_dataset.
SPLIT = 'train'
DATASET_TARGET = 0
DATASET_SIZE = 16

# Arguments forwarded to build_embedding_training_data.
DEVICE = 'cpu'
BATCH_SIZE = 16

# Encoder config to use. To try another encoder, point this at one of:
#   experiments/encoder_configs/multiverseg_default.yaml
#   experiments/encoder_configs/clip_default.yaml
#   experiments/encoder_configs/vit_default.yaml
#   experiments/encoder_configs/dinov2_default.yaml
#   experiments/encoder_configs/medsam_default.yaml
ENCODER_CFG_PATH = REPO_ROOT / 'experiments/encoder_configs/medsam_default.yaml'


In [28]:
# Load the configured MegaMedical task and report how many items came back.
dataset = load_megamedical_dataset(
    split=SPLIT,
    dataset_target=DATASET_TARGET,
    dataset_size=DATASET_SIZE,
)
print('Dataset length:', len(dataset))


No updates to index
Filtered task_df: 1248
got task df: 1248


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["label_type"].fillna("soft", inplace=True)


target_datasets: 1248
Dataset length: 16


In [29]:
# Build embedding training data from the loaded dataset with the configured encoder.
data = build_embedding_training_data(
    batch_size=BATCH_SIZE,
    device=DEVICE,
    encoder_cfg_path=ENCODER_CFG_PATH,
    dataset=dataset,
)

# Report the basic layout of the result.
print('data_indices shape:', data.data_indices.shape)
print('embeddings shape:', data.embeddings.shape)
print('dtype:', data.embeddings.dtype)

# Sanity checks: a 2-D embedding matrix with one row per data index.
assert len(data.embeddings.shape) == 2
assert data.data_indices.shape[0] == data.embeddings.shape[0]

# Preview the first five embedding rows; the slice already caps the frame at
# five rows, so no further truncation is needed.
pd.DataFrame(data.embeddings[:5])


  state = torch.load(self.checkpoint_path, map_location=torch.device("cpu"))


Batch Tensor Shape torch.Size([16, 1, 128, 128])
Batch Embeddings Shape torch.Size([16, 512])
data_indices shape: (16,)
embeddings shape: (16, 512)
dtype: float32


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.004055,-0.036124,0.009003,-0.004069,0.014717,-0.001177,0.001576,0.005607,0.034027,0.003028,...,0.034038,0.002839,0.005864,0.083753,0.040922,-0.002343,0.090615,0.033904,0.050029,0.043783
1,-0.004016,-0.037939,0.009438,-0.003101,0.015652,-0.00221,0.001419,0.004006,0.034052,0.003599,...,0.029093,0.002938,0.005478,0.084383,0.050693,-0.002577,0.094976,0.028655,0.0539,0.044005
2,-0.003939,-0.036336,0.009604,-0.002679,0.016442,-0.00266,0.000712,0.004072,0.035459,0.003564,...,0.020037,0.002921,0.005964,0.085536,0.037694,-0.002686,0.09895,0.031436,0.053855,0.04689
3,-0.004111,-0.035872,0.009907,-0.002233,0.015869,-0.003041,0.000953,0.004712,0.035336,0.003544,...,0.023143,0.003391,0.006828,0.085576,0.032298,-0.002452,0.092521,0.028952,0.060496,0.046844
4,-0.003604,-0.035998,0.009361,-0.003722,0.015567,-0.002233,0.000408,0.003535,0.034649,0.003418,...,0.041182,0.002955,0.005625,0.085057,0.032658,-0.002659,0.082764,0.02729,0.059032,0.046329


In [30]:
# Optional: batch-size timing on the same dataset
# NOTE(review): the recorded timings are nearly flat across batch sizes, so each
# call may be dominated by a fixed setup cost rather than batching — confirm
# before drawing conclusions from this table.
def _time_embedding_build(batch_size):
    """Return the wall-clock seconds one embedding build takes at ``batch_size``."""
    started = time.perf_counter()
    build_embedding_training_data(
        dataset=dataset,
        encoder_cfg_path=ENCODER_CFG_PATH,
        device=DEVICE,
        batch_size=batch_size,
    )
    return time.perf_counter() - started


batch_sizes = [1, 4, 16, 32]
rows = [
    {'batch_size': size, 'seconds': _time_embedding_build(size)}
    for size in batch_sizes
]

# Fastest configuration first.
timing_df = pd.DataFrame(rows).sort_values('seconds')
timing_df


  state = torch.load(self.checkpoint_path, map_location=torch.device("cpu"))


Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Size([1, 512])
Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Size([1, 512])
Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Size([1, 512])
Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Size([1, 512])
Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Size([1, 512])
Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Size([1, 512])
Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Size([1, 512])
Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Size([1, 512])
Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Size([1, 512])
Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Size([1, 512])
Batch Tensor Shape torch.Size([1, 1, 128, 128])
Batch Embeddings Shape torch.Siz

Unnamed: 0,batch_size,seconds
2,16,95.361744
3,32,97.675867
1,4,98.890304
0,1,99.050415


In [31]:
# Optional: quick regression smoke test on embedding feature 0
# The target is feature 0 itself plus small seeded Gaussian noise, so the
# inputs are reproducible across runs.
rng = np.random.default_rng(0)
x_train = data.embeddings[:, :1]  # keep a 2-D (n, 1) feature matrix
noise = 0.05 * rng.normal(size=x_train.shape[0])
y_train = x_train[:, 0] + noise

reg = RegressionModel(x_train=x_train, y_train=y_train, lr=1e-2)
losses = reg.train(
    num_epochs=100,
    plot_path='figures/regression/regression_from_embeddings.png',
)


epoch: 1, loss: 0.02220591902732849
epoch: 2, loss: 0.02140306681394577
epoch: 3, loss: 0.020632006227970123
epoch: 4, loss: 0.01989148184657097
epoch: 5, loss: 0.019180282950401306
epoch: 6, loss: 0.018497245386242867
epoch: 7, loss: 0.017841259017586708
epoch: 8, loss: 0.017211250960826874
epoch: 9, loss: 0.01660618744790554
epoch: 10, loss: 0.016025088727474213
epoch: 11, loss: 0.015466999262571335
epoch: 12, loss: 0.014931010082364082
epoch: 13, loss: 0.014416247606277466
epoch: 14, loss: 0.013921871781349182
epoch: 15, loss: 0.013447071425616741
epoch: 16, loss: 0.012991074472665787
epoch: 17, loss: 0.012553136795759201
epoch: 18, loss: 0.012132539413869381
epoch: 19, loss: 0.011728598736226559
epoch: 20, loss: 0.011340653523802757
epoch: 21, loss: 0.010968071408569813
epoch: 22, loss: 0.010610243305563927
epoch: 23, loss: 0.010266587138175964
epoch: 24, loss: 0.009936538524925709
epoch: 25, loss: 0.009619560092687607
epoch: 26, loss: 0.009315134026110172
epoch: 27, loss: 0.009022