In [None]:
# Experiment identifier: the setup cell loads `configs/<experiment_name>.yaml`
# from the project root, so this must match an existing config file name.
experiment_name = 'Clip16_NIGHTS_Lora_Default_123_First'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import logging
import os
import sys
import yaml
import wandb

# Project setup: make the project's `src` directory importable.
PROJECT_ROOT = '/content/drive/MyDrive/perceptual-vits-fashion-forecasting'
SRC_DIR = os.path.join(PROJECT_ROOT, 'src')
if SRC_DIR not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.append(SRC_DIR)

# Load config: the YAML file is selected by `experiment_name`.
config_path = os.path.join(PROJECT_ROOT, 'configs', f"{experiment_name}.yaml")
if not os.path.isfile(config_path):
    raise FileNotFoundError(f"Config file not found: {config_path}")

# Explicit encoding keeps parsing independent of the runtime's locale.
with open(config_path, 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

# An empty YAML file parses to None; fail here with a clear message rather
# than with a TypeError on the first `config[...]` lookup in a later cell.
if not isinstance(config, dict):
    raise ValueError(f"Config file did not parse to a mapping: {config_path}")

# Logging: `force=True` removes any handlers already attached to the root
# logger before installing the new configuration, so re-runs stay clean.
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s] %(message)s',
    force=True,
)

Mounted at /content/drive


In [None]:
# Log into Weights & Biases using the API key stored in the project folder
wandb_key_path = os.path.join(PROJECT_ROOT, 'wandb_key.txt')
with open(wandb_key_path) as key_file:
    # Strip surrounding whitespace/newline so only the key itself is passed on
    wandb_api_key = key_file.read().strip()

wandb.login(key=wandb_api_key)

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnils-grossepieper[0m ([33mnils-grossepieper-eberhard-karls-universit-t-t-bingen[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Collect the W&B run metadata (run name, tags, notes) from the loaded config
vision_cfg = config['vision_model']
model_family = vision_cfg['model_family']

# Reject unsupported families before reading any family-specific settings
if model_family not in ('cnn', 'vit'):
  raise ValueError('Unknown model family.')

# The training name and the notes live in the family-specific sub-section;
# the training method and dataset name are shared by both families
family_cfg = vision_cfg[model_family]
vision_model_training_name = family_cfg['vision_model_training_name']
training_method = vision_cfg['training_method']
dataset_name = vision_cfg['dataset_name']
notes = family_cfg['wandb_notes']

if model_family == 'cnn':
  model_type = vision_cfg['model_type']
  model_label = model_type
  family_tag = 'cnn'
  job_type = 'Cnn training parameter'
else:
  # Shorten every comma-separated model id to the text before '_vit'
  # (spelling 'open_clip' as 'openclip') and join the pieces with '+'
  backbone_ids = vision_cfg['model_type'].split(',')
  model_token = '+'.join(
      backbone.split('_vit')[0].replace('open_clip', 'openclip')
      for backbone in backbone_ids
  )
  model_label = model_token
  family_tag = 'vision_transformer'
  job_type = 'Vision transformer training parameter'

# NOTE(review): `job_type` is assigned above but the init call below passes the
# literal 'experiment_parameters' instead - confirm which one is intended.
run_name = f"{vision_model_training_name}_{model_label}_{training_method}_{dataset_name}_experiment_parameters"
tags = [vision_model_training_name, family_tag, model_label, training_method, dataset_name]

# Log the config file as an artifact in W&B
init_kwargs = dict(
    project=vision_cfg['wandb_project'],
    name=run_name,
    group=vision_model_training_name,
    job_type='experiment_parameters',
    tags=tags,
    notes=notes,
)
with wandb.init(**init_kwargs) as run:
    config_artifact = wandb.Artifact(
        name=f"{run_name}_config",
        type='config',
        description='Configuration file for vision model training',
    )
    config_artifact.add_file(config_path, name='config.yaml')
    run.log_artifact(config_artifact)

In [None]:
# Inspect GPU
!nvidia-smi

# (Optional) Remove conflicting installs from previous sessions
!pip -q uninstall -y tensorflow tensorflow-gpu keras jax jaxlib \
  torch torchvision torchaudio nvidia-cudnn-cu12 nvidia-cublas-cu12 pillow || true

# Install a CUDA-matched PyTorch FIRST (Colab usually uses CUDA 12.1) ---
!pip -q install --index-url https://download.pytorch.org/whl/cu121 \
  torch torchvision torchaudio

# Install DreamSim deps WITHOUT pulling another torch build
# (Everything here excludes torch/vision/torchaudio)
!pip -q install \
  configargparse dists-pytorch lpips numpy pandas pytorch-lightning \
  PyYAML scipy tensorboard timm torchmetrics tqdm transformers \
  open-clip-torch peft>=0.2.0

# Pillow upgrade (fix ImportError: is_directory)
!pip -q install 'pillow>=10.1.0'

# OpenAI CLIP (git)
!pip -q install git+https://github.com/openai/CLIP.git

# Dinov3
!pip install timm

# (Optional) Tame CUDA allocator fragmentation for long sessions
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128'

Thu Oct 30 16:16:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   34C    P0             51W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Restore the NIGHTS dataset from the zip archive stored on Google Drive.
# NOTE(review): the download cell further down also populates
# /content/datasets/nights. The recorded outputs show 58G after this cell and
# 115G after that one, so running both appears to leave two copies on disk -
# presumably only one of the two cells is meant to be run; confirm.

# Make destination folder
!mkdir -p /content/datasets/nights

# Set the working directory
%cd /content/datasets

# Copy zip from Google Drive to local disk
!cp /content/drive/MyDrive/perceptual-vits-fashion-forecasting/datasets/NIGHTS.zip nights.zip

# Unzip into /content/datasets/nights (-q: quiet, -o: overwrite without prompting)
# NOTE(review): if the archive already contains a top-level `nights/` folder this
# produces /content/datasets/nights/nights - confirm the layout matches the
# configured `dataset_root`.
!unzip -qo nights.zip -d nights

# Remove the local copy of the zip to save space
!rm nights.zip

# Check the space used
!du -sh nights


/content/datasets
58G	nights


In [None]:
# Make destination folder
!mkdir -p /content/datasets/nights

# Set the working directory
%cd /content/datasets

# Download and unzip the data -> /content/datasets/nights
# `unzip` is called without `-d`, so files are extracted relative to the working
# directory (/content/datasets).
# NOTE(review): landing in /content/datasets/nights therefore presumes the archive
# has a top-level `nights/` folder - confirm.
!wget -O nights.zip https://data.csail.mit.edu/nights/nights.zip
!unzip -qo nights.zip
!rm /content/datasets/nights.zip

# Check the space
!du -sh nights

/content/datasets
--2025-10-30 16:42:50--  https://data.csail.mit.edu/nights/nights.zip
Resolving data.csail.mit.edu (data.csail.mit.edu)... 128.52.131.233
Connecting to data.csail.mit.edu (data.csail.mit.edu)|128.52.131.233|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 61475483837 (57G) [application/zip]
Saving to: ‘nights.zip’


2025-10-30 17:21:48 (25.1 MB/s) - ‘nights.zip’ saved [61475483837/61475483837]

115G	nights


In [None]:
# Make destination folder
!mkdir -p /content/datasets/fashion_triplets

# Copy everything from Drive into local -> /content/datasets/fashion_triplets
# Flags: -a archive mode (recursive, preserves attributes), -h human-readable
# sizes, --info=progress2 a single overall progress line.
# The trailing slash on the source copies the directory's contents rather than
# the directory itself.
!rsync -ah --info=progress2 \
  "/content/drive/MyDrive/perceptual-vits-fashion-forecasting/datasets/fashion_triplets/" \
  "/content/datasets/fashion_triplets/"

         56.97M 100%    3.13MB/s    0:00:17 (xfr#809, to-chk=0/811)


In [None]:
def get_num_workers(max_workers: int = 16, cpu_fraction: float = 0.75) -> int:
    """Suggest a worker count based on the CPUs available to this process.

    Args:
        max_workers: Upper bound for the suggestion (default 16).
        cpu_fraction: Share of the detected CPUs to use (default 0.75).

    Returns:
        ``floor(cpus * cpu_fraction)`` clamped to ``max_workers``, and never
        less than 1.
    """
    try:
        # Counts only the CPUs this process is allowed to run on.
        cpus = len(os.sched_getaffinity(0))
    except AttributeError:
        # `sched_getaffinity` does not exist on every platform; fall back to
        # the total CPU count, or 2 when that cannot be determined.
        cpus = os.cpu_count() or 2
    workers = max(1, min(max_workers, int(cpus * cpu_fraction)))
    print(f"Detected {cpus} CPUs -> suggested num_workers = {workers}")
    return workers

num_workers = get_num_workers()

Detected 12 CPUs ‚Üí suggested num_workers = 9


In [None]:
from types import SimpleNamespace
import torch
from vision_models.dreamsim.training.train import run as train_vit
from vision_models.train_cnn.train import run as train_cnn


# Build the argument namespace for the configured model family and hand it to
# the matching training entry point (`train_vit` or `train_cnn`).
vision_cfg = config['vision_model']

if model_family == 'vit':
  vit_cfg = vision_cfg['vit']

  # define vision-transformer variables
  args = SimpleNamespace(
      # run/meta
      seed=vision_cfg['vision_model_seed'],
      tag=vision_cfg['tag'],
      project_root=config['global']['project_root'],
      log_dir=vit_cfg['log_dir'],
      save_mode=vit_cfg['save_mode'],  # {'adapter_only','entire_model','all'}
      wandb_project=vision_cfg['wandb_project'],
      vision_model_training_name=vit_cfg['vision_model_training_name'],
      wandb_notes=vit_cfg['wandb_notes'],

      # model
      model_type=vision_cfg['model_type'],
      feat_type=vit_cfg['feat_type'],
      stride=vit_cfg['stride'],
      use_lora=vit_cfg['use_lora'],
      hidden_size=vision_cfg['hidden_size'],   # ignored when use_lora=True
      normalize_embeds=vit_cfg['normalize_embeds'],
      load_size=vit_cfg['load_size'],

      # data (paths are stripped of stray whitespace; the second root is optional)
      dataset_root=vit_cfg['dataset_root'].strip(),
      second_dataset_root=(
        vit_cfg['second_dataset_root'].strip()
        if vit_cfg['second_dataset_root'] is not None else None
      ),
      dataset_name=vision_cfg['dataset_name'],
      num_workers=num_workers,

      # training
      lr=vit_cfg['lr'],
      weight_decay=vit_cfg['weight_decay'],
      batch_size=vit_cfg['batch_size'],
      epochs=vit_cfg['epochs'],
      margin=vit_cfg['margin'],
      patience=vit_cfg['patience'],
      min_delta=vit_cfg['min_delta'],

      # LoRA
      lora_r=vit_cfg['lora_r'],
      lora_alpha=vit_cfg['lora_alpha'],
      lora_dropout=vit_cfg['lora_dropout'],

      # Safety
      auto_save=2,
      load_path=None,
      load_lora_epoch=None
  )

  train_vit(args)

elif model_family == 'cnn':
  cnn_cfg = vision_cfg['cnn']

  # define CNN variables
  args = SimpleNamespace(
      # run/meta
      seed=vision_cfg['vision_model_seed'],
      tag=vision_cfg['tag'],
      project_root=config['global']['project_root'],
      log_dir=cnn_cfg['log_dir'],
      wandb_project=vision_cfg['wandb_project'],
      vision_model_training_name=cnn_cfg['vision_model_training_name'],
      wandb_notes=cnn_cfg['wandb_notes'],

      # model
      model_type=vision_cfg['model_type'],
      mlp=cnn_cfg['mlp'],
      hidden_size=vision_cfg['hidden_size'],
      normalize_embeds=cnn_cfg['normalize_embeds'],
      load_size=cnn_cfg['load_size'],

      # data (paths are stripped of stray whitespace; the second root is optional)
      dataset_root=cnn_cfg['dataset_root'].strip(),
      second_dataset_root=(
        cnn_cfg['second_dataset_root'].strip()
        if cnn_cfg['second_dataset_root'] is not None else None
      ),
      dataset_name=vision_cfg['dataset_name'],
      num_workers=num_workers,

      # training
      lr=cnn_cfg['lr'],
      weight_decay=cnn_cfg['weight_decay'],
      batch_size=cnn_cfg['batch_size'],
      epochs=cnn_cfg['epochs'],
      margin=cnn_cfg['margin'],
      patience=cnn_cfg['patience'],
      min_delta=cnn_cfg['min_delta'],

      # Safety
      auto_save=2,
      load_path=None
  )

  train_cnn(args)

else:
  # Report the value actually dispatched on. `model_type` is only assigned by
  # the CNN branch of the metadata cell, so formatting it here could raise a
  # NameError instead of this ValueError.
  raise ValueError(f"Unknown model family: {model_family}")

[INFO] NumExpr defaulting to 12 threads.
[INFO] Seed set to 123
[INFO] Arguments: {'seed': 123, 'tag': 'Clip16_DreamSim_Lora_First_Model', 'project_root': '/content/drive/MyDrive/perceptual-vits-fashion-forecasting', 'log_dir': '/vision_models/vits_training', 'save_mode': 'adapter_only', 'wandb_project': 'vision-model-training', 'vision_model_training_name': 'Clip16_DreamSim_Lora_First_Model', 'wandb_notes': 'Training run for the Lora Clip16 model on NIGHTS data DreamSim style.', 'model_type': 'clip_vitb16', 'feat_type': 'embedding', 'stride': '16', 'use_lora': True, 'hidden_size': 512, 'normalize_embeds': True, 'load_size': 224, 'dataset_root': '/content/datasets/nights/', 'second_dataset_root': None, 'dataset_name': 'nights', 'num_workers': 9, 'lr': 0.0003, 'weight_decay': 0.0, 'batch_size': 32, 'epochs': 8, 'margin': 0.05, 'patience': 10, 'min_delta': 1.0, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.2, 'auto_save': 2, 'load_path': None, 'load_lora_epoch': None}
[INFO] [RUN] ex

[INFO] Using 16bit Automatic Mixed Precision (AMP)
[INFO] 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
[INFO] GPU available: True (cuda), used: True
[INFO] TPU available: False, using: 0 TPU cores
[INFO] HPU available: False, using: 0 HPUs
[INFO] [PL] accelerator=gpu precision=16-mixed root_device=cuda:0 | cuda=True count=1
[INFO] Using save mode: adapter_only


[CUDA] NVIDIA A100-SXM4-80GB


[INFO] Training
[INFO] You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


Total params: 86390016 | Trainable params: 589824 | % Trainable: 0.6827455617093531


[INFO] LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/utilities/model_summary/model_summary.py:231: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.
[INFO] 
  | Name             | Type      | Params | Mode 
-------------------------------------------------------
0 | perceptual_model | PeftModel | 86.4 M | train
1 | criterion        | HingeLoss | 0      | train
-------------------------------------------------------
589 K     Trainable params
85.8 M    Non-trainable params
86.4 M    Total params
345.560   Total estimated model params size (MB)
127       Modules in train mode
175       Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[INFO] `Trainer.fit` stopped: `max_epochs=8` reached.


Done :)
