# Load cached CLIP pre-projection features
Quick notebook that reproduces the cached-feature loading logic from `methods/ProLIP.py` (lines 103–114), then trains the visual projector on those cached features and evaluates it on the test split.

In [7]:
from pathlib import Path
import torch
import numpy as np

# Repository root, resolved relative to the kernel's working directory.
# This only points at the repo when the notebook is run from `notebooks/`.
PROJECT_ROOT = Path("..").resolve()

# Echo both locations so a wrong working directory is noticed immediately.
print(f"Project root: {PROJECT_ROOT}", f"CWD: {Path.cwd()}", sep="\n")

Project root: /home/hshi/Documents/researchproject/aihab/repo/aihab-clip
CWD: /home/hshi/Documents/researchproject/aihab/repo/aihab-clip/notebooks


In [8]:
# Configuration — adjust to match the cached features you have
# backbone: one of ["RN50", "RN101", "ViT-B/16", "ViT-B/32"]
#
# Every key is listed exactly once: a repeated key in a dict literal is not an
# error in Python, the last value silently wins, which can hide conflicting
# settings.
cfg = {
    # --- which cached features to load ---
    "backbone": "ViT-B/32",
    "dataset": "cs",          # dataset id used in feature folder name
    "shots": 0,               # number of shots used when caching
    "seed": 1,                # corresponds to `task` in ProLIP
    "aug_views": 1,           # number of augmented views saved / cycled through during training
    "root_path": str(PROJECT_ROOT),
    # --- projector training ---
    "train_epoch": 1,         # number of projector-training epochs
    "lr_v": 1e-5,             # learning rate for Adam on the projector
    "lambda_v": 1.0,          # weight on the MSE regularizer vs. original projection
    "feat_batch_size": 0,     # 0 => use the entire cached tensor each step (few-shot default)
                              # >0 => chunk the cached features into mini-batches of this size
                              #       to save memory (full-data mode)
}

def _canonical_backbone_name(backbone: str) -> str:
    if backbone == "ViT-B/16":
        return "ViTB16"
    if backbone == "ViT-B/32":
        return "ViTB32"
    return backbone

# Folder layout: <root>/features_<backbone>_<dataset>/<shots>_shot/seed<seed>
backbone_name = _canonical_backbone_name(cfg["backbone"])
feature_dir = (
    Path(cfg["root_path"])
    / f"features_{backbone_name}_{cfg['dataset']}"
    / f"{cfg['shots']}_shot"
    / f"seed{cfg['seed']}"
)
print(f"Expecting cached features at: {feature_dir}")

# Stop here with a readable message instead of failing later inside torch.load
if not feature_dir.exists():
    raise FileNotFoundError(f"Feature directory not found: {feature_dir}\nCheck cfg settings above.")

# List the cached files (sorted by path) as a quick sanity check
print("Contents:")
for cached_file in sorted(feature_dir.glob("*")):
    print(" -", cached_file.name)

Expecting cached features at: /home/hshi/Documents/researchproject/aihab/repo/aihab-clip/features_ViTB32_cs/0_shot/seed1
Contents:
 - f0.pth
 - label.pth


In [9]:
# Load labels and filter to in-range classes (mirrors ProLIP lines 103-114)
label_path = feature_dir / "label.pth"
# map_location="cpu": keep the cached tensors on the host so the cell also works
# on a machine without the device they were saved from; the training loop moves
# each batch to the projector's device itself.
train_labels = torch.load(label_path, map_location="cpu", weights_only=True)
print("label.pth loaded:", train_labels.shape, train_labels.dtype)

if train_labels.numel() == 0:
    raise ValueError(f"{label_path} contains no labels")

# Row count of the cache before filtering; every feature file must match it,
# otherwise indexing with `indices` would pair features with the wrong labels.
num_cached = train_labels.shape[0]

# If you know the exact number of classes, set m directly; here we take a safe upper bound.
# With m = max label + 1 every sample satisfies `label < m`, so nothing is dropped.
m = train_labels.max().item() + 1
indices = torch.where(train_labels < m)[0]
train_labels = train_labels[indices]
print("Filtered labels shape:", train_labels.shape)

# One tensor of pre-projection features per cached augmentation view (f0.pth, f1.pth, ...)
train_x_before_list = []
for num in range(cfg['aug_views']):
    fpath = feature_dir / f"f{num}.pth"
    feats = torch.load(fpath, map_location="cpu", weights_only=True)
    if feats.shape[0] != num_cached:
        raise ValueError(
            f"{fpath.name} has {feats.shape[0]} rows but label.pth has {num_cached}; "
            "the cached features and labels are out of sync."
        )
    feats = feats[indices]
    train_x_before_list.append(feats)
    print(f"Loaded f{num}.pth -> {feats.shape}, dtype={feats.dtype}")

print(f"Total views loaded: {len(train_x_before_list)}")

label.pth loaded: torch.Size([4200]) torch.int64
Filtered labels shape: torch.Size([4200])
Loaded f0.pth -> torch.Size([4200, 768]), dtype=torch.float16
Total views loaded: 1


In [10]:
# Example: inspect a sample vector from the first view
if train_x_before_list:
    # Shape of the whole first view: (num_samples, feature_dim)
    print("First view tensor shape:", train_x_before_list[0].shape)
    # First cached sample of that view
    sample = train_x_before_list[0][0]
    print("Sample feature vector length:", sample.numel())
    print("First 5 values:", sample[:5])
else:
    print("No views loaded — check cfg['aug_views'] and that f0.pth exists.")

Sample feature vector length: torch.Size([4200, 768])
Sample feature vector length: 768
First 5 values: tensor([ 1.4717,  1.2275, -0.1714,  0.2610,  1.8027], dtype=torch.float16)


# Create a projector, text weights, and config for training


In [11]:
import sys, copy
# Make the repo's top-level packages (methods, utils, data, ...) importable.
PROJECT_ROOT = Path("..").resolve()
# Guarded so that re-running this cell does not keep appending the same entry.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from methods.ProLIP import VisProjViT  # imports the class
from utils import clip_classifier
from data.templates import CS_TEMPLATES, CS_CLASSNAMES  # swap if using another dataset
from methods.utils import cls_acc, compute_image_features


In [12]:
import clip  # this repo’s bundled CLIP package
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The bundled `clip.load` is unpacked into three values here: the checkpoint's
# state dict, the model, and the image preprocessing transform.
state_dict, clip_model, preprocess = clip.load(cfg["backbone"], device=device)
# Build the text classifier from the class names and prompt templates.
# NOTE(review): `text_weights` is later right-multiplied onto image features
# (`image_features @ text_weights`); the recorded run printed shape [512, 20],
# presumably (embed_dim, num_classes) — confirm against `clip_classifier`.
texts, text_weights_before, text_weights = clip_classifier(CS_CLASSNAMES, CS_TEMPLATES, clip_model)
print("text_weights shape:", text_weights.shape) 


# Wrap the visual projection taken from the checkpoint in the projector module
# that the training cell optimises.
vit_proj = state_dict["visual.proj"]
proj = VisProjViT(vit_proj)

# Optional smoke test (left disabled); assumes train_x_before_list was built by the loading cell above
# batch_x = train_x_before_list[0].to(proj.vit_proj.device)
# image_features = proj(batch_x)  # shape: [batch, embed_dim]


text_weights shape: torch.Size([512, 20])


In [13]:
# === ProLIP hyperparameters ===
# Number of projector-training epochs. Hard-coded for this notebook;
# cfg['train_epoch'] is NOT consulted here.
train_epoch = 200
lr_v = cfg["lr_v"]            # learning rate for Adam on the projector
lambda_v = cfg["lambda_v"]    # weight on the MSE regularizer vs. original projection
aug_views = cfg["aug_views"]  # how many cached augmentation views you cycle through per epoch
# Mini-batch size over the cached features (0 => full batch). The training cell
# takes this value from cfg['feat_batch_size'], so it is read from cfg here as
# well rather than set to a literal that would be silently ignored; change it
# in cfg to switch to mini-batches.
feat_batch_size = cfg.get("feat_batch_size", 0)

# Training the projector

In [14]:
import math
import torch.nn.functional as F

device = proj.vit_proj.device  # train on whichever device holds the projector weights
mse = torch.nn.MSELoss(reduction="sum")

# Detached snapshot of the initial projection; anchor of the MSE regularizer
vit_proj_copy = proj.vit_proj.detach().clone()

optimizer = torch.optim.Adam(proj.parameters(), lr=lr_v, eps=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_epoch)

# Mini-batch size over the cached features (0 => full batch). It is read from
# cfg once, up front, because it does not change between epochs; note that this
# replaces any notebook-level `feat_batch_size` value set earlier.
feat_batch_size = cfg.get("feat_batch_size", 0)


def _projector_step(batch_x, batch_y, reg_weight):
    """Run one optimisation step of the projector on a single batch.

    Args:
        batch_x: pre-projection image features, already on `device`.
        batch_y: integer class labels for `batch_x`, already on `device`.
        reg_weight: multiplier applied to the MSE regularizer for this step.

    Returns:
        (n_correct, n_samples, ce_loss, mse_loss), where `n_correct` is derived
        from the percentage returned by `cls_acc` and the two losses are Python
        floats measured before the parameter update.
    """
    image_features = proj(batch_x)
    image_features = F.normalize(image_features, dim=-1)
    logits = 100.0 * image_features @ text_weights

    # Summed squared distance between the current and the initial projection
    initial_params = vit_proj_copy.view(-1)
    fine_tuned_params = proj.vit_proj.view(-1)
    mse_loss = mse(initial_params, fine_tuned_params)

    loss_ce = F.cross_entropy(logits, batch_y)
    loss = loss_ce + reg_weight * mse_loss

    n_samples = len(logits)
    n_correct = cls_acc(logits, batch_y) / 100.0 * n_samples
    step_stats = (n_correct, n_samples, loss_ce.item(), mse_loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return step_stats


print("== Projector training ==")
proj.train()
cnt = 0

for epoch in range(train_epoch):
    correct = 0.0
    total = 0
    loss_ce_hist, loss_mse_hist = [], []

    # Cycle through the cached augmentation views. The counter is advanced
    # before it is used, so with aug_views > 1 the first epoch reads view 1
    # (kept exactly as in the original loop).
    if (cnt + 1) % aug_views == 0:
        cnt = 0
    else:
        cnt += 1

    train_x_before = train_x_before_list[cnt]
    target = train_labels

    if feat_batch_size and feat_batch_size > 0:
        # Mini-batch mode: the regularizer is applied once per chunk, so its
        # weight is divided by the number of chunks.
        N = train_x_before.size(0)
        num_chunks = math.ceil(N / feat_batch_size)
        reg_weight = lambda_v / float(max(num_chunks, 1))
        # Slices are views; slicing past the end clamps to N.
        batches = [
            (train_x_before[i0:i0 + feat_batch_size], target[i0:i0 + feat_batch_size])
            for i0 in range(0, N, feat_batch_size)
        ]
    else:
        # Full-batch mode: one step per epoch over the entire cached tensor.
        reg_weight = lambda_v
        batches = [(train_x_before, target)]

    for feats_chunk, labels_chunk in batches:
        n_correct, n_samples, ce_value, mse_value = _projector_step(
            feats_chunk.to(device, non_blocking=True),
            labels_chunk.to(device, non_blocking=True),
            reg_weight,
        )
        correct += n_correct
        total += n_samples
        loss_ce_hist.append(ce_value)
        loss_mse_hist.append(mse_value)

    scheduler.step()
    print(f"Epoch {epoch+1}/{train_epoch} | Acc {correct/total:.4f} "
          f"| Loss_ce {sum(loss_ce_hist)/len(loss_ce_hist):.4f} "
          f"| Loss_mse {sum(loss_mse_hist)/len(loss_mse_hist):.4f}")

torch.cuda.empty_cache()
proj.eval()


== Projector training ==
Epoch 1/200 | Acc 0.2612 | Loss_ce 2.5625 | Loss_mse 0.0000
Epoch 2/200 | Acc 0.2888 | Loss_ce 2.3984 | Loss_mse 0.0100
Epoch 3/200 | Acc 0.3157 | Loss_ce 2.2637 | Loss_mse 0.0334
Epoch 4/200 | Acc 0.3376 | Loss_ce 2.1504 | Loss_mse 0.0635
Epoch 5/200 | Acc 0.3586 | Loss_ce 2.0547 | Loss_mse 0.0958
Epoch 6/200 | Acc 0.3810 | Loss_ce 1.9727 | Loss_mse 0.1266
Epoch 7/200 | Acc 0.3945 | Loss_ce 1.9043 | Loss_mse 0.1537
Epoch 8/200 | Acc 0.4124 | Loss_ce 1.8467 | Loss_mse 0.1759
Epoch 9/200 | Acc 0.4260 | Loss_ce 1.7969 | Loss_mse 0.1927
Epoch 10/200 | Acc 0.4355 | Loss_ce 1.7549 | Loss_mse 0.2041
Epoch 11/200 | Acc 0.4424 | Loss_ce 1.7178 | Loss_mse 0.2103
Epoch 12/200 | Acc 0.4500 | Loss_ce 1.6865 | Loss_mse 0.2119
Epoch 13/200 | Acc 0.4579 | Loss_ce 1.6602 | Loss_mse 0.2124
Epoch 14/200 | Acc 0.4640 | Loss_ce 1.6377 | Loss_mse 0.2073
Epoch 15/200 | Acc 0.4690 | Loss_ce 1.6182 | Loss_mse 0.1997
Epoch 16/200 | Acc 0.4724 | Loss_ce 1.6025 | Loss_mse 0.1908
Epoch 17

VisProjViT()

In [15]:
# Prepare the test loader
PROJECT_ROOT = Path("..").resolve()
# Guarded so that re-running this cell does not keep appending the same entry.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
from copy import deepcopy

from utils import load_cfg_from_cfg_file, merge_cfg_from_list
from main import build_loaders

# Base settings plus the dataset-specific file for the "cs" dataset.
base_cfg_path = PROJECT_ROOT / "configs" / "base.yaml"
ds_cfg_path = PROJECT_ROOT / "configs" / "cs.yaml"

# Load the base config, then overlay the dataset config on top of it
# (keys present in both files take the dataset file's value).
data_cfg = load_cfg_from_cfg_file(str(base_cfg_path))
data_cfg.update(load_cfg_from_cfg_file(str(ds_cfg_path)))

# Work on an independent copy, since the paths inside it are rewritten in place below.
data_cfg = deepcopy(data_cfg)


def _resolve_paths(cfg, root):
    data_cfg = cfg['data']
    data_cfg['dataset_paths'] = [str((root / p).resolve()) for p in data_cfg['dataset_paths']]
    data_cfg['index_file_names'] = [str((root / 'data/CS_Xplots_2019_2023_train'/ p).resolve()) for p in data_cfg['index_file_names']]
    return cfg
print(PROJECT_ROOT)
# Turn the relative dataset / index-file paths into absolute ones before building the loaders.
data_cfg = _resolve_paths(data_cfg, PROJECT_ROOT)
# Dump the effective configuration so the run is self-documenting.
print(data_cfg)
# Only `dl_te` is used by the evaluation cell below; the other return values are kept for inspection.
dl_tr, dl_te, train_tf, test_tf, info = build_loaders(data_cfg)
print(f"Train loader batches: {len(dl_tr)}, Test loader batches: {len(dl_te)}")




/home/hshi/Documents/researchproject/aihab/repo/aihab-clip
SUBSAMPLE_CLASSES: all
aug_views: 1
backbone: ViT-B/16
batch_size: 16
data:
  batch_size: 16
  dataset_paths: ['/home/hshi/Documents/researchproject/aihab/repo/aihab-clip/data/CS_Xplots_2019_2023_train']
  index_file_names: ['/home/hshi/Documents/researchproject/aihab/repo/aihab-clip/data/CS_Xplots_2019_2023_train/CS_Xplots_2019_23_NEW02OCT24.csv']
  num_workers: 0
  preprocessing:
    augmentations:
      bottom_crop: False
      crop: ratio
      flip: False
      random_crop: True
      rotation: True
    resize: 439
  shuffle: True
dataset: cs
feat_batch_size: 0
lambda_funct_1_N: True
lambda_funct_1_N2: False
lambda_v: 0.1
lr_v: 1e-05
method: ProLIP
output_dir: ./results
projector:
  checkpoint: None
  enabled: False
  eval_only: False
  output_dir: ./results
  require_cached_features: True
resolution: 224
root_path: ./
save_features: False
search_lr: False
seed: 1
shots: 0
shuffle: True
train_epoch: 100


Loading images from /home/hshi/Documents/researchproject/aihab/repo/aihab-clip/data/CS_Xplots_2019_2023_train: 100%|██████████| 4233/4233 [01:23<00:00, 50.71file/s]
Loading images from /home/hshi/Documents/researchproject/aihab/repo/aihab-clip/data/CS_Xplots_2019_2023_test: 100%|██████████| 1398/1398 [00:24<00:00, 56.11file/s]


Train loader batches: 263, Test loader batches: 88


In [16]:
# Test the model

# 1) Extract pre-projection features/labels from the test loader
# NOTE(review): `to_cpu=True` presumably returns CPU tensors — confirm in
# `methods/utils.py`; either way they are moved to the projector's device below.
test_features_before, test_labels = compute_image_features(clip_model, dl_te, to_cpu=True)
device = proj.vit_proj.device
test_features_before = test_features_before.to(device)
test_labels = test_labels.to(device)

# 2) Apply the trained projector and evaluate
# Same scoring as in training: L2-normalise the projected features and scale
# the similarity to the text weights by 100.
with torch.no_grad():
    test_features = proj(test_features_before)
    test_features = F.normalize(test_features, dim=-1)
    logits = 100.0 * test_features @ text_weights
    preds = logits.argmax(dim=1)
    # Top-1 accuracy in percent
    acc = (preds == test_labels).float().mean().item() * 100.0

print(f"Test accuracy: {acc:.2f}%")


Test accuracy: 52.86%
