In [1]:
import os, yaml, sys
import numpy as np
import torch
import h5py
import joblib
from torchvision import models
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.decomposition import PCA
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from torchvision.models.feature_extraction import (
    create_feature_extractor,
    get_graph_node_names,
)

ENV = os.getenv("MY_ENV", "dev")
with open("../../config.yaml", "r") as f:
    config = yaml.safe_load(f)
paths = config[ENV]["paths"]
sys.path.append(paths["src_path"])
from general_utils.utils import create_RDM, get_device
from image_processing.utils import get_usual_transform, load_torchvision_model, map_anns_names, load_timm_model
from image_processing.computational_models import img_feats_extraction, map_image_order_from_ann_to_monkey
from general_utils.utils import print_wise, get_device, decode_matlab_strings, create_RDM, subsample_RDM
device = get_device()

15:23:00 - device being used: mps


## Feature extraction
- 1. create dataset and transform
- 2. loop through dataset
- 3. create a list to keep the indices of presentation of the images (see presentation order and mapping)
- 4. compute RDM and store it
- 5. do PCA and store it
- 6. project features
- 7. map ANNs' presentation order onto neurons' presentation order (so that then we have everything ordered)
- 8. compute RDM and store it
- 9. store dimensionality reduced features and PCA object

Initialize variables

In [None]:
from dataclasses import dataclass

@dataclass
class Cfg:
    imsize: int = 384
    data_folder: str = 'fewer_occlusion'
    batch_size: int = 100
    n_components: int = 3
    model_name: str = 'vit_l_16'
    layer_name: str = 'blocks.18.mlp.fc2'
    monkey_name: str = 'paul' 
    date: str = '230204'


cfg = Cfg()

In [4]:
get_graph_node_names(model)

(['x',
  'attn_mask',
  'patch_embed.getattr',
  'patch_embed.getitem',
  'patch_embed.getitem_1',
  'patch_embed.getitem_2',
  'patch_embed.getitem_3',
  'patch_embed.eq',
  'patch_embed._assert',
  'patch_embed.eq_1',
  'patch_embed._assert_1',
  'patch_embed.proj',
  'patch_embed.flatten',
  'patch_embed.transpose',
  'patch_embed.norm',
  'pos_embed',
  'cls_token',
  'getattr',
  'getitem',
  'expand',
  'cat',
  'add',
  'pos_drop',
  'patch_drop',
  'norm_pre',
  'blocks.0.norm1.blocks_0_norm1_weight',
  'blocks.0.norm1.blocks_0_norm1_bias',
  'blocks.0.norm1.layer_norm',
  'blocks.0.attn.getattr',
  'blocks.0.attn.getitem',
  'blocks.0.attn.getitem_1',
  'blocks.0.attn.getitem_2',
  'blocks.0.attn.qkv',
  'blocks.0.attn.reshape',
  'blocks.0.attn.permute',
  'blocks.0.attn.unbind',
  'blocks.0.attn.getitem_3',
  'blocks.0.attn.getitem_4',
  'blocks.0.attn.getitem_5',
  'blocks.0.attn.q_norm',
  'blocks.0.attn.k_norm',
  'blocks.0.attn.scaled_dot_product_attention',
  'blocks.0.

In [None]:
device = get_device()
transform = get_usual_transform(resize_size=384, normalize=False)

dataset = ImageFolder(
    root=f"{paths['livingstone_lab']}/Stimuli/{cfg.data_folder}/",
    transform=transform,
    is_valid_file=lambda x: not x.endswith("Thumbs.db"), 
    allow_empty=True, 
)
mapping_idx = map_image_order_from_ann_to_monkey(paths, cfg.monkey_name, cfg.date, dataset)
dataloader = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True)
model = load_timm_model(cfg.model_name, 384, device)
img_feats_extraction(paths, 0, cfg.layer_name, cfg.model_name, model, dataloader, mapping_idx, cfg.monkey_name, cfg.date, cfg.n_components, device)

15:23:01 - device being used: mps
15:23:09 - rank 0 loaded batch 0 of shape (2, 590848)
15:23:09 - rank 0 loaded batch 1 of shape (2, 590848)
15:23:09 - rank 0 loaded batch 2 of shape (2, 590848)
15:23:09 - rank 0 loaded batch 3 of shape (2, 590848)
15:23:10 - rank 0 loaded batch 4 of shape (2, 590848)
15:23:10 - rank 0 loaded batch 5 of shape (2, 590848)
15:23:10 - rank 0 loaded batch 6 of shape (2, 590848)
15:23:10 - rank 0 loaded batch 7 of shape (2, 590848)
15:23:11 - rank 0 loaded batch 8 of shape (2, 590848)
15:23:11 - rank 0 loaded batch 9 of shape (2, 590848)
15:23:11 - rank 0 loaded batch 10 of shape (2, 590848)
15:23:11 - rank 0 loaded batch 11 of shape (2, 590848)
15:23:12 - rank 0 loaded batch 12 of shape (2, 590848)
15:23:12 - rank 0 loaded batch 13 of shape (2, 590848)
15:23:12 - rank 0 loaded batch 14 of shape (2, 590848)
15:23:12 - rank 0 loaded batch 15 of shape (2, 590848)
15:23:13 - rank 0 loaded batch 16 of shape (2, 590848)
15:23:13 - rank 0 loaded batch 17 of shap

KeyboardInterrupt: 

create dataset and transform

In [3]:
transform = get_usual_transform(resize_size=384, normalize=False)

dataset = ImageFolder(
    root=f"{paths['livingstone_lab']}/Stimuli/{cfg.data_folder}/",
    transform=transform,
    is_valid_file=lambda x: not x.endswith("Thumbs.db"), 
    allow_empty=True, 
)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

load model 

In [4]:
model = load_timm_model(cfg.model_name, 384, device)

loop through dataset

create a list to keep the indices of presentation of the images (see presentation order and mapping)

In [None]:
allimgs_path = f"{paths['livingstone_lab']}/tiziano/data/{cfg.monkey_name}_allimages{cfg.date}.mat"
with h5py.File(allimgs_path, "r") as f:
    refs = f["allimages"][:]      # shape (N, 1) of object refs
    image_names = decode_matlab_strings(f, refs)
    image_names = list(dict.fromkeys(image_names)) 
presentation_order = [os.path.basename(path) for path, _ in dataset.samples]
print(presentation_order)
print(image_names)
# Create a mapping from list_b to list_a
mapping_idx = [presentation_order.index(x) for x in image_names]
mapped_order = [presentation_order[i] for i in mapping_idx]
print(mapped_order)

['BigAnimate_01.jpg', 'BigAnimate_02.jpg', 'BigAnimate_03.jpg', 'BigAnimate_04.jpg', 'BigAnimate_05.jpg', 'BigAnimate_06.jpg', 'BigAnimate_07.jpg', 'BigAnimate_08.jpg', 'BigAnimate_09.jpg', 'BigAnimate_10.jpg', 'BigAnimate_11.jpg', 'BigAnimate_12.jpg', 'BigAnimate_13.jpg', 'BigAnimate_14.jpg', 'BigAnimate_15.jpg', 'BigAnimate_16.jpg', 'BigAnimate_17.jpg', 'BigAnimate_18.jpg', 'BigAnimate_19.jpg', 'BigAnimate_20.jpg', 'BigAnimate_21.jpg', 'BigAnimate_22.jpg', 'BigAnimate_23.jpg', 'BigAnimate_24.jpg', 'BigAnimate_25.jpg', 'BigAnimate_26.jpg', 'BigAnimate_27.jpg', 'BigAnimate_28.jpg', 'BigAnimate_29.jpg', 'BigAnimate_30.jpg', 'BigAnimate_31.jpg', 'BigAnimate_32.jpg', 'BigAnimate_33.jpg', 'BigAnimate_34.jpg', 'BigAnimate_35.jpg', 'BigAnimate_36.jpg', 'BigAnimate_37.jpg', 'BigAnimate_38.jpg', 'BigAnimate_39.jpg', 'BigAnimate_40.jpg', 'BigAnimate_41.jpg', 'BigAnimate_42.jpg', 'BigAnimate_43.jpg', 'BigAnimate_44.jpg', 'BigAnimate_45.jpg', 'BigAnimate_46.jpg', 'BigAnimate_47.jpg', 'BigAnimate_

map ANNs' presentation order onto neurons' presentation order (so that then we have everything ordered)   
+ compute RDM and store it

In [None]:
#all_feats = all_feats[mapping, :, :]
RDM_vec = create_RDM(all_feats.T)

# np.savez_compressed(RDM_save_name, RDM_vec)

array([0.87674255, 0.94231483, 0.89238954, 0.86995213, 0.94218925,
       0.90243311, 0.80311104, 0.78804938, 0.85510519, 0.92680236,
       0.92788447, 0.92866926, 0.84817898, 0.87856231, 0.79127428])

do PCA, store it and project features

In [None]:
pca = PCA(n_components=cfg.n_components)
pca.fit(all_feats)
# joblib.dump(pca, pca_save_name)
all_feats_redu = pca.transform(all_feats)
# np.savez_compressed(feats_save_name, all_feats_redu.T)