In [1]:
import os, yaml, sys
import numpy as np
from sklearn.decomposition import IncrementalPCA
from torchvision.models.feature_extraction import (
    create_feature_extractor,
    get_graph_node_names,
)
import torch 
import cv2

ENV = os.getenv("MY_ENV", "dev")
with open("../../config.yaml", "r") as f:
    config = yaml.safe_load(f)
paths = config[ENV]["paths"]
sys.path.append(paths["src_path"])
from general_utils.utils import print_wise
from image_processing.utils import read_video, get_video_dimensions, concatenate_frames_batch, shuffle_frames


## Steps
(parallel over the layers of the video)
1. Load videos until you get approx 2 or 3 times the batch-size (or more?) (for very long vids (some of the ones with arcaro for instance - but check), just take the first 20 seconds or so...) (can we estimate the optimal order based on get_video_dimensions?) -> then shuffle, split evenly the frames and pass it to iPCA 
2. Normalize, format and shuffle them
    - If gaze dependent, load gaze, upsample (in time) video and extract gaze-dep spatial window
3. Compute iPCA
4. Save eigenvectors and eigenvalues


In [2]:
vids_dir = f"{paths['livingstone_lab']}/Stimuli/Movies/all_videos"
all_files = os.listdir(vids_dir) 
fn_list = [f for f in all_files if "YDX" in f]
print(fn_list)

['YDXJ0100A.MP4', 'YDXJ0100.MP4', 'YDXJ0090A.MP4', 'YDXJ0086A.MP4', 'YDXJ0100B.MP4', 'YDXJ0088.MP4', 'YDXJ0090B.MP4', 'YDXJ0086B.MP4', 'YDXJ0087B.MP4', 'YDXJ0099.MP4', 'YDXJ0091B.MP4', 'YDXJ0079_80.MP4', 'YDXJ0096A.MP4', 'YDXJ0097A.MP4', 'YDXJ0097B.MP4', 'YDXJ0094B.mp4', 'YDXJ0099D.MP4', 'YDXJ0094C.mp4', 'YDXJ0095E.MP4', 'YDXJ0094A.mp4', 'YDXJ0099F.MP4', 'YDXJ0099C.MP4', 'YDXJ0099B.MP4', 'YDXJ0094E.mp4', 'YDXJ0099A.MP4', 'YDXJ0088B.MP4', 'YDXJ0096.MP4', 'YDXJ0097.MP4', 'YDXJ0088A.MP4', 'YDXJ0095.MP4', 'YDXJ0094.MP4', 'YDXJ0081_82.MP4', 'YDXJ0090.MP4', 'YDXJ0091.MP4', 'YDXJ0085.MP4', 'YDXJ0085B.MP4', 'YDXJ0083_84.MP4', 'YDXJ0093.MP4', 'YDXJ0087.MP4', 'YDXJ0093A.MP4', 'YDXJ0086.MP4', 'YDXJ0092.MP4']


In [3]:
frames_per_vid = []
long_vids = []
for i in fn_list:
    video_path = f"{vids_dir}/{i}"
    cap = cv2.VideoCapture(video_path)
    _, _, n_frames = get_video_dimensions(cap)
    fps = cap.get(cv2.CAP_PROP_FPS)
    if n_frames/fps > 20:
        n_frames = fps*20
        long_vids.append(True)
    else:
        long_vids.append(False)
    frames_per_vid.append(n_frames)

In [4]:
tot_frame_num = round(np.sum(frames_per_vid))
batch_size = 1000
counter = 0
n_batches = round(tot_frame_num/batch_size)
splits = np.array_split(np.arange(tot_frame_num), n_batches)
batch_sizes = []
for batch_idx in splits:
    batch_sizes.append(len(batch_idx)) # stores the current batch size
batch_sizes = np.array(batch_sizes)

In [5]:
# """
# Returns True if the list or np.array is empty, False otherwise.
# Works for both lists and NumPy arrays.
# """
# def is_empty(x):
#     if x is None:
#         return True
#     try:
#         # Works for np.array
#         return x.size == 0
#     except AttributeError:
#         # If no size attribute, fallback to len() (lists, tuples)
#         return len(x) == 0
# # EOF

# """
# Resize a video stored as a NumPy array of shape (n_frames, H, W, C).

# INPUT:
#     - video: np.ndarray -> (n_frames, H, W, C)
#     - new_height: int -> desired output height
#     - new_width: int -> desired output width
#     - interpolation: cv2 interpolation method (default: bilinear)

# OUTPUT:
#     - resized_video: np.ndarray -> (n_frames, new_height, new_width, C)    
# """
# def resize_video_array(video, new_height, new_width, interpolation=cv2.INTER_LINEAR, normalize=True):
#     resized_video = np.stack([cv2.resize(frame, (new_width, new_height), interpolation=interpolation) for frame in video])
#     if normalize:
#         mean = resized_video.mean(axis=(0,1,2))
#         std = resized_video.std(axis=(0,1,2)) + 1e-8
#         resized_video = (resized_video - mean) / std
#     # end if normalize:
#     return resized_video
# # EOF

# """
# concatenate_frames_batch
# Concatenate a batch of video frames from multiple videos, optionally processing only
# the first part of long videos and keeping leftover frames from previous batches.

# INPUT:
#     - paths: dict -> paths to the video files
#     - rank: int -> worker rank
#     - frames_batch: list or np.ndarray -> leftover frames from previous batch
#     - curr_video_idx: int -> current video index in fn_list
#     - idx: int -> current batch index
#     - batches_to_proc_togeth: int -> number of batches to process consecutively
#     - batch_sizes: list/array -> sizes of each batch in frames
#     - new_h, new_w: int -> target frame height and width
#     - long_vids: list of bools -> flags indicating whether each video is long
#     - vid_duration_lim: int (default 20) -> max duration in seconds for long videos
#     - normalize: bool (default True) -> whether to normalize frames

# OUTPUT:
#     - frames_batch: np.ndarray -> concatenated frames for the batch
#     - progression: int -> updated video index after processing

# """
# def concatenate_frames_batch(paths, rank, frames_batch, curr_video_idx, curr_batch_idx, batches_to_proc_togeth, batch_sizes, new_h, new_w, long_vids, vid_duration_lim=20, normalize=True):
#     n_batches = len(batch_sizes)
#     idx_tot = [curr_batch_idx + i for i in range(batches_to_proc_togeth) if curr_batch_idx + i < n_batches] # takes the next $batches_to_proc frames filtering for out of range indices 
#     curr_tot_batch_size = np.sum(batch_sizes[idx_tot])
#     print(curr_tot_batch_size)
#     cumulative_frames_sum = 0
#     if is_empty(frames_batch):
#         frames_batch = [] # otherwise we have arrays of inconsistent size to concatenate
#     else:
#         cumulative_frames_sum += frames_batch.shape[0]
#         frames_batch = [frames_batch] # makes it a list with all the frames remained from the previous batch (ideally we read 3 batches and shuffle)
#     # end if frames_batch:
#     while cumulative_frames_sum < curr_tot_batch_size:
#         fn = fn_list[curr_video_idx]
#         if long_vids[curr_video_idx]: # if the video is marked as long
#             video = read_video(paths, rank, fn, vid_duration=vid_duration_lim) # if the video is too long, we just process the beginning (vid_duration_lim is in sec)
#         else:
#             video = read_video(paths, rank, fn, vid_duration=0)
#         # end if long_vids[progression]: 
#         video = resize_video_array(video, new_h, new_w, normalize=False)
#         curr_video_idx += 1
#         curr_frames_n = video.shape[0] 
#         cumulative_frames_sum += curr_frames_n
#         frames_batch.append(video)
#     # end while cumulative_frames_sum < curr_tot_batch_size:
#     frames_batch = np.concatenate(frames_batch, axis=0)
#     return frames_batch, curr_video_idx
# # EOF


# """
# shuffle_frames
# Randomly shuffle the frames of a video array along the 0th dimension.

# INPUT:
#     - video: np.ndarray, shape (n_frames, H, W, C) -> video to shuffle

# OUTPUT:
#     - shuffled_video: np.ndarray, same shape as input -> frames randomly permuted
# """
# def shuffle_frames(video):
#     n_frames = video.shape[0] 
#     indices = np.arange(n_frames)
#     np.random.shuffle(indices)
#     shuffled_video = video[indices, :, :, :]
#     return shuffled_video
# # EOF

In [6]:
rank = 0
progression = 0 
vid_duration_lim = 20 # sec
frames_batch = [] # initialized as an empty array
frames_passed = 0
batches_to_proc = 1
new_h, new_w = 224, 224
for idx, curr_batch_size in enumerate(batch_sizes):
    frames_batch, progression = concatenate_frames_batch(paths, rank, fn_list, frames_batch, progression, idx, batches_to_proc, batch_sizes, new_h, new_w, long_vids, vid_duration_lim)
    frames_batch = shuffle_frames(frames_batch)
    input = frames_batch[:curr_batch_size, :, :, :]
    frames_batch = frames_batch[curr_batch_size:, :, :, :]
    frames_passed += input.shape[0]
    print(frames_batch.shape, input.shape)
    #ADD ANN pass
    #ADD PCA step

print(tot_frame_num, frames_passed)
tot_frame_num == frames_passed

983
12:34:48 - rank 0 YDXJ0100A.MP4 read successfully
12:34:51 - rank 0 YDXJ0100.MP4 read successfully
(544, 224, 224, 3) (983, 224, 224, 3)
983
12:34:52 - rank 0 YDXJ0090A.MP4 read successfully
12:34:53 - rank 0 YDXJ0086A.MP4 read successfully
(260, 224, 224, 3) (983, 224, 224, 3)
983
12:34:54 - rank 0 YDXJ0100B.MP4 read successfully
12:34:56 - rank 0 YDXJ0088.MP4 read successfully
(783, 224, 224, 3) (983, 224, 224, 3)
983
12:34:57 - rank 0 YDXJ0090B.MP4 read successfully
(124, 224, 224, 3) (983, 224, 224, 3)
983
12:34:58 - rank 0 YDXJ0086B.MP4 read successfully
12:34:59 - rank 0 YDXJ0087B.MP4 read successfully
12:35:02 - rank 0 YDXJ0099.MP4 read successfully
(999, 224, 224, 3) (983, 224, 224, 3)
983
(16, 224, 224, 3) (983, 224, 224, 3)
983
12:35:03 - rank 0 YDXJ0091B.MP4 read successfully
12:35:04 - rank 0 YDXJ0079_80.MP4 read successfully
12:35:05 - rank 0 YDXJ0096A.MP4 read successfully
(11, 224, 224, 3) (983, 224, 224, 3)
983
12:35:06 - rank 0 YDXJ0097A.MP4 read successfully
12:35

True

In [None]:
resized = a 


In [None]:
"""
get_layer_out_shape
Computes the output shape (excluding batch size) of a specific layer 
from a given PyTorch feature extractor when applied to a dummy input 
image of size (1, 3, 224, 224).
INPUT:
- feature_extractor: torch.nn.Module -> A PyTorch model (typically a feature extractor created via torchvision.models.feature_extraction.create_feature_extractor)
                                        which outputs a dictionary of intermediate activations.
            
- layer_name: str -> The name of the layer for which the output shape is desired. This must be one of the keys returned by the feature_extractor.

OUTPUT:
- tmp_shape: Tuple(Int) -> A tuple representing the shape of the output tensor from the specified layer, excluding the batch dimension. For example,
                          (512, 7, 7) for a convolutional layer or (768,) for a transformer block.
            
Example Usage:
    >>> from torchvision.models import resnet18
    >>> from torchvision.models.feature_extraction import create_feature_extractor
    >>> model = resnet18(pretrained=True).eval()
    >>> feat_ext = create_feature_extractor(model, return_nodes=["layer1.0.relu_1"])
    >>> shape = get_layer_out_shape(feat_ext, "layer1.0.relu_1")
    >>> print(shape)
    (64, 56, 56)
"""
def get_layer_output_shape(feature_extractor, layer_name):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # thi if leave it here or not...
    with torch.no_grad():
        in_proxy = torch.randn(1, 3, 224, 224).to(device)
        tmp_shape = feature_extractor(in_proxy)[layer_name].shape[1:]
    return tmp_shape


def ipca_core(paths, rank, layer_name, model_name, n_components, model, loader, device):
    save_name = (f"imagenet_val_{model_name}_{layer_name}_pca_model_{n_components}_PCs.pkl")
    path = os.path.join(paths["results_path"], save_name)
    if os.path.exists(path):
        print_wise(f"{path} already exists")
    else:
        print_wise(f"Fitting PCA for layer: {layer_name}", rank=rank)
        feature_extractor = create_feature_extractor(
            model, return_nodes=[layer_name]
        ).to(device)
        tmp_shape = get_layer_output_shape(feature_extractor, layer_name)
        n_features = np.prod(tmp_shape)  # [C, H, W] -> C*H*W
        n_components_layer = min(n_features, n_components)  # Limit to number of features
        pca = IncrementalPCA(n_components=n_components_layer)
        counter = 0
        for inputs, _ in loader:
            counter += 1
            print_wise(f"starting batch {counter}", rank=rank)
            with torch.no_grad():
                inputs = inputs.to(device)
                feats = feature_extractor(inputs)[layer_name]
                feats = feats.view(feats.size(0), -1).cpu().numpy()
                pca.partial_fit(feats)

        joblib.dump(pca, path) # better this or pkl?
        print_wise(f"Saved PCA for {layer_name} at {path}", rank=rank)

