In [11]:
import torch
from VideoMAEv2.models.modeling_pretrain import pretrain_videomae_base_patch16_224 as VideoMAEv2

# Initialize model
model = VideoMAEv2()
model.head = torch.nn.Identity()  # Remove classification head for feature extraction

# Load state_dict directly
state_dict = torch.load("pytorch_model_l.bin", map_location="cpu")  # No ["model"] here!

# Load weights into model
model.load_state_dict(state_dict, strict=False)

# Save as .pth
torch.save(model.state_dict(), "videomaev2.pth")
print("✅ Successfully converted Hugging Face .bin to videomaev2.pth")


✅ Successfully converted Hugging Face .bin to videomaev2.pth


In [2]:
import os
import torch
import numpy as np
import cv2
from tqdm import tqdm
from einops import rearrange
from VideoMAEv2.models.modeling_pretrain import pretrain_videomae_small_patch16_224 as VideoMAEv2
import torchvision.transforms as transforms

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_frames = 16
stride = 1
data_path = "gg/gtea/Videos"
save_path = "features/gteasmall"
ckpt_path = "vit_s_k710_dl_from_giant.pth"

# Video preprocessing transforms
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def read_video_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while True:
        success, frame = cap.read()
        if not success:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame)
    cap.release()
    return frames

def preprocess_frames(frames):
    """Preprocess frames for VideoMAE input"""
    processed_frames = []
    for frame in frames:
        # Apply transforms
        frame_tensor = transform(frame)
        processed_frames.append(frame_tensor)
    
    # Stack frames: [T, C, H, W]
    video_tensor = torch.stack(processed_frames)
    # Rearrange to [C, T, H, W] for VideoMAE
    video_tensor = rearrange(video_tensor, 't c h w -> c t h w')
    # Add batch dimension: [1, C, T, H, W]
    video_tensor = video_tensor.unsqueeze(0)
    
    return video_tensor

def compute_num_patches(video_tensor):
    # Video tensor shape: [B, C, T, H, W]
    _, _, T, H, W = video_tensor.shape
    patch_size = 16
    tubelet_size = 2
    num_patches_per_frame = (H // patch_size) * (W // patch_size)
    num_temporal_patches = T // tubelet_size
    return num_temporal_patches * num_patches_per_frame

def extract_features_from_video(video_path, model):
    frames = read_video_frames(video_path)
    if len(frames) < num_frames:
        print(f"⚠️ Skipping {video_path}: only {len(frames)} frames")
        return None

    features = []
    for start in range(0, len(frames) - num_frames + 1, stride):
        clip = frames[start:start + num_frames]
        
        # Preprocess the clip
        video_tensor = preprocess_frames(clip).to(device)  # [1, C, T, H, W]

        with torch.no_grad():
            batch_size = video_tensor.shape[0]
            num_patches = compute_num_patches(video_tensor)
            # Create all-zero mask for no masking during inference
            mask = torch.zeros(batch_size, num_patches, dtype=torch.bool, device=device)

            # Extract features using VideoMAE encoder
            outputs = model(video_tensor, mask)

            # If outputs is a tuple, take the first element (features)
            if isinstance(outputs, tuple):
                feat = outputs[0]
            else:
                feat = outputs
            
            # Global average pooling over spatial and temporal dims
            if feat.dim() == 5:  # [B, C, T, H, W]
                feat = feat.mean(dim=[2, 3, 4])  # [B, C]
            elif feat.dim() == 4:  # [B, T, H, W] or [B, C, H, W]
                feat = feat.mean(dim=[2, 3])  # [B, T] or [B, C]
            elif feat.dim() == 3:  # [B, T, D]
                feat = feat.mean(dim=1)  # [B, D]

            # Normalize features
            feat = torch.nn.functional.normalize(feat, dim=-1)
            features.append(feat.squeeze(0).cpu().numpy())

    return np.stack(features) if features else None

def main():
    os.makedirs(save_path, exist_ok=True)

    print(f"🔧 Loading VideoMAE model from {ckpt_path}")
    try:
        # Initialize model
        model = VideoMAEv2()
        model.head = torch.nn.Identity()  # Remove classification head for feature extraction
        
        # Load checkpoint
        if os.path.exists(ckpt_path):
            state_dict = torch.load(ckpt_path, map_location="cpu")
            model.load_state_dict(state_dict, strict=False)
            print("✅ Model loaded from checkpoint")
        else:
            print(f"❌ Checkpoint not found: {ckpt_path}")
            return
        
        model = model.to(device)
        model.eval()
        print("✅ Model loaded successfully")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return

    if not os.path.exists(data_path):
        print(f"❌ Data path does not exist: {data_path}")
        return

    video_list = sorted([f for f in os.listdir(data_path) if f.endswith(('.mp4', '.avi', '.mov'))])
    
    if not video_list:
        print(f"No video files found in {data_path}")
        return

    print(f"Found {len(video_list)} videos to process")

    for video_name in tqdm(video_list, desc="Extracting features"):
        video_path = os.path.join(data_path, video_name)
        save_file = os.path.join(save_path, f"{os.path.splitext(video_name)[0]}.npy")

        if os.path.exists(save_file):
            print(f" Skipping {video_name}: features already exist")
            continue

        try:
            features = extract_features_from_video(video_path, model)
            if features is not None:
                np.save(save_file, features.astype(np.float32))
                print(f" Saved {features.shape[0]} features to {save_file}")
            else:
                print(f"Failed to extract features from {video_name}")
        except Exception as e:
            print(f" Error processing {video_name}: {e}")

if __name__ == '__main__':
    main()


🔧 Loading VideoMAE model from vit_s_k710_dl_from_giant.pth
✅ Model loaded from checkpoint
✅ Model loaded successfully
Found 28 videos to process


Extracting features:   4%|▎         | 1/28 [01:03<28:44, 63.86s/it]

 Saved 928 features to features/gteasmall/S1_Cheese_C1.npy


Extracting features:   7%|▋         | 2/28 [02:28<32:57, 76.05s/it]

 Saved 1220 features to features/gteasmall/S1_CofHoney_C1.npy


Extracting features:  11%|█         | 3/28 [03:54<33:33, 80.54s/it]

 Saved 1163 features to features/gteasmall/S1_Coffee_C1.npy


Extracting features:  14%|█▍        | 4/28 [04:46<27:39, 69.16s/it]

 Saved 703 features to features/gteasmall/S1_Hotdog_C1.npy


Extracting features:  18%|█▊        | 5/28 [06:27<31:00, 80.91s/it]

 Saved 1369 features to features/gteasmall/S1_Pealate_C1.npy


Extracting features:  21%|██▏       | 6/28 [08:29<34:42, 94.66s/it]

 Saved 1628 features to features/gteasmall/S1_Peanut_C1.npy


Extracting features:  25%|██▌       | 7/28 [10:56<39:08, 111.84s/it]

 Saved 1994 features to features/gteasmall/S1_Tea_C1.npy


Extracting features:  29%|██▊       | 8/28 [11:42<30:16, 90.84s/it] 

 Saved 619 features to features/gteasmall/S2_Cheese_C1.npy


Extracting features:  32%|███▏      | 9/28 [12:42<25:44, 81.29s/it]

 Saved 808 features to features/gteasmall/S2_CofHoney_C1.npy


Extracting features:  36%|███▌      | 10/28 [14:56<29:13, 97.44s/it]

 Saved 1799 features to features/gteasmall/S2_Coffee_C1.npy


Extracting features:  39%|███▉      | 11/28 [15:55<24:18, 85.81s/it]

 Saved 796 features to features/gteasmall/S2_Hotdog_C1.npy


Extracting features:  43%|████▎     | 12/28 [17:21<22:55, 85.97s/it]

 Saved 1166 features to features/gteasmall/S2_Pealate_C1.npy


Extracting features:  46%|████▋     | 13/28 [19:09<23:06, 92.42s/it]

 Saved 1450 features to features/gteasmall/S2_Peanut_C1.npy


Extracting features:  50%|█████     | 14/28 [20:52<22:20, 95.75s/it]

 Saved 1397 features to features/gteasmall/S2_Tea_C1.npy


Extracting features:  54%|█████▎    | 15/28 [21:59<18:50, 86.96s/it]

 Saved 898 features to features/gteasmall/S3_Cheese_C1.npy


Extracting features:  57%|█████▋    | 16/28 [23:04<16:04, 80.41s/it]

 Saved 877 features to features/gteasmall/S3_CofHoney_C1.npy


Extracting features:  61%|██████    | 17/28 [24:31<15:08, 82.55s/it]

 Saved 1175 features to features/gteasmall/S3_Coffee_C1.npy


Extracting features:  64%|██████▍   | 18/28 [25:35<12:48, 76.80s/it]

 Saved 847 features to features/gteasmall/S3_Hotdog_C1.npy


Extracting features:  68%|██████▊   | 19/28 [27:03<12:01, 80.21s/it]

 Saved 1154 features to features/gteasmall/S3_Pealate_C1.npy


Extracting features:  71%|███████▏  | 20/28 [28:33<11:05, 83.17s/it]

 Saved 949 features to features/gteasmall/S3_Peanut_C1.npy


Extracting features:  75%|███████▌  | 21/28 [30:33<10:58, 94.10s/it]

 Saved 1346 features to features/gteasmall/S3_Tea_C1.npy


Extracting features:  79%|███████▊  | 22/28 [31:31<08:20, 83.35s/it]

 Saved 790 features to features/gteasmall/S4_Cheese_C1.npy


Extracting features:  82%|████████▏ | 23/28 [32:34<06:26, 77.34s/it]

 Saved 856 features to features/gteasmall/S4_CofHoney_C1.npy


Extracting features:  86%|████████▌ | 24/28 [33:46<05:02, 75.57s/it]

 Saved 949 features to features/gteasmall/S4_Coffee_C1.npy


Extracting features:  89%|████████▉ | 25/28 [34:34<03:22, 67.36s/it]

 Saved 640 features to features/gteasmall/S4_Hotdog_C1.npy


Extracting features:  93%|█████████▎| 26/28 [36:07<02:30, 75.08s/it]

 Saved 1229 features to features/gteasmall/S4_Pealate_C1.npy


Extracting features:  96%|█████████▋| 27/28 [37:17<01:13, 73.53s/it]

 Saved 919 features to features/gteasmall/S4_Peanut_C1.npy


Extracting features: 100%|██████████| 28/28 [38:43<00:00, 82.97s/it]

 Saved 1136 features to features/gteasmall/S4_Tea_C1.npy





In [1]:
import os
import torch
import numpy as np
import cv2
from tqdm import tqdm
from einops import rearrange
from VideoMAEv2.models.modeling_pretrain import pretrain_videomae_base_patch16_224 as VideoMAEv2
import torchvision.transforms as transforms

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_frames = 16
stride = 1
data_path = "gg/gtea/Videos"
save_path = "features/gteam"
ckpt_path = "videomaev2.pth"

transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def read_video_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while True:
        success, frame = cap.read()
        if not success:
            break
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    return frames

def preprocess_frames(frames):
    processed = [transform(f) for f in frames]
    video_tensor = torch.stack(processed)  # [T, C, H, W]
    video_tensor = rearrange(video_tensor, 't c h w -> c t h w').unsqueeze(0)  # [1, C, T, H, W]
    return video_tensor

def compute_num_patches(video_tensor):
    _, _, T, H, W = video_tensor.shape
    patch_size = 16
    tubelet_size = 2
    num_patches_per_frame = (H // patch_size) * (W // patch_size)
    return (T // tubelet_size) * num_patches_per_frame

def extract_features_from_video(video_path, model):
    frames = read_video_frames(video_path)
    if len(frames) < num_frames:
        print(f"Skipping {video_path}: only {len(frames)} frames")
        return None

    features = []
    for start in range(0, len(frames) - num_frames + 1, stride):
        clip = frames[start:start + num_frames]
        video_tensor = preprocess_frames(clip).to(device)

        with torch.no_grad():
            batch_size = video_tensor.shape[0]
            num_patches = compute_num_patches(video_tensor)
            mask = torch.zeros(batch_size, num_patches, dtype=torch.bool, device=device)
            outputs = model(video_tensor, mask)

            feat = outputs[0] if isinstance(outputs, tuple) else outputs

            if feat.dim() == 5:
                feat = feat.mean(dim=[2, 3, 4])
            elif feat.dim() == 4:
                feat = feat.mean(dim=[2, 3])
            elif feat.dim() == 3:
                feat = feat.mean(dim=1)

            feat = torch.nn.functional.normalize(feat, dim=-1)
            features.append(feat.squeeze(0).cpu().numpy())

    return np.stack(features) if features else None

def main():
    os.makedirs(save_path, exist_ok=True)

    print(f"Loading model from {ckpt_path}")
    model = VideoMAEv2()
    model.head = torch.nn.Identity()
    if os.path.exists(ckpt_path):
        state_dict = torch.load(ckpt_path, map_location="cpu")
        model.load_state_dict(state_dict, strict=False)
        print("Model loaded")
    else:
        print(f"Checkpoint not found: {ckpt_path}")
        return
    model = model.to(device).eval()

    if not os.path.exists(data_path):
        print(f"Data path does not exist: {data_path}")
        return

    videos = sorted([f for f in os.listdir(data_path) if f.endswith(('.mp4', '.avi', '.mov'))])
    if not videos:
        print(f"No videos found in {data_path}")
        return

    print(f"Found {len(videos)} videos")

    for video_name in tqdm(videos, desc="Extracting features"):
        save_file = os.path.join(save_path, f"{os.path.splitext(video_name)[0]}.npy")
        if os.path.exists(save_file):
            continue
        try:
            features = extract_features_from_video(os.path.join(data_path, video_name), model)
            if features is not None:
                np.save(save_file, features.astype(np.float32))
        except Exception as e:
            print(f"Error processing {video_name}: {e}")

if __name__ == '__main__':
    main()


  from .autonotebook import tqdm as notebook_tqdm


Loading model from videomaev2.pth
Model loaded
Found 28 videos


Extracting features: 100%|██████████| 28/28 [00:00<00:00, 68759.08it/s]
