In [1]:
import torch
import os
from VideoMAEv2.models.modeling_pretrain import pretrain_videomae_base_patch16_224 as VideoMAEv2


# Step 2: Build the model with the factory imported above
# (pretrain_videomae_base_patch16_224); the weights are loaded in the next cell.
model = VideoMAEv2()
# Attach an identity module under the attribute name `head`.
# NOTE(review): the extraction cells below call `model.encoder(...)` directly, so this
# attribute may have no effect on the saved features — confirm it is still needed.
model.head = torch.nn.Identity()  # Remove classification head for feature extraction



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the converted pretraining checkpoint onto the CPU.
checkpoint = torch.load("./converted/videomaev2-base-p16-16f-pretrain.pth", map_location='cpu')

# The weights are either nested under a 'model' key or are the loaded object itself.
state_dict = checkpoint['model'] if 'model' in checkpoint else checkpoint

# Non-strict load. Keeping the call as the cell's last expression displays the
# key-matching report that load_state_dict returns.
model.load_state_dict(state_dict, strict=False)


<All keys matched successfully>

In [4]:
import cv2
import torch
import numpy as np
from torchvision import transforms
from einops import rearrange

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The extraction helpers move every clip tensor to `device`, so the model must live on the
# same device; otherwise a CUDA machine fails with a CPU/GPU tensor mismatch.
model.to(device)
model.eval()

# Per-channel statistics used by Normalize below (defined once, reused there).
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]


# Preprocessing transforms: resize to 224x224, convert to a float tensor, normalize.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])


In [6]:
def read_video_frames(video_path, num_frames=16, stride=4):
    """Decode a video with OpenCV and cut it into fixed-length sliding-window clips.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Number of consecutive frames in each clip.
        stride: Offset, in frames, between the starts of consecutive clips.

    Returns:
        A list of clips; each clip is a list of ``num_frames`` frames converted from
        BGR to RGB. The list is empty when fewer than ``num_frames`` frames were decoded.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes to BGR; the downstream PIL/torchvision pipeline expects RGB.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture handle even when decoding or colour conversion raises.
        cap.release()

    windows = (frames[i:i + num_frames]
               for i in range(0, len(frames) - num_frames + 1, stride))
    # The length check mirrors the original guard; for valid arguments every window passes.
    return [window for window in windows if len(window) == num_frames]

In [None]:
from PIL import Image

def extract_features_from_clip(clip, model):
    """Encode one clip with ``model.encoder`` and average the result over dim 1.

    Args:
        clip: Sequence of frames accepted by ``Image.fromarray``, one per time step.
        model: Object exposing ``encoder`` and ``encoder.patch_embed``.

    Returns:
        NumPy array with the encoder output averaged over dim 1, batch axis removed.
    """
    # Preprocess each frame, then stack along time: [T, C, H, W].
    per_frame = [transform(Image.fromarray(frame)) for frame in clip]
    video = torch.stack(per_frame)
    # Add a batch axis and put channels ahead of time: [1, C, T, H, W].
    video = rearrange(video, 't c h w -> 1 c t h w').to(device)

    with torch.no_grad():
        # Run the patch embedding only to learn how many tokens this clip produces.
        num_tokens = model.encoder.patch_embed(video).shape[1]
        # All-False boolean mask of matching length (presumably "nothing masked" — confirm).
        token_mask = torch.zeros((1, num_tokens), dtype=torch.bool).to(device)

        tokens = model.encoder(video, token_mask)
        pooled = tokens.mean(dim=1)  # global average pool

    return pooled.squeeze(0).cpu().numpy()


# Process GTEA videos
VIDEO_DIR = './gg/gtea/Videos'
OUT_DIR = './features/gtea'
os.makedirs(OUT_DIR, exist_ok=True)

for vid_file in os.listdir(VIDEO_DIR):
    if not vid_file.endswith('.mp4'):
        continue
    print(f"Processing: {vid_file}")
    video_path = os.path.join(VIDEO_DIR, vid_file)
    clips = read_video_frames(video_path, num_frames=16, stride=1)

    # A video with fewer than 16 decoded frames yields no clips, and np.stack([]) raises;
    # skip it explicitly instead of aborting the whole run.
    if not clips:
        print(f"Skipping {vid_file}: no 16-frame clip could be read")
        continue

    # One feature vector per clip, stacked to [num_clips, feature_dim].
    all_feats = np.stack([extract_features_from_clip(clip, model) for clip in clips])

    # Swap only the extension; str.replace would also rewrite '.mp4' inside the file stem.
    out_name = os.path.splitext(vid_file)[0] + '.npy'
    np.save(os.path.join(OUT_DIR, out_name), all_feats)

Processing: S3_CofHoney_C1.mp4
Processing: S2_Hotdog_C1.mp4
Processing: S4_Tea_C1.mp4
Processing: S4_CofHoney_C1.mp4
Processing: S3_Hotdog_C1.mp4
Processing: S2_Peanut_C1.mp4
Processing: S3_Coffee_C1.mp4
Processing: S1_Pealate_C1.mp4
Processing: S1_CofHoney_C1.mp4
Processing: S2_CofHoney_C1.mp4
Processing: S2_Tea_C1.mp4
Processing: S2_Pealate_C1.mp4
Processing: S3_Cheese_C1.mp4
Processing: S1_Coffee_C1.mp4
Processing: S2_Coffee_C1.mp4
Processing: S3_Tea_C1.mp4
Processing: S2_Cheese_C1.mp4
Processing: S4_Coffee_C1.mp4
Processing: S3_Peanut_C1.mp4
Processing: S4_Hotdog_C1.mp4
Processing: S1_Cheese_C1.mp4
Processing: S3_Pealate_C1.mp4
Processing: S1_Tea_C1.mp4
Processing: S4_Cheese_C1.mp4
Processing: S1_Hotdog_C1.mp4
Processing: S1_Peanut_C1.mp4
Processing: S4_Peanut_C1.mp4
Processing: S4_Pealate_C1.mp4


In [11]:
# NOTE(review): `all_feats` is not created in this cell. It presumably still holds the value
# left behind by the per-video loop in an earlier cell (i.e. the last processed video) —
# confirm; as written the cell depends on execution order and fails on a fresh kernel.
all_feats = np.stack(all_feats)  # Shape: [num_clips, feature_dim]


In [7]:
import os
import torch
import numpy as np
from PIL import Image
from einops import rearrange
from torchvision import transforms
from moviepy.editor import VideoFileClip
from scipy.ndimage import zoom

# Preprocessing: resize, convert to a float tensor, then normalize per channel.
# The Normalize step uses the same statistics as the other extraction cells in this
# notebook; without it this cell fed differently scaled inputs to the same checkpoint.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clips are moved to `device` inside extract_features_from_clip, so the model
# (created in an earlier cell) has to be on the same device and in eval mode.
model.to(device)
model.eval()

def read_video_frames(video_path, num_frames=16, stride=1):
    """Read a video with moviepy and cut it into overlapping fixed-length clips.

    Args:
        video_path: Path handed to ``VideoFileClip``.
        num_frames: Number of consecutive frames per clip.
        stride: Offset, in frames, between the starts of consecutive clips.

    Returns:
        ``(clips, total_frames)``: ``clips`` is a list of lists of frames and
        ``total_frames`` is the number of frames that were decoded. ``clips`` is empty
        when fewer than ``num_frames`` frames were decoded.
    """
    clip = VideoFileClip(video_path)
    try:
        # moviepy yields RGB frames, which is what Image.fromarray expects downstream.
        # The previous channel reversal (frame[:, :, ::-1]) is an OpenCV/BGR idiom and
        # swapped the red and blue channels here.
        frames = list(clip.iter_frames(fps=clip.fps))
    finally:
        # close() releases the video reader and, when present, the audio reader,
        # including when decoding raises.
        clip.close()

    total_frames = len(frames)
    clips = [frames[i:i + num_frames]
             for i in range(0, total_frames - num_frames + 1, stride)]

    return clips, total_frames


def extract_features_from_clip(clip, model):
    """Turn one clip into a single feature vector using ``model.encoder``.

    Args:
        clip: Sequence of frames accepted by ``Image.fromarray``, one per time step.
        model: Object exposing ``encoder`` and ``encoder.patch_embed``.

    Returns:
        NumPy array with the encoder output averaged over dim 1, batch axis removed.
    """
    frame_tensors = [transform(Image.fromarray(frame)) for frame in clip]
    stacked = torch.stack(frame_tensors)                                 # [T, C, H, W]
    batch = rearrange(stacked, 't c h w -> 1 c t h w').to(device)        # [1, C, T, H, W]

    with torch.no_grad():
        # The patch embedding is evaluated once just to size the mask.
        token_count = model.encoder.patch_embed(batch).shape[1]
        # All-False boolean mask (presumably "nothing masked" — confirm against the model).
        empty_mask = torch.zeros((1, token_count), dtype=torch.bool).to(device)

        encoded = model.encoder(batch, empty_mask)
        averaged = encoded.mean(dim=1)  # global average pool

    return averaged.squeeze(0).cpu().numpy()

# Paths
# NOTE(review): absolute, machine-specific locations — consider making them configurable.
VIDEO_DIR = '/home/cair/Dharmendra/gg/gtea/Videos'
OUT_DIR = '/home/cair/Dharmendra/features/gtea2'
os.makedirs(OUT_DIR, exist_ok=True)

# Loop over videos
for vid_file in os.listdir(VIDEO_DIR):
    if not vid_file.endswith('.mp4'):
        continue

    print(f"Processing: {vid_file}")
    video_path = os.path.join(VIDEO_DIR, vid_file)

    clips, total_frames = read_video_frames(video_path, num_frames=16, stride=1)

    # With fewer than 16 decoded frames there are no clips: np.stack([]) raises and the
    # zoom factor below would divide by zero, so skip such videos explicitly.
    if not clips:
        print(f"Skipping {vid_file}: only {total_frames} frames, need at least 16")
        continue

    # One feature vector per clip, stacked to [T', D].
    all_feats = np.stack([extract_features_from_clip(clip, model) for clip in clips])

    # Linearly interpolate (order=1) along the first axis so there is one row per
    # decoded video frame; the feature axis is left unscaled.
    interpolated_feats = zoom(all_feats, (total_frames / all_feats.shape[0], 1), order=1)  # [T, D]

    # Swap only the extension; str.replace would also rewrite '.mp4' inside the file stem.
    out_path = os.path.join(OUT_DIR, os.path.splitext(vid_file)[0] + '.npy')
    np.save(out_path, interpolated_feats)


Processing: S3_CofHoney_C1.mp4
Processing: S2_Hotdog_C1.mp4
Processing: S4_Tea_C1.mp4
Processing: S4_CofHoney_C1.mp4
Processing: S3_Hotdog_C1.mp4
Processing: S2_Peanut_C1.mp4
Processing: S3_Coffee_C1.mp4
Processing: S1_Pealate_C1.mp4
Processing: S1_CofHoney_C1.mp4
Processing: S2_CofHoney_C1.mp4
Processing: S2_Tea_C1.mp4
Processing: S2_Pealate_C1.mp4
Processing: S3_Cheese_C1.mp4
Processing: S1_Coffee_C1.mp4
Processing: S2_Coffee_C1.mp4
Processing: S3_Tea_C1.mp4
Processing: S2_Cheese_C1.mp4
Processing: S4_Coffee_C1.mp4
Processing: S3_Peanut_C1.mp4
Processing: S4_Hotdog_C1.mp4
Processing: S1_Cheese_C1.mp4
Processing: S3_Pealate_C1.mp4
Processing: S1_Tea_C1.mp4
Processing: S4_Cheese_C1.mp4
Processing: S1_Hotdog_C1.mp4
Processing: S1_Peanut_C1.mp4
Processing: S4_Peanut_C1.mp4
Processing: S4_Pealate_C1.mp4


In [23]:
import os
import torch
import torch.nn as nn
import numpy as np
import decord
from decord import VideoReader, cpu
from tqdm import tqdm
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize

# Import VideoMAEv2 model
from VideoMAEv2.models.modeling_pretrain import pretrain_videomae_base_patch16_224 as VideoMAEv2

# -----------------------------
# Configurations
# -----------------------------
# Device string used for the models and input tensors: GPU when available, else CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Directory scanned by main() for input videos.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
VIDEO_DIR = '/home/cair/Dharmendra/gg/gtea/Videos'
# Directory that receives one .npy feature file per video.
OUT_DIR = '/home/cair/Dharmendra/features/videomaev2_temporal2'
# Checkpoint read by load_videomae_encoder(); relative to the current working directory.
CHECKPOINT_PATH = './converted/videomaev2-base-p16-16f-pretrain.pth'

# Create the output directory up front; an already existing directory is not an error.
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# Temporal Encoder (Conv1D)
# -----------------------------
class TemporalConvEncoder(nn.Module):
    """Conv1d -> ReLU -> Conv1d stack applied along the time axis of a feature sequence.

    Both convolutions pad by ``kernel_size // 2``, so an odd kernel size keeps the
    number of time steps unchanged.
    """

    def __init__(self, in_dim=1536, hidden_dim=512, out_dim=768, kernel_size=3):
        super().__init__()
        pad = kernel_size // 2
        layers = [
            nn.Conv1d(in_dim, hidden_dim, kernel_size, padding=pad),
            nn.ReLU(),
            nn.Conv1d(hidden_dim, out_dim, kernel_size, padding=pad),
        ]
        self.temporal_encoder = nn.Sequential(*layers)

    def forward(self, x):
        """Encode a ``[T, in_dim]`` sequence; the result has ``out_dim`` columns."""
        # Conv1d wants [batch, channels, length]: swap the axes and add a batch axis.
        channels_first = x.transpose(0, 1).unsqueeze(0)   # [1, D, T]
        encoded = self.temporal_encoder(channels_first)
        # Drop the batch axis and return to time-major layout.
        return encoded.squeeze(0).transpose(0, 1)         # [T, D]

# -----------------------------
# Load VideoMAEv2 Pretrained Model
# -----------------------------
def load_videomae_encoder():
    """Build the VideoMAEv2 model, load ``CHECKPOINT_PATH`` into it, and prepare it for inference.

    Returns:
        The model created by ``VideoMAEv2()``, moved to ``DEVICE`` and set to eval mode.
    """
    model = VideoMAEv2()
    # NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location='cpu')

    # The weights are either nested under a 'model' key or are the loaded object itself.
    state_dict = checkpoint['model'] if 'model' in checkpoint else checkpoint

    # strict=False tolerates key mismatches, so report them instead of silently running
    # with partially initialised weights.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(f"Warning: checkpoint/model key mismatch - "
              f"missing keys: {list(load_result.missing_keys)}, "
              f"unexpected keys: {list(load_result.unexpected_keys)}")

    model.to(DEVICE)
    model.eval()
    return model

# -----------------------------
# Feature Extraction Pipeline
# -----------------------------
def extract_features_from_video(model, video_path, clip_len=16, stride=8):
    """Slide a window over a video and collect one feature vector per window.

    Args:
        model: Callable model; it is tried without a mask first, then with masks.
        video_path: Path handed to decord's ``VideoReader``.
        clip_len: Number of consecutive frames per window.
        stride: Offset, in frames, between window starts.

    Returns:
        Tensor made by stacking the per-window features, or ``None`` when no window
        produced a feature. A window that raises is reported and skipped, so the number
        of rows can be smaller than the number of windows.
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)

    transform = Compose([
        Resize(256),
        CenterCrop(224),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406],
                  std=[0.229, 0.224, 0.225]),
    ])

    all_feats = []
    for start_idx in range(0, total_frames - clip_len + 1, stride):
        try:
            # Decode the window's frames in one batch.
            frame_indices = list(range(start_idx, start_idx + clip_len))
            raw_frames = vr.get_batch(frame_indices).asnumpy()

            # Preprocess each frame independently.
            frame_tensors = [transform(Image.fromarray(frame)) for frame in raw_frames]

            # [T, C, H, W] -> [1, T, C, H, W] -> [1, C, T, H, W]
            batch = torch.stack(frame_tensors).unsqueeze(0).to(DEVICE)
            batch = batch.permute(0, 2, 1, 3, 4)

            outputs = _forward_with_mask_fallback(model, batch)

            # A tuple output is assumed to carry the features in its first element.
            features = outputs[0] if isinstance(outputs, tuple) else outputs

            rank = features.dim()
            if rank == 3:
                # Take index 0 along dim 1 (the original comment calls this the CLS token).
                # NOTE(review): confirm the model actually emits a CLS token at index 0.
                feat = features[:, 0]
            elif rank == 4:
                feat = features.mean(dim=[2, 3])  # average over the last two axes
            else:
                feat = features.mean(dim=1)

            all_feats.append(feat.squeeze(0).cpu())

        except Exception as e:
            print(f"Error processing clip starting at frame {start_idx}: {e}")
            continue

    if not all_feats:
        return None

    return torch.stack(all_feats)  # [T, D]


def _forward_with_mask_fallback(model, clip):
    """Run ``model`` on ``clip``, retrying with an all-False mask when the plain call fails.

    Attempt order: ``model(clip)``; on ``TypeError`` a ``(1, 1568)`` mask; on a
    ``RuntimeError`` from that attempt a ``(1, 3136)`` mask. Anything else propagates.
    """
    with torch.no_grad():
        try:
            return model(clip)
        except TypeError:
            pass  # presumably the model requires a mask argument; try with one

        try:
            mask = torch.zeros((1, 1568), dtype=torch.bool).to(DEVICE)
            return model(clip, mask)
        except RuntimeError:
            mask = torch.zeros((1, 3136), dtype=torch.bool).to(DEVICE)
            return model(clip, mask)

# -----------------------------
# Main Processing Loop
# -----------------------------
def main():
    """Extract features for every .mp4/.avi in VIDEO_DIR and save one .npy per video in OUT_DIR.

    Failures are reported per video and do not stop the run.
    """
    # Initialize models
    model = load_videomae_encoder()

    # The temporal encoder is neither trained nor loaded from a checkpoint, so its weights
    # are whatever the initializer draws. Seed the global RNG first so every run projects
    # the features with the same weights.
    # NOTE(review): a seeded projection is still an untrained one — confirm this is intended.
    torch.manual_seed(0)
    temporal_encoder = TemporalConvEncoder().to(DEVICE)
    temporal_encoder.eval()

    # Get video files
    video_files = [f for f in os.listdir(VIDEO_DIR) if f.endswith(('.mp4', '.avi'))]

    # Process each video
    for vid_file in tqdm(video_files, desc="Extracting Features"):
        vid_path = os.path.join(VIDEO_DIR, vid_file)
        # Swap only the final extension; chained str.replace also rewrote '.mp4'/'.avi'
        # occurring inside the file stem.
        output_path = os.path.join(OUT_DIR, os.path.splitext(vid_file)[0] + '.npy')

        try:
            # Extract raw features
            raw_feats = extract_features_from_video(model, vid_path)

            if raw_feats is not None:
                # Apply temporal encoding
                with torch.no_grad():
                    feats = temporal_encoder(raw_feats.to(DEVICE)).cpu().numpy()

                # Save features
                np.save(output_path, feats)
                print(f"Successfully processed {vid_file}: {feats.shape}")
            else:
                print(f"Skipping {vid_file} due to processing errors")
        except Exception as e:
            print(f"Error processing {vid_file}: {e}")

# Entry point: run the extraction when this code is executed as the main module.
# NOTE(review): in a notebook kernel __name__ is normally '__main__', so running this
# cell starts the full extraction — confirm that is intended.
if __name__ == '__main__':
    main()

Extracting Features:   4%|▎         | 1/28 [00:29<13:17, 29.52s/it]

Successfully processed S3_CofHoney_C1.mp4: (110, 768)


Extracting Features:   7%|▋         | 2/28 [00:54<11:43, 27.07s/it]

Successfully processed S2_Hotdog_C1.mp4: (100, 768)


Extracting Features:  11%|█         | 3/28 [01:31<13:05, 31.43s/it]

Successfully processed S4_Tea_C1.mp4: (142, 768)


Extracting Features:  14%|█▍        | 4/28 [01:58<11:55, 29.82s/it]

Successfully processed S4_CofHoney_C1.mp4: (107, 768)


Extracting Features:  18%|█▊        | 5/28 [02:25<11:02, 28.82s/it]

Successfully processed S3_Hotdog_C1.mp4: (106, 768)


Extracting Features:  21%|██▏       | 6/28 [03:10<12:32, 34.22s/it]

Successfully processed S2_Peanut_C1.mp4: (182, 768)


Extracting Features:  25%|██▌       | 7/28 [03:47<12:13, 34.94s/it]

Successfully processed S3_Coffee_C1.mp4: (147, 768)


Extracting Features:  29%|██▊       | 8/28 [04:32<12:44, 38.21s/it]

Successfully processed S1_Pealate_C1.mp4: (172, 768)


Extracting Features:  32%|███▏      | 9/28 [05:12<12:20, 38.99s/it]

Successfully processed S1_CofHoney_C1.mp4: (153, 768)


Extracting Features:  36%|███▌      | 10/28 [05:37<10:24, 34.68s/it]

Successfully processed S2_CofHoney_C1.mp4: (101, 768)


Extracting Features:  39%|███▉      | 11/28 [06:22<10:41, 37.72s/it]

Successfully processed S2_Tea_C1.mp4: (175, 768)


Extracting Features:  43%|████▎     | 12/28 [07:01<10:09, 38.08s/it]

Successfully processed S2_Pealate_C1.mp4: (146, 768)


Extracting Features:  46%|████▋     | 13/28 [07:31<08:54, 35.61s/it]

Successfully processed S3_Cheese_C1.mp4: (113, 768)


Extracting Features:  50%|█████     | 14/28 [08:09<08:27, 36.22s/it]

Successfully processed S1_Coffee_C1.mp4: (146, 768)


Extracting Features:  54%|█████▎    | 15/28 [09:07<09:18, 42.97s/it]

Successfully processed S2_Coffee_C1.mp4: (225, 768)


Extracting Features:  57%|█████▋    | 16/28 [09:51<08:39, 43.33s/it]

Successfully processed S3_Tea_C1.mp4: (169, 768)


Extracting Features:  61%|██████    | 17/28 [10:12<06:40, 36.42s/it]

Successfully processed S2_Cheese_C1.mp4: (78, 768)


Extracting Features:  64%|██████▍   | 18/28 [10:42<05:45, 34.57s/it]

Successfully processed S4_Coffee_C1.mp4: (119, 768)


Extracting Features:  68%|██████▊   | 19/28 [11:12<04:58, 33.13s/it]

Successfully processed S3_Peanut_C1.mp4: (119, 768)


Extracting Features:  71%|███████▏  | 20/28 [11:33<03:56, 29.53s/it]

Successfully processed S4_Hotdog_C1.mp4: (80, 768)


Extracting Features:  75%|███████▌  | 21/28 [12:04<03:29, 29.91s/it]

Successfully processed S1_Cheese_C1.mp4: (116, 768)


Extracting Features:  79%|███████▊  | 22/28 [12:42<03:14, 32.33s/it]

Successfully processed S3_Pealate_C1.mp4: (145, 768)


Extracting Features:  82%|████████▏ | 23/28 [13:46<03:29, 41.87s/it]

Successfully processed S1_Tea_C1.mp4: (250, 768)


Extracting Features:  86%|████████▌ | 24/28 [14:11<02:27, 36.95s/it]

Successfully processed S4_Cheese_C1.mp4: (99, 768)


Extracting Features:  89%|████████▉ | 25/28 [14:34<01:38, 32.78s/it]

Successfully processed S1_Hotdog_C1.mp4: (88, 768)


Extracting Features:  93%|█████████▎| 26/28 [15:27<01:17, 38.88s/it]

Successfully processed S1_Peanut_C1.mp4: (204, 768)


Extracting Features:  96%|█████████▋| 27/28 [15:58<00:36, 36.26s/it]

Successfully processed S4_Peanut_C1.mp4: (115, 768)


Extracting Features: 100%|██████████| 28/28 [16:36<00:00, 35.60s/it]

Successfully processed S4_Pealate_C1.mp4: (154, 768)





In [25]:
import os
import torch
import torch.nn as nn
import numpy as np
from PIL import Image
from decord import VideoReader, cpu
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from tqdm import tqdm

# Import your pretrained model (make sure this import works)
from VideoMAEv2.models.modeling_pretrain import pretrain_videomae_base_patch16_224 as VideoMAEv2

# Device string used for the models and input tensors: GPU when available, else CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Directory scanned by main() for input videos.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
VIDEO_DIR = '/home/cair/Dharmendra/gg/gtea/Videos'  # Change as needed
# Directory that receives one .npy file per video; main() skips videos whose file exists.
OUT_DIR = '/home/cair/Dharmendra/features/videomaev2_temporal3'
# Checkpoint read by load_videomae_encoder(); relative to the current working directory.
CHECKPOINT_PATH = './converted/videomaev2-base-p16-16f-pretrain.pth'  # Change as needed
# Create the output directory up front; an already existing directory is not an error.
os.makedirs(OUT_DIR, exist_ok=True)

# Temporal conv encoder for smoothing features (optional)
class TemporalConvEncoder(nn.Module):
    """Conv1d -> ReLU -> Conv1d stack that mixes features along the time axis.

    Both convolutions pad by ``kernel_size // 2``, so an odd kernel size keeps the
    number of time steps unchanged. ``in_dim`` has no default and must be supplied.
    """

    def __init__(self, in_dim, hidden_dim=512, out_dim=768, kernel_size=3):
        super().__init__()
        same_pad = kernel_size // 2
        stages = [
            nn.Conv1d(in_dim, hidden_dim, kernel_size, padding=same_pad),
            nn.ReLU(),
            nn.Conv1d(hidden_dim, out_dim, kernel_size, padding=same_pad),
        ]
        self.temporal_encoder = nn.Sequential(*stages)

    def forward(self, x):
        """Encode a ``[T, in_dim]`` sequence; the result has ``out_dim`` columns."""
        # Conv1d wants [batch, channels, length]: swap the axes and add a batch axis.
        as_batch = x.transpose(0, 1).unsqueeze(0)     # [1, D, T]
        mixed = self.temporal_encoder(as_batch)
        # Remove the batch axis and go back to time-major layout.
        return mixed.squeeze(0).transpose(0, 1)       # [T, D]

def load_videomae_encoder():
    """Build the VideoMAEv2 model, load ``CHECKPOINT_PATH`` into it, and prepare it for inference.

    Returns:
        The model created by ``VideoMAEv2()``, moved to ``DEVICE`` and set to eval mode.
    """
    model = VideoMAEv2()
    # NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location='cpu')
    # The weights are either nested under a 'model' key or are the loaded mapping itself.
    state_dict = checkpoint.get('model', checkpoint)

    # strict=False tolerates key mismatches, so report them instead of silently running
    # with partially initialised weights.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(f"Warning: checkpoint/model key mismatch - "
              f"missing keys: {list(load_result.missing_keys)}, "
              f"unexpected keys: {list(load_result.unexpected_keys)}")

    model.to(DEVICE)
    model.eval()
    return model

def extract_features_from_video(model, temporal_encoder, video_path, clip_len=16, stride=1):
    """Extract one feature row per sliding window of a video, then apply the temporal encoder.

    Args:
        model: Model called as ``model(clip, mask)``; must expose
            ``encoder.patch_embed.num_patches``.
        temporal_encoder: Module applied once to the stacked per-window features.
        video_path: Path handed to decord's ``VideoReader``.
        clip_len: Number of consecutive frames per window.
        stride: Offset, in frames, between window starts.

    Returns:
        NumPy array with one row per window after the temporal encoder, or ``None`` when
        the video has fewer than ``clip_len`` frames.
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)

    clip_starts = range(0, total_frames - clip_len + 1, stride)
    if len(clip_starts) == 0:
        # Not a single full window fits; nothing to extract.
        return None

    transform = Compose([
        Resize(256),
        CenterCrop(224),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406],
                  std=[0.229, 0.224, 0.225])
    ])

    # The mask depends only on the model (batch size is always 1), so build it once on the
    # target device instead of allocating and transferring it for every window.
    mask = torch.zeros((1, model.encoder.patch_embed.num_patches), dtype=torch.bool, device=DEVICE)

    all_feats = []
    with torch.no_grad():
        for start_idx in clip_starts:
            frames = vr.get_batch(list(range(start_idx, start_idx + clip_len))).asnumpy()
            processed_frames = [transform(Image.fromarray(frame)) for frame in frames]

            clip_tensor = torch.stack(processed_frames)        # [T, C, H, W]
            clip_tensor = clip_tensor.unsqueeze(0).to(DEVICE)  # [1, T, C, H, W]
            clip_tensor = clip_tensor.permute(0, 2, 1, 3, 4)   # [1, C, T, H, W]

            latent = model(clip_tensor, mask)

            if latent.dim() == 3:
                # Take index 0 along dim 1 (the original comment calls this the CLS token).
                # NOTE(review): confirm the model actually emits a CLS token at index 0.
                feat = latent[:, 0, :]
            else:
                feat = latent.mean(dim=1)

            all_feats.append(feat.squeeze(0).cpu())

        feats = torch.stack(all_feats)  # [T, D]

        # Pass the whole sequence through the temporal encoder in one call.
        feats = temporal_encoder(feats.to(DEVICE)).cpu()

    return feats.numpy()

def main():
    """Extract features for every .mp4/.avi in VIDEO_DIR, skipping videos already saved in OUT_DIR.

    Failures are reported per video and do not stop the run.
    """
    model = load_videomae_encoder()

    # The temporal encoder is neither trained nor loaded from a checkpoint, so its weights
    # are whatever the initializer draws. Because already-saved videos are skipped, an
    # interrupted and resumed run would otherwise mix files projected with different random
    # weights; seeding the global RNG first keeps the weights identical across runs.
    # NOTE(review): files written before this change used unseeded weights — regenerate them.
    torch.manual_seed(0)
    temporal_encoder = TemporalConvEncoder(in_dim=1536, out_dim=768).to(DEVICE)
    temporal_encoder.eval()

    video_files = [f for f in os.listdir(VIDEO_DIR) if f.endswith(('.mp4', '.avi'))]

    for vid_file in tqdm(video_files, desc="Extracting Features"):
        vid_path = os.path.join(VIDEO_DIR, vid_file)

        # Replace only the final extension with .npy.
        output_path = os.path.join(OUT_DIR, vid_file.rsplit('.', 1)[0] + '.npy')
        if os.path.exists(output_path):
            print(f"Skipping {vid_file}, already extracted.")
            continue

        try:
            feats = extract_features_from_video(model, temporal_encoder, vid_path)
            if feats is not None:
                np.save(output_path, feats)
                print(f"Processed {vid_file} -> {feats.shape}")
            else:
                print(f"Skipping {vid_file}, no features extracted.")
        except Exception as e:
            print(f"Error processing {vid_file}: {e}")

# Entry point: run the extraction when this code is executed as the main module.
# NOTE(review): in a notebook kernel __name__ is normally '__main__', so running this
# cell starts the full extraction — confirm that is intended.
if __name__ == '__main__':
    main()


Extracting Features:   0%|          | 0/28 [00:00<?, ?it/s]

Skipping S3_CofHoney_C1.mp4, already extracted.
Skipping S2_Hotdog_C1.mp4, already extracted.
Skipping S4_Tea_C1.mp4, already extracted.
Skipping S4_CofHoney_C1.mp4, already extracted.
Skipping S3_Hotdog_C1.mp4, already extracted.
Skipping S2_Peanut_C1.mp4, already extracted.
Skipping S3_Coffee_C1.mp4, already extracted.
Skipping S1_Pealate_C1.mp4, already extracted.


Extracting Features:  32%|███▏      | 9/28 [05:15<11:06, 35.05s/it]

Processed S1_CofHoney_C1.mp4 -> (1220, 768)


Extracting Features:  36%|███▌      | 10/28 [08:32<17:11, 57.31s/it]

Processed S2_CofHoney_C1.mp4 -> (808, 768)


Extracting Features:  39%|███▉      | 11/28 [14:22<29:48, 105.19s/it]

Processed S2_Tea_C1.mp4 -> (1397, 768)


Extracting Features:  43%|████▎     | 12/28 [28:53<1:06:43, 250.22s/it]

Processed S2_Pealate_C1.mp4 -> (1166, 768)


Extracting Features:  46%|████▋     | 13/28 [36:58<1:15:05, 300.34s/it]

Processed S3_Cheese_C1.mp4 -> (898, 768)


Extracting Features:  50%|█████     | 14/28 [41:59<1:10:06, 300.44s/it]

Processed S1_Coffee_C1.mp4 -> (1163, 768)


Extracting Features:  54%|█████▎    | 15/28 [49:39<1:13:42, 340.19s/it]

Processed S2_Coffee_C1.mp4 -> (1799, 768)


Extracting Features:  57%|█████▋    | 16/28 [55:25<1:08:22, 341.85s/it]

Processed S3_Tea_C1.mp4 -> (1346, 768)


Extracting Features:  61%|██████    | 17/28 [58:06<53:35, 292.35s/it]  

Processed S2_Cheese_C1.mp4 -> (619, 768)


Extracting Features:  64%|██████▍   | 18/28 [1:02:06<46:16, 277.67s/it]

Processed S4_Coffee_C1.mp4 -> (949, 768)


Extracting Features:  68%|██████▊   | 19/28 [1:06:43<41:38, 277.56s/it]

Processed S3_Peanut_C1.mp4 -> (949, 768)


Extracting Features:  71%|███████▏  | 20/28 [1:09:55<33:41, 252.66s/it]

Processed S4_Hotdog_C1.mp4 -> (640, 768)


Extracting Features:  75%|███████▌  | 21/28 [1:16:21<34:03, 291.87s/it]

Processed S1_Cheese_C1.mp4 -> (928, 768)


Extracting Features:  79%|███████▊  | 22/28 [1:25:42<37:07, 371.21s/it]

Processed S3_Pealate_C1.mp4 -> (1154, 768)


Extracting Features:  82%|████████▏ | 23/28 [1:36:44<38:07, 457.45s/it]

Processed S1_Tea_C1.mp4 -> (1994, 768)


Extracting Features:  86%|████████▌ | 24/28 [1:40:22<25:44, 386.07s/it]

Processed S4_Cheese_C1.mp4 -> (790, 768)


Extracting Features:  89%|████████▉ | 25/28 [1:43:50<16:39, 333.02s/it]

Processed S1_Hotdog_C1.mp4 -> (703, 768)


Extracting Features:  93%|█████████▎| 26/28 [1:51:14<12:12, 366.22s/it]

Processed S1_Peanut_C1.mp4 -> (1628, 768)


Extracting Features:  96%|█████████▋| 27/28 [1:55:06<05:26, 326.16s/it]

Processed S4_Peanut_C1.mp4 -> (919, 768)


Extracting Features: 100%|██████████| 28/28 [1:59:48<00:00, 256.75s/it]

Processed S4_Pealate_C1.mp4 -> (1229, 768)



