In [1]:
import zipfile
import os
# Extract the zipped HMDB subset next to the archive on Google Drive.
zip_path = '/content/drive/MyDrive/AML/HMDB_simp.zip'
extract_path = '/content/drive/MyDrive/AML'

# Create the target folder up front; harmless when it already exists.
os.makedirs(extract_path, exist_ok=True)

# Read mode is ZipFile's default; the context manager closes the archive.
with zipfile.ZipFile(zip_path) as zip_ref:
    zip_ref.extractall(path=extract_path)

print(f"Dataset extracted to {extract_path}")

Dataset extracted to /content/drive/MyDrive/AML


In [2]:
import torch
import numpy as np
import json
from transformers import Trainer, TrainingArguments,EarlyStoppingCallback
from transformers.trainer_utils import IntervalStrategy
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)  # Suppress FutureWarnings
from torch import nn, optim
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training
import logging
import sys
import argparse
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
from sklearn.model_selection import train_test_split

from transformers import AutoImageProcessor, AutoModelForVideoClassification
import os
from transformers import TimesformerForVideoClassification

In [3]:
import shutil
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Paths
# original_root: source tree; copy_video reads <original_root>/<class>/<video_name>.
# subset_root:   destination tree for the train split, written with the same layout.
# csv_path:      CSV manifest listing the videos to copy into the train split.
original_root = "/content/drive/MyDrive/AML/HMDB_simp"
subset_root = "/content/drive/MyDrive/AML/dataset/train"
csv_path = "/content/drive/MyDrive/AML/split/train.csv"

def copy_video(row, src_root=None, dst_root=None):
    """Copy one video's frame directory from the source tree into the split tree.

    Args:
        row: Mapping-like record exposing 'class' and 'video_name'
            (for example a row yielded by ``DataFrame.iterrows``).
        src_root: Root of the source tree. Defaults to the module-level
            ``original_root``.
        dst_root: Root of the destination tree. Defaults to the module-level
            ``subset_root``.

    Returns:
        bool: True when the directory was copied, False when the source is
        missing or is not a directory (a warning is printed in both cases).
    """
    label, video = row['class'], row['video_name']

    # Resolve the notebook-level defaults at call time so that the explicit
    # arguments can be used without depending on mutable cell globals.
    if src_root is None:
        src_root = original_root
    if dst_root is None:
        dst_root = subset_root

    src_path = os.path.join(src_root, label, video)
    dst_path = os.path.join(dst_root, label, video)

    # copytree only accepts directories; a stray file with a matching name
    # would otherwise raise and abort the whole (threaded) copy run.
    if not os.path.isdir(src_path):
        if os.path.exists(src_path):
            print(f"Warning: {src_path} is not a directory.")
        else:
            print(f"Warning: {src_path} does not exist.")
        return False

    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
    return True

# Load the split manifest. copy_video reads the 'class' and 'video_name'
# columns from every row, so both must be present in the CSV.
df = pd.read_csv(csv_path)

# Method 1: Sequential (simple)
def sequential_copy():
    """Copy every video listed in ``df`` one after another and print a tally."""
    # copy_video returns True/False per row; count the successful ones.
    success_count = sum(1 for _idx, record in df.iterrows() if copy_video(record))
    print(f"Copied {success_count}/{len(df)} videos")

# Method 2: Parallel (faster)
def parallel_copy(max_workers=8):
    """Copy every video listed in ``df`` using a thread pool and print a tally.

    Args:
        max_workers (int): Number of worker threads handed to the executor.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        rows = [record for _idx, record in df.iterrows()]
        # list() drains the lazy map, so any worker exception surfaces here.
        outcomes = list(pool.map(copy_video, rows))

    # Booleans add up as 0/1, giving the number of successful copies.
    success_count = sum(outcomes)
    print(f"Copied {success_count}/{len(df)} videos")

# Run parallel copy
# Guarded entry point: announce the workload, copy with parallel_copy's
# default of 8 worker threads, then confirm completion.
if __name__ == "__main__":
    print(f"Copying {len(df)} videos...")
    parallel_copy()
    print("Done!")

Copying 900 videos...
Copied 900/900 videos
Done!


In [4]:
import shutil
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Paths
# original_root: source tree; copy_video reads <original_root>/<class>/<video_name>.
# subset_root:   destination tree for the test split, written with the same layout.
# csv_path:      CSV manifest listing the videos to copy into the test split.
# NOTE: these names re-bind the globals assigned by the previous copy cell.
original_root = "/content/drive/MyDrive/AML/HMDB_simp"
subset_root = "/content/drive/MyDrive/AML/dataset/test"
csv_path = "/content/drive/MyDrive/AML/split/test.csv"

def copy_video(row, src_root=None, dst_root=None):
    """Copy one video's frame directory from the source tree into the split tree.

    Args:
        row: Mapping-like record exposing 'class' and 'video_name'
            (for example a row yielded by ``DataFrame.iterrows``).
        src_root: Root of the source tree. Defaults to the module-level
            ``original_root``.
        dst_root: Root of the destination tree. Defaults to the module-level
            ``subset_root``.

    Returns:
        bool: True when the directory was copied, False when the source is
        missing or is not a directory (a warning is printed in both cases).
    """
    label, video = row['class'], row['video_name']

    # Resolve the notebook-level defaults at call time so that the explicit
    # arguments can be used without depending on mutable cell globals.
    if src_root is None:
        src_root = original_root
    if dst_root is None:
        dst_root = subset_root

    src_path = os.path.join(src_root, label, video)
    dst_path = os.path.join(dst_root, label, video)

    # copytree only accepts directories; a stray file with a matching name
    # would otherwise raise and abort the whole (threaded) copy run.
    if not os.path.isdir(src_path):
        if os.path.exists(src_path):
            print(f"Warning: {src_path} is not a directory.")
        else:
            print(f"Warning: {src_path} does not exist.")
        return False

    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
    return True

# Load the split manifest. copy_video reads the 'class' and 'video_name'
# columns from every row, so both must be present in the CSV.
df = pd.read_csv(csv_path)

# Method 1: Sequential (simple)
def sequential_copy():
    """Copy every video listed in ``df`` one after another and print a tally."""
    # copy_video returns True/False per row; count the successful ones.
    success_count = sum(1 for _idx, record in df.iterrows() if copy_video(record))
    print(f"Copied {success_count}/{len(df)} videos")

# Method 2: Parallel (faster)
def parallel_copy(max_workers=8):
    """Copy every video listed in ``df`` using a thread pool and print a tally.

    Args:
        max_workers (int): Number of worker threads handed to the executor.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        rows = [record for _idx, record in df.iterrows()]
        # list() drains the lazy map, so any worker exception surfaces here.
        outcomes = list(pool.map(copy_video, rows))

    # Booleans add up as 0/1, giving the number of successful copies.
    success_count = sum(outcomes)
    print(f"Copied {success_count}/{len(df)} videos")

# Run parallel copy
# Guarded entry point: announce the workload, copy with parallel_copy's
# default of 8 worker threads, then confirm completion.
if __name__ == "__main__":
    print(f"Copying {len(df)} videos...")
    parallel_copy()
    print("Done!")

Copying 125 videos...
Copied 125/125 videos
Done!


In [5]:
import shutil
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Paths
# original_root: source tree; copy_video reads <original_root>/<class>/<video_name>.
# subset_root:   destination tree for the val split, written with the same layout.
# csv_path:      CSV manifest listing the videos to copy into the val split.
# NOTE: these names re-bind the globals assigned by the previous copy cell.
original_root = "/content/drive/MyDrive/AML/HMDB_simp"
subset_root = "/content/drive/MyDrive/AML/dataset/val"
csv_path = "/content/drive/MyDrive/AML/split/val.csv"

def copy_video(row, src_root=None, dst_root=None):
    """Copy one video's frame directory from the source tree into the split tree.

    Args:
        row: Mapping-like record exposing 'class' and 'video_name'
            (for example a row yielded by ``DataFrame.iterrows``).
        src_root: Root of the source tree. Defaults to the module-level
            ``original_root``.
        dst_root: Root of the destination tree. Defaults to the module-level
            ``subset_root``.

    Returns:
        bool: True when the directory was copied, False when the source is
        missing or is not a directory (a warning is printed in both cases).
    """
    label, video = row['class'], row['video_name']

    # Resolve the notebook-level defaults at call time so that the explicit
    # arguments can be used without depending on mutable cell globals.
    if src_root is None:
        src_root = original_root
    if dst_root is None:
        dst_root = subset_root

    src_path = os.path.join(src_root, label, video)
    dst_path = os.path.join(dst_root, label, video)

    # copytree only accepts directories; a stray file with a matching name
    # would otherwise raise and abort the whole (threaded) copy run.
    if not os.path.isdir(src_path):
        if os.path.exists(src_path):
            print(f"Warning: {src_path} is not a directory.")
        else:
            print(f"Warning: {src_path} does not exist.")
        return False

    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
    return True

# Load the split manifest. copy_video reads the 'class' and 'video_name'
# columns from every row, so both must be present in the CSV.
df = pd.read_csv(csv_path)

# Method 1: Sequential (simple)
def sequential_copy():
    """Copy every video listed in ``df`` one after another and print a tally."""
    # copy_video returns True/False per row; count the successful ones.
    success_count = sum(1 for _idx, record in df.iterrows() if copy_video(record))
    print(f"Copied {success_count}/{len(df)} videos")

# Method 2: Parallel (faster)
def parallel_copy(max_workers=8):
    """Copy every video listed in ``df`` using a thread pool and print a tally.

    Args:
        max_workers (int): Number of worker threads handed to the executor.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        rows = [record for _idx, record in df.iterrows()]
        # list() drains the lazy map, so any worker exception surfaces here.
        outcomes = list(pool.map(copy_video, rows))

    # Booleans add up as 0/1, giving the number of successful copies.
    success_count = sum(outcomes)
    print(f"Copied {success_count}/{len(df)} videos")

# Run parallel copy
# Guarded entry point: announce the workload, copy with parallel_copy's
# default of 8 worker threads, then confirm completion.
if __name__ == "__main__":
    print(f"Copying {len(df)} videos...")
    parallel_copy()
    print("Done!")

Copying 225 videos...
Copied 225/225 videos
Done!


In [39]:
import torch
import numpy as np
import json
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import VivitImageProcessor, VivitForVideoClassification
import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import random

class ViViTHMDBDataset(Dataset):
    """
    Frame-folder video dataset that yields fixed-length clips for ViViT.

    Expected layout: ``<split_dir>/<class_name>/<video_folder>/<frame image>``.
    Class indices follow the alphabetical order of the class folder names.

    The frames of each clip are selected once, while the dataset is being
    built, so a given video returns the same frames on every epoch (this also
    holds for the 'random' strategy).
    """

    # Lower-case file extensions that are treated as video frames.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

    def __init__(self, split_dir, clip_size=32, sampling_strategy='random', transform=None):
        """
        Dataset for ViViT model

        Args:
            split_dir (str): Path to split directory (train, val, or test folder)
            clip_size (int): Number of frames per clip (ViViT typically uses 32)
            sampling_strategy (str): 'random', 'equidistant' or 'vivit_uniform';
                any other value falls back to 'random'
            transform: Torchvision transforms applied to every frame. It has to
                return tensors, because ``__getitem__`` stacks the results.
        """
        self.split_dir = split_dir
        self.clip_size = clip_size
        self.sampling_strategy = sampling_strategy
        self.transform = transform

        # Get all class folders in the split directory
        self.classes = sorted(d for d in os.listdir(split_dir)
                              if os.path.isdir(os.path.join(split_dir, d)))
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
        print(f"Found {len(self.classes)} classes")

        self.data = self._load_data()

    @staticmethod
    def _evenly_spaced_indices(num_available, num_frames):
        """Return ``num_frames`` non-decreasing indices spread over ``num_available`` items."""
        step = num_available / num_frames
        return [int(i * step) for i in range(num_frames)]

    @staticmethod
    def _blank_frame():
        """Return a black 224x224 RGB image used when no frame can be read."""
        return Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

    def _random_sample_frames(self, all_frames, num_frames):
        """Randomly pick ``num_frames`` frames and return them in temporal order.

        Sampling is without replacement when enough frames exist and with
        replacement otherwise. The drawn indices are sorted so the clip keeps
        the chronological order of ``all_frames`` instead of a shuffled one.
        """
        num_available = len(all_frames)

        if num_available >= num_frames:
            picked = random.sample(range(num_available), num_frames)
        else:
            picked = random.choices(range(num_available), k=num_frames)
        return [all_frames[i] for i in sorted(picked)]

    def _equidistant_sample_frames(self, all_frames, num_frames):
        """Pick ``num_frames`` evenly spaced frames.

        When the video has no more frames than requested, the frames are
        cycled from the start (index ``i % num_available``) to fill the clip.
        """
        num_available = len(all_frames)

        if num_available <= num_frames:
            indices = [i % num_available for i in range(num_frames)]
        else:
            indices = self._evenly_spaced_indices(num_available, num_frames)
        return [all_frames[i] for i in indices]

    def _vivit_uniform_sample_frames(self, all_frames, num_frames, stride=2):
        """
        ViViT paper uniform temporal sampling
        Sample with stride (every 2nd frame) as described in paper

        If the video holds at least ``num_frames * stride`` frames, a window of
        that length is centred in the video and every ``stride``-th frame is
        taken. Shorter videos are sampled evenly over all available frames,
        which repeats frames when fewer than ``num_frames`` exist.
        """
        num_available = len(all_frames)
        total_needed = num_frames * stride

        if num_available >= total_needed:
            # Centre the sampling window inside the video.
            start_idx = (num_available - total_needed) // 2
            indices = [start_idx + i * stride for i in range(num_frames)]
        else:
            indices = self._evenly_spaced_indices(num_available, num_frames)
        return [all_frames[i] for i in indices]

    def _load_data(self):
        """Scan the split directory and build one clip entry per video.

        Returns:
            list[tuple]: ``(video_path, label, clip_index, frame_names)`` where
            ``clip_index`` is always 0 and ``frame_names`` holds ``clip_size``
            file names relative to ``video_path``.
        """
        samplers = {
            'random': self._random_sample_frames,
            'equidistant': self._equidistant_sample_frames,
            'vivit_uniform': self._vivit_uniform_sample_frames,
        }
        # Unknown strategy names fall back to random sampling.
        sampler = samplers.get(self.sampling_strategy, self._random_sample_frames)

        data = []

        for class_name in self.classes:
            class_path = os.path.join(self.split_dir, class_name)
            label = self.class_to_idx[class_name]

            # Sorted so that the sample order does not depend on the filesystem.
            video_folders = sorted(v for v in os.listdir(class_path)
                                   if os.path.isdir(os.path.join(class_path, v)))

            for video_folder in video_folders:
                # Folders whose name ends in "_<digits>" are skipped.
                # NOTE(review): presumably derived/augmented clips - confirm.
                if '_' in video_folder and video_folder.split('_')[-1].isdigit():
                    continue

                video_path = os.path.join(class_path, video_folder)
                all_frames = sorted(f for f in os.listdir(video_path)
                                    if f.lower().endswith(self.IMAGE_EXTENSIONS))

                # Videos without any frame image cannot form a clip.
                if not all_frames:
                    continue

                clip_frames = sampler(all_frames, self.clip_size)
                data.append((video_path, label, 0, clip_frames))

        print(f"Total clips: {len(data)}")
        return data

    def __len__(self):
        """Return the number of clips (one per usable video)."""
        return len(self.data)

    def _get_fallback_frame(self, video_path):
        """Return a replacement frame when a requested frame cannot be loaded.

        Tries the video's frame files in sorted order and returns the first
        one that decodes; if none does, a black 224x224 RGB image is returned.
        """
        try:
            candidates = sorted(f for f in os.listdir(video_path)
                                if f.lower().endswith(self.IMAGE_EXTENSIONS))
        except OSError:
            candidates = []

        for name in candidates:
            try:
                with Image.open(os.path.join(video_path, name)) as img:
                    return img.convert('RGB')
            except Exception:
                # Unreadable candidate: try the next file.
                continue
        return self._blank_frame()

    def _load_regular_frame(self, video_path, frame_name):
        """Load one frame as an RGB image, substituting a fallback on failure.

        ``Image.open`` is lazy, so the pixel data is decoded inside the ``try``
        via ``convert('RGB')``. That surfaces corrupt files here, closes the
        file handle, and guarantees three channels for the normalisation step.
        """
        frame_path = os.path.join(video_path, frame_name)
        if not os.path.exists(frame_path):
            return self._get_fallback_frame(video_path)
        try:
            with Image.open(frame_path) as img:
                return img.convert('RGB')
        except Exception:
            return self._get_fallback_frame(video_path)

    def _load_frames_from_clip(self, video_path, clip_frames):
        """Load the frames of one clip and force the result to ``clip_size`` items.

        Short clips are padded by repeating the last frame (or a black frame
        when nothing was loaded); long clips are truncated.
        """
        frames = [self._load_regular_frame(video_path, frame_name)
                  for frame_name in clip_frames]

        while len(frames) < self.clip_size:
            frames.append(frames[-1] if frames else self._blank_frame())

        return frames[:self.clip_size]

    def __getitem__(self, idx):
        """Return ``(clip, label)`` for the clip at position ``idx``.

        ``clip`` is the stack of the transformed frames; with a transform that
        produces (C, H, W) tensors this is presumably (clip_size, C, H, W).
        """
        video_path, label, clip_idx, clip_frames = self.data[idx]

        frames = self._load_frames_from_clip(video_path, clip_frames)

        if self.transform:
            frames = [self.transform(frame) for frame in frames]

        return torch.stack(frames), label

def get_vivit_dataloader(root_dir, batch_size=8, clip_size=32, sampling_strategy='vivit_uniform'):
    """
    Build the train/val/test dataloaders used for ViViT.

    Args:
        root_dir (str): Directory that contains the 'train', 'val' and 'test' folders
        batch_size (int): Batch size shared by all three loaders
        clip_size (int): Number of frames per clip
        sampling_strategy (str): Temporal sampling strategy passed to every dataset

    Returns:
        tuple: (train_loader, val_loader, test_loader); only the training
        loader shuffles.

    Raises:
        ValueError: If one of the three split directories does not exist.
    """
    # Per-frame preprocessing: fixed 224x224 resize, tensor conversion, then
    # normalisation with the ImageNet mean/std constants.
    # NOTE(review): confirm these statistics match what the checkpoint expects.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    split_paths = {name: os.path.join(root_dir, name) for name in ('train', 'val', 'test')}

    # Validate every split before any dataset starts scanning the disk.
    for name, split_dir in split_paths.items():
        if not os.path.exists(split_dir):
            raise ValueError(f"{name} directory not found at {split_dir}")

    def build_dataset(split_dir):
        """Create a dataset for one split with the shared clip settings."""
        return ViViTHMDBDataset(
            split_dir=split_dir,
            clip_size=clip_size,
            sampling_strategy=sampling_strategy,
            transform=transform
        )

    def build_loader(dataset, shuffle):
        """Wrap a dataset in a single-process loader without pinned memory."""
        return DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle,
            num_workers=0, pin_memory=False
        )

    # Datasets are created in train -> val -> test order (each prints a summary).
    train_dataset = build_dataset(split_paths['train'])
    val_dataset = build_dataset(split_paths['val'])
    test_dataset = build_dataset(split_paths['test'])

    train_loader = build_loader(train_dataset, shuffle=True)
    val_loader = build_loader(val_dataset, shuffle=False)
    test_loader = build_loader(test_dataset, shuffle=False)

    print(f"Train: {len(train_dataset)} | Val: {len(val_dataset)} | Test: {len(test_dataset)}")
    return train_loader, val_loader, test_loader

def load_vivit_model(checkpoint="google/vivit-b-16x2-kinetics400"):
    """
    Load the pre-trained ViViT image processor and classification model.

    The classification head is re-initialised for the 25-class HMDB subset:
    ``ignore_mismatched_sizes=True`` lets the checkpoint load even though its
    head was trained for a different number of labels.

    Args:
        checkpoint (str): Hugging Face model id or local path to load from.

    Returns:
        tuple: (processor, model)

    Raises:
        Whatever ``from_pretrained`` raises when the checkpoint cannot be
        loaded. The previous bare ``except`` retried the identical checkpoint,
        so it could never recover and only hid the first traceback.
    """
    import logging
    # Silence the expected "weights not initialized" notice for the new head.
    warnings.filterwarnings("ignore", message="Some weights of VivitForVideoClassification were not initialized")
    logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

    # HMDB-51 subset classes. The indices follow the alphabetical order of the
    # class names, which is how ViViTHMDBDataset numbers its class folders.
    label_index_dict = {
        'brush_hair': 0, 'cartwheel': 1, 'catch': 2, 'chew': 3, 'climb': 4,
        'climb_stairs': 5, 'draw_sword': 6, 'eat': 7, 'fencing': 8, 'flic_flac': 9,
        'golf': 10, 'handstand': 11, 'kiss': 12, 'pick': 13, 'pour': 14,
        'pullup': 15, 'pushup': 16, 'ride_bike': 17, 'shoot_bow': 18, 'shoot_gun': 19,
        'situp': 20, 'smile': 21, 'smoke': 22, 'throw': 23, 'wave': 24
    }

    index_label_dict = {v: k for k, v in label_index_dict.items()}

    # Load ViViT processor and model from the same checkpoint.
    processor = VivitImageProcessor.from_pretrained(checkpoint)

    model = VivitForVideoClassification.from_pretrained(
        checkpoint,
        label2id=label_index_dict,
        id2label=index_label_dict,
        ignore_mismatched_sizes=True
    )

    return processor, model

def compute_metrics(eval_pred):
    """Compute top-1 and top-5 accuracy from a ``(predictions, labels)`` pair.

    Torch tensors are moved to CPU and converted to numpy first. Each accuracy
    is returned twice, once with and once without the ``eval_`` prefix.
    """
    scores, targets = eval_pred

    if isinstance(scores, torch.Tensor):
        scores = scores.cpu().numpy()
    if isinstance(targets, torch.Tensor):
        targets = targets.cpu().numpy()

    # Top-1: the highest-scoring class has to equal the label.
    best_class = np.argmax(scores, axis=1)
    top1_accuracy = np.mean(best_class == targets)

    # Top-5: the label has to be among the five highest-scoring classes
    # (the last five columns of the ascending argsort).
    five_best = np.argsort(scores, axis=1)[:, -5:]
    hits = np.array([targets[row] in five_best[row] for row in range(len(targets))])
    top5_accuracy = np.mean(hits)

    metrics = {}
    for prefix in ("", "eval_"):
        metrics[f"{prefix}top1_accuracy"] = top1_accuracy
        metrics[f"{prefix}top5_accuracy"] = top5_accuracy
    return metrics

def train_vivit_model(data_dir, epochs=8, batch_size=8, learning_rate=0.1,
                     clip_size=32, sampling_strategy='vivit_uniform', optimizer_type='sgd'):
    """
    Fine-tune ViViT with the Trainer API and evaluate it on the test split.

    Args:
        data_dir (str): Path to dataset directory (holds train/val/test folders)
        epochs (int): Number of training epochs
        batch_size (int): Per-device batch size for training and evaluation
        learning_rate (float): Learning rate handed to the optimizer
        clip_size (int): Number of frames per clip
        sampling_strategy (str): Temporal sampling strategy
        optimizer_type (str): 'adamw' selects AdamW; any other value selects SGD

    Returns:
        dict: Metrics returned by ``trainer.evaluate`` on the test split.

    Side effects:
        Writes checkpoints and logs, the final model and a JSON file with the
        test metrics below ``/content/drive/MyDrive/AML``.
    """
    def _fmt(value):
        """Format a numeric metric with 4 decimals; pass placeholders through."""
        return f"{value:.4f}" if isinstance(value, (int, float)) else str(value)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print(f"Using device: {device}")
    print(f"ViViT Configuration:")
    print(f"  - Frames per clip: {clip_size}")
    print(f"  - Batch size: {batch_size}")
    print(f"  - Learning rate: {learning_rate}")
    print(f"  - Sampling strategy: {sampling_strategy}")
    print(f"  - Optimizer: {optimizer_type}")

    if use_cuda:
        torch.cuda.empty_cache()

    # Load data with ViViT-specific parameters
    train_loader, val_loader, test_loader = get_vivit_dataloader(
        data_dir,
        batch_size=batch_size,
        clip_size=clip_size,
        sampling_strategy=sampling_strategy
    )

    # The frames are preprocessed by the dataset transform, so the processor
    # returned alongside the model is not needed here.
    _processor, model = load_vivit_model()

    # Trainer builds its own loaders; only the underlying datasets are used.
    train_dataset = train_loader.dataset
    val_dataset = val_loader.dataset

    def data_collator(batch):
        """Stack (clip, label) samples into the keyword inputs the model expects."""
        videos = torch.stack([item[0] for item in batch])
        labels = torch.tensor([item[1] for item in batch])
        return {
            'pixel_values': videos,
            'labels': labels
        }

    # Choose optimizer
    if optimizer_type.lower() == 'adamw':
        optimizer = optim.AdamW(
            model.parameters(),
            lr=learning_rate,
            weight_decay=1e-4,
            betas=(0.9, 0.999)
        )
    else:  # SGD
        optimizer = optim.SGD(
            model.parameters(),
            lr=learning_rate,
            momentum=0.9,
            weight_decay=1e-3
        )

    # Create output directory
    output_dir = f"/content/drive/MyDrive/AML/vivit_results_{sampling_strategy}_{optimizer_type}"
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'logs'), exist_ok=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        eval_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        logging_dir=os.path.join(output_dir, 'logs'),
        logging_strategy="epoch",
        save_strategy='epoch',
        save_total_limit=1,
        remove_unused_columns=False,
        load_best_model_at_end=True,
        metric_for_best_model='eval_top1_accuracy',
        greater_is_better=True,
        report_to="tensorboard",
        push_to_hub=False,
        save_only_model=True,
        run_name=f"vivit_{sampling_strategy}_{optimizer_type}",
        gradient_accumulation_steps=4,  # Help with small batch size
        # Mixed precision for memory efficiency; only enabled with CUDA so the
        # CPU fallback announced above does not fail on argument validation.
        fp16=use_cuda,
        dataloader_pin_memory=False,
        dataloader_num_workers=0,
    )

    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=4)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        # Custom optimizer; passing None lets Trainer create the LR scheduler.
        optimizers=(optimizer, None),
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback]
    )

    print("Starting ViViT training...")
    trainer.train()

    print("\nEvaluating ViViT on test set...")
    test_dataset = test_loader.dataset
    test_results = trainer.evaluate(eval_dataset=test_dataset)

    # _fmt keeps the report from raising when a metric is missing: applying
    # ':.4f' directly to the 'N/A' placeholder is a ValueError.
    print(f"\n=== FINAL ViViT TEST RESULTS ===")
    print(f"Configuration: {clip_size} frames, {sampling_strategy} sampling, {optimizer_type} optimizer")
    print(f"Test Top-1 Accuracy: {_fmt(test_results.get('eval_top1_accuracy', 'N/A'))}")
    print(f"Test Top-5 Accuracy: {_fmt(test_results.get('eval_top5_accuracy', 'N/A'))}")
    print(f"Test Loss: {_fmt(test_results.get('eval_loss', 'N/A'))}")

    # Save model
    model_save_path = f"/content/drive/MyDrive/AML/vivit_model_{sampling_strategy}_{optimizer_type}"
    trainer.save_model(model_save_path)

    # Save results
    results_save_path = f"/content/drive/MyDrive/AML/vivit_results_{sampling_strategy}_{optimizer_type}.json"
    with open(results_save_path, "w") as f:
        json.dump(test_results, f, indent=2)

    print(f"ViViT training complete.")
    print(f"Model saved to: {model_save_path}")
    print(f"Results saved to: {results_save_path}")

    return test_results

# Clean up before training
def setup_vivit_training():
    """Empty PyTorch's CUDA cache and print a confirmation message."""
    torch.cuda.empty_cache()
    status_message = "GPU memory cleared for ViViT training"
    print(status_message)

# Usage Examples:

def train_vivit_paper_config():
    """Clear the GPU cache, then train with the paper-style preset.

    Returns:
        The test metrics returned by ``train_vivit_model``.
    """
    setup_vivit_training()

    paper_settings = {
        'data_dir': '/content/drive/MyDrive/AML/dataset',
        'epochs': 8,
        'batch_size': 8,            # Small due to 32 frames
        'learning_rate': 0.1,
        'clip_size': 32,            # ViViT paper uses 32 frames
        'sampling_strategy': 'vivit_uniform',
        'optimizer_type': 'sgd',
    }
    return train_vivit_model(**paper_settings)
# Start training with the paper-style preset. As the last expression of the
# cell, the returned test-metrics dict is rendered as the cell output.
train_vivit_paper_config()

GPU memory cleared for ViViT training
Using device: cuda
ViViT Configuration:
  - Frames per clip: 32
  - Batch size: 8
  - Learning rate: 0.1
  - Sampling strategy: vivit_uniform
  - Optimizer: sgd
Found 25 classes
Total clips: 900
Found 25 classes
Total clips: 225
Found 25 classes
Total clips: 125
Train: 900 | Val: 225 | Test: 125
Starting ViViT training...


Epoch,Training Loss,Validation Loss,Top1 Accuracy,Top5 Accuracy
1,1.3595,0.762734,0.8,0.951111
2,0.4684,0.926487,0.702222,0.96
3,0.2781,0.776938,0.76,0.96
4,0.1097,0.588663,0.844444,0.973333
5,0.076,0.560717,0.817778,0.968889
6,0.0293,0.585824,0.835556,0.968889
7,0.0108,0.559441,0.853333,0.96
8,0.0023,0.536043,0.866667,0.968889



Evaluating ViViT on test set...



=== FINAL ViViT TEST RESULTS ===
Configuration: 32 frames, vivit_uniform sampling, sgd optimizer
Test Top-1 Accuracy: 0.8880
Test Top-5 Accuracy: 0.9920
Test Loss: 0.4021
ViViT training complete.
Model saved to: /content/drive/MyDrive/AML/vivit_model_vivit_uniform_sgd
Results saved to: /content/drive/MyDrive/AML/vivit_results_vivit_uniform_sgd.json


{'eval_top1_accuracy': 0.888,
 'eval_top5_accuracy': 0.992,
 'eval_loss': 0.4020753502845764,
 'eval_runtime': 19.1352,
 'eval_samples_per_second': 6.532,
 'eval_steps_per_second': 0.836,
 'epoch': 8.0}