In [1]:
!pip install transformers datasets accelerate torch torchvision

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [9

In [5]:
!wget --no-check-certificate https://www.crcv.ucf.edu/data/UCF101/UCF101.rar
!unrar x UCF101.rar

--2024-05-23 08:21:16--  https://www.crcv.ucf.edu/data/UCF101/UCF101.rar
Resolving www.crcv.ucf.edu (www.crcv.ucf.edu)... 132.170.214.127
Connecting to www.crcv.ucf.edu (www.crcv.ucf.edu)|132.170.214.127|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 6932971618 (6.5G) [application/rar]
Saving to: ‘UCF101.rar.1’

UCF101.rar.1         22%[===>                ]   1.43G  85.9MB/s    eta 59s    ^C

UNRAR 6.11 beta 1 freeware      Copyright (c) 1993-2022 Alexander Roshal


Extracting from UCF101.rar


Would you like to replace the existing file UCF-101/ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi
294566 bytes, modified on 2012-10-01 08:15
with a new one
294566 bytes, modified on 2012-10-01 08:15

[Y]es, [N]o, [A]ll, n[E]ver, [R]ename, [Q]uit 
User break

User break


In [6]:
import os
import cv2
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

class UCF101Dataset(Dataset):
    """Map-style dataset that decodes ``.avi`` clips found under a directory.

    Every ``.avi`` file below ``data_dir`` (searched recursively) is one sample.
    Indexing decodes *all* frames of that clip with OpenCV and returns them
    together with the clip's class index.

    Attributes:
        data_dir: Root directory that was scanned.
        transform: Optional per-frame callable (see ``__init__``).
        video_files: Sorted list of clip paths; index ``i`` is sample ``i``.
        classes: Sorted list of class names found in ``video_files``.
        class_to_idx: Mapping from class name to its position in ``classes``.
    """

    def __init__(self, data_dir, transform=None):
        """Scan ``data_dir`` for clips and build the class index.

        Args:
            data_dir: Directory searched recursively for ``.avi`` files.
            transform: Optional callable applied to each decoded frame (an RGB
                ``H x W x 3`` uint8 array). When omitted, frames are converted
                to tensors as-is.

        Raises:
            ValueError: If no ``.avi`` file is found under ``data_dir``.
        """
        self.data_dir = data_dir
        self.transform = transform
        self.video_files = self._get_video_files(data_dir)

        if len(self.video_files) == 0:
            raise ValueError(f"No video files found in directory: {data_dir}")

        # Labels are exposed as integers so a batch of them can be turned into
        # a tensor; the name <-> index mapping is kept for decoding predictions.
        self.classes = sorted({self._class_name(path) for path in self.video_files})
        self.class_to_idx = {name: index for index, name in enumerate(self.classes)}

    @staticmethod
    def _class_name(video_file):
        """Return the class name of a clip.

        Clips are expected to be named ``v_<Class>_gXX_cYY.avi``; when a file
        name has no ``_``-separated second field, the parent directory name is
        used instead.
        """
        parts = os.path.basename(video_file).split('_')
        if len(parts) > 1 and parts[1]:
            return parts[1]
        return os.path.basename(os.path.dirname(video_file))

    def _get_video_files(self, dir):
        """Return the sorted paths of all ``.avi`` files below ``dir``."""
        video_files = [
            os.path.join(root, file)
            for root, _, files in os.walk(dir)
            for file in files
            if file.lower().endswith('.avi')
        ]
        # os.walk yields entries in arbitrary order; sorting makes sample
        # indices reproducible across runs and machines.
        video_files.sort()
        return video_files

    def __len__(self):
        """Number of clips in the dataset."""
        return len(self.video_files)

    def __getitem__(self, idx):
        """Decode clip ``idx``.

        Returns:
            ``(frames, label)`` where ``frames`` stacks every decoded frame
            along a new leading (time) dimension and ``label`` is the integer
            index of the clip's class in ``self.classes``.

        Raises:
            ValueError: If no frame could be read from the file.
        """
        video_file = self.video_files[idx]
        cap = cv2.VideoCapture(video_file)
        frames = []
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                # OpenCV decodes to BGR; image transforms such as ToPILImage
                # interpret 3-channel arrays as RGB, so swap the channels.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if self.transform:
                    frame = self.transform(frame)
                else:
                    # torch.stack below needs tensors, not numpy arrays.
                    frame = torch.from_numpy(frame)
                frames.append(frame)
        finally:
            # Release the decoder even when the transform raises.
            cap.release()
        if len(frames) == 0:
            raise ValueError(f"No frames extracted from video: {video_file}")
        frames = torch.stack(frames)
        label = self.class_to_idx[self._class_name(video_file)]
        return frames, label

In [7]:
# Folder produced by extracting the archive (one sub-directory per action).
data_dir = 'UCF-101'

# Per-frame preprocessing, applied in order: array -> PIL image,
# resize to 224x224, PIL image -> tensor.
frame_pipeline = [
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
]
transform = transforms.Compose(frame_pipeline)

dataset = UCF101Dataset(data_dir, transform)

In [8]:
def collate_fn(batch):
    """Merge variable-length clips into one zero-padded batch.

    Args:
        batch: Non-empty sequence of ``(video, label)`` pairs. Each ``video`` is
            a tensor whose first dimension is time (number of frames); all
            remaining dimensions must match across the batch.

    Returns:
        ``(padded_videos, labels)``. ``padded_videos`` has shape
        ``(batch, max_frames, *frame_shape)``; clips shorter than the longest
        one are padded with zero frames at the end. ``labels`` is a tensor for
        numeric labels, or a plain list when the labels are strings (which
        cannot be stored in a tensor).

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("collate_fn received an empty batch")

    videos, labels = zip(*batch)
    max_len = max(video.size(0) for video in videos)

    padded_videos = []
    for video in videos:
        pad_size = max_len - video.size(0)
        if pad_size > 0:
            # Append zero frames along dim 0 (time). F.pad's tuple counts from
            # the LAST dimension, so a 6-element tuple on a 4-D clip would pad
            # the channel axis instead; concatenating avoids that pitfall and
            # works for any frame rank.
            filler = video.new_zeros((pad_size, *video.shape[1:]))
            video = torch.cat([video, filler], dim=0)
        padded_videos.append(video)

    padded_videos = torch.stack(padded_videos)
    if any(isinstance(label, str) for label in labels):
        labels = list(labels)
    else:
        labels = torch.tensor(labels)
    return padded_videos, labels

In [9]:
# Shuffled mini-batches of four clips; collate_fn merges the clips of a batch.
dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=4,
)

In [1]:
# Dataset overview: total clip count and a few paths from deep in the file list.
print(f"Number of video files: {len(dataset)}")
print(f"Sample video files: {dataset.video_files[10000:10005]}")

# Pull a single batch (and no more) to sanity-check what the dataloader emits.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    frames, labels = first_batch
    print(f"Batch size: {frames.size(0)}")
    print(f"Frame tensor shape: {frames.shape}")
    print(f"Labels: {labels}")

NameError: name 'dataset' is not defined

In [None]:
from transformers import VideoMAEForVideoClassification, VideoMAEFeatureExtractor, TrainingArguments, Trainer

# Load model and feature extractor
CHECKPOINT = 'MCG-NJU/videomae-base-finetuned-kinetics'
NUM_CLASSES = 101  # number of action categories in UCF-101

# The checkpoint ships with a Kinetics classification head. Request a head
# sized for UCF-101 instead; `ignore_mismatched_sizes=True` lets the pretrained
# backbone load while the classifier, whose shape no longer matches, is
# freshly initialised for fine-tuning.
model = VideoMAEForVideoClassification.from_pretrained(
    CHECKPOINT,
    num_labels=NUM_CLASSES,
    ignore_mismatched_sizes=True,
)
feature_extractor = VideoMAEFeatureExtractor.from_pretrained(CHECKPOINT)

In [None]:
# --- Train / eval split ------------------------------------------------------
# Trainer needs separate train and eval datasets; carve them out of `dataset`
# with a seeded generator so the split is reproducible.
# NOTE(review): this is a random per-clip split. Clips cut from the same source
# recording can end up on both sides; switch to the official UCF-101 split
# lists if the evaluation numbers are going to be reported.
EVAL_FRACTION = 0.2
SPLIT_SEED = 42

eval_size = max(1, int(len(dataset) * EVAL_FRACTION))
train_dataset, eval_dataset = torch.utils.data.random_split(
    dataset,
    [len(dataset) - eval_size, eval_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# --- Batch collation ---------------------------------------------------------
# Class-name -> id lookup, derived from the `v_<Class>_gXX_cYY.avi` file names.
_class_names = sorted({
    parts[1]
    for parts in (os.path.basename(path).split('_') for path in dataset.video_files)
    if len(parts) > 1
})
_label2id = {name: index for index, name in enumerate(_class_names)}

# VideoMAE consumes clips of a fixed number of frames, normalised with the
# statistics stored alongside the checkpoint.
_num_frames = model.config.num_frames
_mean = torch.tensor(feature_extractor.image_mean).view(1, -1, 1, 1)
_std = torch.tensor(feature_extractor.image_std).view(1, -1, 1, 1)


def data_collator(examples):
    """Turn ``(frames, label)`` samples into the dict batch Trainer passes to the model.

    Each clip is reduced to ``_num_frames`` frames sampled uniformly over its
    length (frames repeat when the clip is shorter) and normalised per channel.
    Labels may be class-name strings or integer ids.

    Assumes ``frames`` is a float tensor shaped (time, channels, height, width)
    with values in [0, 1], as produced by a ``ToTensor`` frame transform.
    """
    clips, targets = [], []
    for frames, label in examples:
        picks = torch.linspace(0, frames.size(0) - 1, _num_frames).round().long()
        clips.append((frames[picks] - _mean) / _std)
        targets.append(_label2id[label] if isinstance(label, str) else int(label))
    return {"pixel_values": torch.stack(clips), "labels": torch.tensor(targets)}


# --- Training ----------------------------------------------------------------
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    # Samples are (frames, label) tuples, not column dicts; leave them intact
    # for data_collator.
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=feature_extractor,
    data_collator=data_collator
)

trainer.train()