In [None]:
# Mount Google Drive at /content/drive. The checkpoint and the demo videos
# used further down are read from / written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# install packages needed for TimeSformer
! pip install torchvision
! pip install 'git+https://github.com/facebookresearch/fvcore'
! pip install simplejson
! pip install einops
! pip install timm
! pip install psutil
! pip install scikit-learn
! pip install opencv-python
! pip install tensorboard
! pip install av

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchvision)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86

In [None]:
# Fetch the TimeSformer sources. The `timesformer` package imported later is
# not pip-installed above, so it presumably comes from this checkout.
# Re-running this cell when the directory already exists only makes git print
# an error; the `!` shell escape does not raise in Python.
! git clone https://github.com/yiyixuxu/TimeSformer.git

Cloning into 'TimeSformer'...
remote: Enumerating objects: 503, done.[K
remote: Counting objects: 100% (165/165), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 503 (delta 117), reused 90 (delta 90), pack-reused 338 (from 1)[K
Receiving objects: 100% (503/503), 2.89 MiB | 29.31 MiB/s, done.
Resolving deltas: 100% (200/200), done.


In [None]:
# Work from inside the cloned repository.
# NOTE(review): the later `from timesformer.models.vit import ...` appears to
# rely on this being the current working directory - confirm before reordering cells.
%cd TimeSformer/

/content/TimeSformer


In [None]:
import torch
import cv2
from PIL import Image
import os
from torchvision import transforms
from timesformer.models.vit import TimeSformer

# === Configuration ===
clip_size = 16  # number of consecutive frames classified together
# Index i of this list is reported when the model's argmax is i.
# NOTE(review): the order must match the label indices used in training - confirm.
class_names = ['Fight', 'NonFight']  # Update as needed

# Pick the device once, before the model is built, so that a CPU-only runtime
# never reaches a hard-coded CUDA call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Transform ===
# Per-frame preprocessing: resize to 224x224, convert to a float tensor and
# normalise every channel with mean 0.45 / std 0.225.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.45, 0.45, 0.45], std=[0.225, 0.225, 0.225])
])

# === Load Pre-trained Model ===
model = TimeSformer(
    img_size=224,
    num_classes=2,
    num_frames=clip_size,
    attention_type='divided_space_time'
).to(device)

# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a GPU can still be loaded when only a CPU is available.
checkpoint = torch.load(
    '/content/drive/MyDrive/timesformer/pretrained/final.pth',
    map_location=device,
)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # inference mode for layers such as dropout

# === Helper to Predict a Clip ===
def predict_clip(frames):
    """Classify one clip of BGR frames with the module-level ``model``.

    Every frame is converted from BGR to RGB, wrapped in a PIL image and passed
    through the module-level ``transform``. The per-frame tensors are stacked
    and rearranged before being handed to the model on ``device``.

    Args:
        frames: Iterable of BGR images accepted by ``cv2.cvtColor``.
            NOTE(review): the model was built with ``num_frames=clip_size``,
            so callers presumably must pass exactly ``clip_size`` frames.

    Returns:
        A ``(label, probs)`` tuple: ``label`` is the entry of ``class_names``
        selected by the argmax, ``probs`` is the softmax output moved to the
        CPU with singleton dimensions squeezed out.
    """
    per_frame = []
    for bgr in frames:
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        per_frame.append(transform(Image.fromarray(rgb)))  # (3, 224, 224)
    clip = torch.stack(per_frame)  # (T, 3, 224, 224)

    # Swap the frame and channel axes, then add the batch axis:
    # (T, 3, 224, 224) -> (3, T, 224, 224) -> (1, 3, T, 224, 224)
    clip = clip.permute(1, 0, 2, 3).unsqueeze(0)
    clip = clip.to(device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        logits = model(clip)
        probs = torch.softmax(logits, dim=-1)
        pred_class = torch.argmax(probs, dim=-1).item()

    return class_names[pred_class], probs.squeeze().cpu()



# === Main Processing ===
def process_video(video_path, output_path):
    """Label a video clip by clip and write an annotated copy.

    Frames are read from ``video_path`` in consecutive, non-overlapping groups
    of ``clip_size``. Each group is classified with ``predict_clip`` and the
    predicted label is drawn on every frame of the group before the frame is
    written to ``output_path`` (XVID codec, FPS and frame size taken from the
    input).

    A final group shorter than ``clip_size`` is padded by repeating its last
    frame, for the prediction only. Only the frames actually read are written,
    so the output contains every input frame instead of silently dropping the
    tail.

    Args:
        video_path: Path of the video to read.
        output_path: Path of the annotated video to create.

    Raises:
        IOError: If ``video_path`` cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    out = None
    try:
        # Fail loudly instead of writing an empty file and reporting success.
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'XVID'), fps, (w, h))

        def label_and_write(clip, n_real):
            """Predict on ``clip`` and write its first ``n_real`` frames, annotated."""
            label, _ = predict_clip(clip)
            print(f"Prediction: {label}")
            for f in clip[:n_real]:
                cv2.putText(f, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                out.write(f)

        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)

            if len(frames) == clip_size:
                label_and_write(frames, clip_size)
                frames = []  # Reset for the next clip_size-frame chunk

        # Leftover frames when the total is not a multiple of clip_size.
        if frames:
            n_real = len(frames)
            padded = frames + [frames[-1]] * (clip_size - n_real)
            label_and_write(padded, n_real)
    finally:
        # Release the handles even if prediction or writing raised.
        cap.release()
        if out is not None:
            out.release()

    print(f"Inference video saved to: {output_path}")

# === Example Usage ===
# Reads demo.avi from the mounted Drive and writes the annotated copy next to
# it; one "Prediction: ..." line is printed per processed clip.
process_video('/content/drive/MyDrive/demo.avi', '/content/drive/MyDrive/demo_output.avi')


Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth" to /root/.cache/torch/hub/checkpoints/jx_vit_base_p16_224-80ecf9dd.pth


Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: NonFight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction: Fight
Prediction