<a href="https://colab.research.google.com/github/SanketAinapure/Transformer/blob/main/MR_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision transformers opencv-python numpy tqdm onnx onnxruntime seaborn matplotlib
!pip install simplejson
!pip install einops
!pip install timm
!pip install psutil
!pip install scikit-learn
!pip install tensorboard

Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metad

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
import cv2
import os
import numpy as np
from transformers import AutoImageProcessor, TimesformerForVideoClassification

In [None]:
class HARDataset(Dataset):
    """Video classification dataset backed by a directory tree.

    ``root_dir`` must contain one sub-directory per class. Every entry inside
    a class directory is treated as one video belonging to that class. Class
    indices follow the alphabetical order of the class directory names.

    Args:
        root_dir: Path of the directory holding the per-class sub-directories.
        processor: Callable applied to the list of RGB frames of one video;
            it is invoked as ``processor(frames, return_tensors="pt")`` and
            must return a mapping with a ``"pixel_values"`` entry.
        num_frames: Number of frames sampled from every video.
    """

    def __init__(self, root_dir, processor, num_frames=8):
        self.root_dir = root_dir
        self.processor = processor
        self.num_frames = num_frames
        # Only directories are classes. A stray file next to the class folders
        # would otherwise crash the os.listdir() call on it further down.
        self.classes = sorted(
            name
            for name in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, name))
        )
        self.video_paths = []
        self.labels = []

        for class_idx, class_name in enumerate(self.classes):
            class_path = os.path.join(root_dir, class_name)
            # os.listdir() order is filesystem dependent; sorting makes the
            # sample order (and therefore any seeded split) reproducible.
            for video_name in sorted(os.listdir(class_path)):
                video_path = os.path.join(class_path, video_name)
                self.video_paths.append(video_path)
                self.labels.append(class_idx)

    def __len__(self):
        """Return the number of videos found under ``root_dir``."""
        return len(self.video_paths)

    def load_video_frames(self, video_path):
        """Extract `num_frames` evenly spaced RGB frames from the video.

        If decoding stops early, the last decoded frame is repeated until
        `num_frames` frames are available.

        Raises:
            RuntimeError: If the video cannot be opened or not a single frame
                can be decoded from it.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            if not cap.isOpened():
                raise RuntimeError(f"Could not open video: {video_path}")

            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Clamp so that a reported frame count of 0 (or less) does not
            # produce negative frame indices.
            last_index = max(frame_count - 1, 0)
            frame_indices = np.linspace(0, last_index, self.num_frames, dtype=int)

            for idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # Convert BGR to RGB
        finally:
            # Always free the decoder handle, even when an error is raised.
            cap.release()

        if not frames:
            # Without this check the padding below would fail with an
            # uninformative IndexError on frames[-1].
            raise RuntimeError(f"No frames could be decoded from video: {video_path}")

        # If not enough frames, duplicate the last frame
        if len(frames) < self.num_frames:
            frames += [frames[-1]] * (self.num_frames - len(frames))

        return frames

    def __getitem__(self, idx):
        """Return ``(pixel_values, label)`` for the video at position ``idx``.

        ``label`` is a scalar ``torch.long`` tensor holding the class index.
        """
        video_path = self.video_paths[idx]
        label = self.labels[idx]

        frames = self.load_video_frames(video_path)
        inputs = self.processor(frames, return_tensors="pt")
        # Drop the leading batch axis added by the processor; the result is
        # expected to be (num_frames, C, H, W) -- TODO confirm against processor.
        pixel_values = inputs["pixel_values"].squeeze(0)

        return pixel_values, torch.tensor(label, dtype=torch.long)



In [None]:
from google.colab import files

# Bind the result to a name instead of leaving the call as the last expression:
# the returned dict holds the raw bytes of every uploaded file, so echoing it
# would write the Kaggle API key into the saved notebook output.
_uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Place the uploaded kaggle.json under ~/.kaggle and restrict it to
# owner-only read/write (chmod 600).
import os
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive with the Kaggle CLI into the current directory.
# NOTE(review): the recorded run shows a ~14.8 GB download.
!kaggle datasets download -d sharjeelmazhar/human-activity-recognition-video-dataset

Dataset URL: https://www.kaggle.com/datasets/sharjeelmazhar/human-activity-recognition-video-dataset
License(s): CC-BY-NC-SA-4.0
Downloading human-activity-recognition-video-dataset.zip to /content
100% 14.8G/14.8G [03:32<00:00, 113MB/s] 
100% 14.8G/14.8G [03:32<00:00, 74.8MB/s]


In [None]:
!unzip human-activity-recognition-video-dataset.zip -d ./data

Archive:  human-activity-recognition-video-dataset.zip
  inflating: ./data/Human Activity Recognition - Video Dataset/Clapping/Clapping (1).mp4  
  inflating: ./data/Human Activity Recognition - Video Dataset/Clapping/Clapping (10).mp4  
  inflating: ./data/Human Activity Recognition - Video Dataset/Clapping/Clapping (100).mp4  
  inflating: ./data/Human Activity Recognition - Video Dataset/Clapping/Clapping (101).mp4  
  inflating: ./data/Human Activity Recognition - Video Dataset/Clapping/Clapping (102).mp4  
  inflating: ./data/Human Activity Recognition - Video Dataset/Clapping/Clapping (103).mp4  
  inflating: ./data/Human Activity Recognition - Video Dataset/Clapping/Clapping (104).mp4  
  inflating: ./data/Human Activity Recognition - Video Dataset/Clapping/Clapping (105).mp4  
  inflating: ./data/Human Activity Recognition - Video Dataset/Clapping/Clapping (106).mp4  
  inflating: ./data/Human Activity Recognition - Video Dataset/Clapping/Clapping (107).mp4  
  inflating: ./dat

In [None]:
dataset_path = "/content/data/Human Activity Recognition - Video Dataset"
processor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k400")

# Create dataset
full_dataset = HARDataset(dataset_path, processor, num_frames=8)

# Split dataset (70% train, 30% test).
# A dedicated, seeded generator makes the split identical on every run, so
# reported test accuracies are comparable across kernel restarts.
split_seed = 42
train_size = int(0.7 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Create DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

print(f"Dataset Loaded: {len(train_dataset)} Train, {len(test_dataset)} Test")

Dataset Loaded: 779 Train, 334 Test


In [None]:
# Load the Kinetics-400 checkpoint and replace its 400-way classifier head
# with one sized for this dataset; ignore_mismatched_sizes lets the head be
# re-initialised instead of raising on the shape mismatch.
model = TimesformerForVideoClassification.from_pretrained(
    "facebook/timesformer-base-finetuned-k400",
    num_labels=len(full_dataset.classes), ignore_mismatched_sizes=True
)

# Send model to GPU if available.
# Assigning the result (instead of leaving `model.to(device)` as the last
# expression) keeps the full module repr out of the cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)



config.json:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/486M [00:00<?, ?B/s]

Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TimesformerForVideoClassification(
  (timesformer): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Mixed precision is only enabled when the model actually lives on a CUDA
# device; on CPU both the scaler and autocast become no-ops.
use_amp = device.type == "cuda"
scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

num_epochs = 3  # Change for longer training

best_accuracy = 0.0  # Track the best accuracy

# Per-epoch history consumed by the plotting cells further down.
train_losses = []
test_accuracies = []
# Predictions / ground truth of the most recent evaluation pass.
all_preds = []
all_labels = []

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0

    for batch_idx, (pixel_values, labels) in enumerate(train_loader):
        pixel_values, labels = pixel_values.to(device), labels.to(device)

        optimizer.zero_grad()

        with torch.amp.autocast(device.type, enabled=use_amp):  # Mixed precision forward pass
            outputs = model(pixel_values)
            loss = criterion(outputs.logits, labels)

        # The scaler handles loss scaling, the optimizer step and its own
        # scale update; with enabled=False it simply forwards the calls.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        if batch_idx % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"Epoch [{epoch+1}/{num_epochs}] Completed - Avg Loss: {avg_train_loss:.4f}")

    # ---- evaluation pass ----
    model.eval()
    correct = 0
    total = 0
    epoch_preds = []
    epoch_labels = []

    with torch.no_grad():
        for pixel_values, labels in test_loader:
            pixel_values, labels = pixel_values.to(device), labels.to(device)

            outputs = model(pixel_values)
            _, predicted = torch.max(outputs.logits, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            epoch_preds.extend(predicted.cpu().tolist())
            epoch_labels.extend(labels.cpu().tolist())

    accuracy = 100 * correct / total
    test_accuracies.append(accuracy)
    # Keep only the latest epoch so the confusion matrix / per-class metrics
    # describe the model that is in memory once training finishes.
    all_preds, all_labels = epoch_preds, epoch_labels
    print(f"Test Accuracy: {accuracy:.2f}%")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), "best_timesformer_har.pth")
        print(f"New Best Model Saved! Accuracy: {best_accuracy:.2f}%")



Epoch [1/3], Batch [0/195], Loss: 1.8091
Epoch [1/3], Batch [10/195], Loss: 2.1582
Epoch [1/3], Batch [20/195], Loss: 2.0322
Epoch [1/3], Batch [30/195], Loss: 0.7627
Epoch [1/3], Batch [40/195], Loss: 0.6760
Epoch [1/3], Batch [50/195], Loss: 1.0571
Epoch [1/3], Batch [60/195], Loss: 0.8249
Epoch [1/3], Batch [70/195], Loss: 0.5592
Epoch [1/3], Batch [80/195], Loss: 0.5881
Epoch [1/3], Batch [90/195], Loss: 0.3520
Epoch [1/3], Batch [100/195], Loss: 0.6747
Epoch [1/3], Batch [110/195], Loss: 0.2517
Epoch [1/3], Batch [120/195], Loss: 0.2968
Epoch [1/3], Batch [130/195], Loss: 0.1834
Epoch [1/3], Batch [140/195], Loss: 0.0708
Epoch [1/3], Batch [150/195], Loss: 0.1503
Epoch [1/3], Batch [160/195], Loss: 0.0921
Epoch [1/3], Batch [170/195], Loss: 0.1650
Epoch [1/3], Batch [180/195], Loss: 0.0769
Epoch [1/3], Batch [190/195], Loss: 0.1080
Epoch [1/3] Completed - Avg Loss: 0.6572
Test Accuracy: 99.10%
New Best Model Saved! Accuracy: 99.10%
Epoch [2/3], Batch [0/195], Loss: 0.0965
Epoch [2

In [None]:
# Score the model currently held in memory on the held-out split.
model.eval()
num_correct, num_seen = 0, 0

with torch.no_grad():
    for clips, targets in test_loader:
        clips = clips.to(device)
        targets = targets.to(device)

        # The index of the largest logit per sample is the predicted class.
        predicted = torch.max(model(clips).logits, 1).indices

        num_seen += targets.size(0)
        num_correct += (predicted == targets).sum().item()

accuracy = 100 * num_correct / num_seen
print(f"Test Accuracy: {accuracy:.2f}%")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Two panels side by side: training loss (left) and test accuracy (right).
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Training loss curve.
# The x positions come from the recorded history rather than num_epochs, so
# the plot still works when training was interrupted or num_epochs was edited
# after the run (x and y must have the same length).
loss_ax.plot(range(1, len(train_losses) + 1), train_losses, marker='o', label="train loss")
loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training loss v/s epochs")
loss_ax.legend()

# Test accuracy curve.
acc_ax.plot(range(1, len(test_accuracies) + 1), test_accuracies, marker='s', label="Test Accuracy")
acc_ax.set_xlabel("Epochs")
acc_ax.set_ylabel("Accuracy percentage")
acc_ax.set_title("Test Accuracy v/s epochs")
acc_ax.legend()
acc_ax.grid()

fig.tight_layout()  # keep the axis labels of the two panels from overlapping
plt.show()

In [None]:
# Confusion matrix of the collected test predictions.

class_names = full_dataset.classes
# Passing the full label list pins the matrix to len(class_names) rows and
# columns, so it always lines up with the tick labels even if some class never
# occurs in all_labels / all_preds.
cm = confusion_matrix(all_labels, all_preds, labels=list(range(len(class_names))))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d",cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")  # was misspelled `ylable`, which raised AttributeError
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Precision, Recall, F1 score per class

class_names = full_dataset.classes
# `labels` forces one score per class (in class-index order) so the arrays
# always match len(class_names); `zero_division=0` reports 0 without a warning
# for a class that has no predicted or no true samples.
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels,
    all_preds,
    labels=list(range(len(class_names))),
    average=None,
    zero_division=0,
)

plt.figure(figsize=(10, 5))
x = range(len(class_names))
# Three bars per class, each 0.3 wide and shifted by one bar width.
plt.bar(x, precision, width=0.3, label="Precision")
plt.bar([i + 0.3 for i in x], recall, width=0.3, label="Recall")
plt.bar([i + 0.6 for i in x], f1, width=0.3, label="F1-Score")
# Centre each tick under the middle (recall) bar of its group.
plt.xticks([i + 0.3 for i in x], class_names, rotation=45)
plt.ylabel("Score")
plt.title("Precision, Recall, F1-Score Per Class")
plt.legend()
plt.show()

In [None]:
# Persist the weights that are currently held in memory.
final_weights_path = "timesformer_har.pth"
torch.save(model.state_dict(), final_weights_path)
print("Model saved successfully!")


Model saved successfully!


In [None]:
import os
import torch.onnx

# The exported file is named "best_...", so export the best checkpoint written
# by the training loop rather than whatever weights happen to be in memory
# (after training those are the last-epoch weights).
best_weights_path = "best_timesformer_har.pth"
if os.path.exists(best_weights_path):
    model.load_state_dict(torch.load(best_weights_path, map_location=device))
else:
    print(f"Warning: {best_weights_path} not found, exporting the in-memory weights instead.")

# Export in inference mode so train-only behaviour is switched off in the graph.
model.eval()

# Dummy input with correct shape (batch_size=1, num_frames=8, C=3, H=224, W=224)
dummy_input = torch.randn(1, 8, 3, 224, 224).to(device)

# Export model to ONNX
torch.onnx.export(
    model,
    dummy_input,
    "best_timesformer_har.onnx",
    export_params=True,
    opset_version=12,
    do_constant_folding=True,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={           # Allow variable batch size
        "input": {0: "batch_size"},
        "output": {0: "batch_size"}
    }
)

print("Model exported as ONNX successfully!")

  if embeddings.size(1) != self.position_embeddings.size(1):
  if num_frames != self.time_embeddings.size(1):


Model exported as ONNX successfully!
