In [1]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
import numpy as np
import cv2
from numpy import random
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torchvision
# from script.video_data_loader import *
import time
import warnings
import torch
from torchvision.transforms import v2
warnings.filterwarnings("ignore")
from torchvision.transforms import InterpolationMode
# from script.anomaly_loss import SupConLoss
import torch
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
# from script.get_model import *
import torch.nn as nn
# from script.data_augmentation import *

In [2]:
from torchvision.io import read_video
import torch

# Path to your video file
video_path = r"C:\Users\wajah\Downloads\test1.mp4"

# Read video (returns frames, audio, and metadata)
video_frames, _, _ = read_video(video_path, pts_unit='sec')  # video_frames: (T, H, W, C)

# Convert to (C, T, H, W)
video_tensor = video_frames.permute(0,3,1,2).unsqueeze(0)
print(video_tensor.shape)

transforms = v2.Compose([
        v2.Resize(size=(256)),
        v2.CenterCrop(size=(224, 224)),
        v2.UniformTemporalSubsample(num_samples=64),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=[0.45,0.45,0.45], std=[0.225,0.225,0.225]),
    ])



torch.Size([1, 300, 3, 320, 568])


In [3]:
def show_images(batch_tensor, num_images=2):
    """
    Visualizes images from a 4D tensor using matplotlib.

    Args:
        batch_tensor (torch.Tensor): 4D tensor of shape (N, C, H, W).
        num_images (int): Number of images to display.
    """
    # Ensure the number of images to display does not exceed the batch size
    num_images = min(num_images, batch_tensor.shape[0])

    # Create a grid of subplots
    fig, axs = plt.subplots(1, num_images, figsize=(12, 6))

    # If only one image, wrap it in a list for uniform handling
    if num_images == 1:
        axs = [axs]

    for i in range(num_images):
        # Convert the image from (C, H, W) to (H, W, C) and normalize
        image = batch_tensor[i].permute(1, 2, 0).numpy()

        # Plot the image
        axs[i].imshow(image)
        axs[i].axis('off')  # Turn off axis
        axs[i].set_title(f"Image {i+1}")

    plt.show()

In [None]:
import torch
import torch.nn as nn
import torchvision
import numpy as np
import cv2
from torchvision.transforms.functional import to_pil_image

class MViTSlidingClassifier(nn.Module):
    def __init__(self, num_classes=2, pretrained=True):
        super(MViTSlidingClassifier, self).__init__()
        self.model = torchvision.models.video.mvit_v2_s(weights="DEFAULT" if pretrained else None)

        self.gradients = []
        self.activations = []

        # Hook for Grad-CAM
        target_layer = self.model.blocks[-1].norm1 # change scale here -1 for last sclae 8,7,7 and -2 for 8,14,14

        target_layer.register_forward_hook(self._save_activation)
        target_layer.register_full_backward_hook(self._save_gradient)

    def _save_activation(self, module, input, output):
        self.activations.append(output.detach())

    def _save_gradient(self, module, grad_input, grad_output):
        self.gradients.append(grad_output[0].detach())

    def forward(self, x, return_cam=False, target_class=None):
        B, C, T, H, W = x.shape
        assert T == 64, "Expected 64 frames"
        assert B == 1, "Grad-CAM currently supports B=1"

        self.activations.clear()
        self.gradients.clear()
        features = []

        # 4 sliding clips of 16 frames
        # here we use the 4 clips per video contaning 16 frames per clip you can change aas per your requirement 
        for i in range(0, 64, 16):
            clip = x[:, :, i:i+16]  # shape: (1, 3, 16, H, W)
            feat = self.model(clip)  # (1, 768)
            features.append(feat)

        features = torch.stack(features, dim=1)
        logits = features.mean(dim=1)
        # logits = self.classifier(combined)

        if not return_cam:
            return logits

        if target_class is None:
            target_class = logits.argmax(dim=1)

        self.zero_grad()
        logits[0, target_class.item()].backward(retain_graph=True)

        cams = []
        for grad, act in zip(self.gradients, self.activations):  # (1, 393, 768)
            weights = grad.mean(dim=1)[0]               # (768,)
            activation = act[0]                         # (393, 768)
            cam = (activation * weights).sum(dim=1)     # (393,)
            cam = cam.relu()
            cam = cam / (cam.max() + 1e-8)
            cams.append(cam.cpu())

        return logits, cams  # list of 4 CAMs (each 393, includes CLS)

In [5]:
def overlay_cam_on_frame_all(cam_list, frame_tensor, patch_shape=(8, 7, 7), alpha=0.5):
    """
    Args:
        cam_list (list of Tensors): List of 4 CAM tensors, each of shape (393,)
        frame_tensor (Tensor): (1, 3, 64, 224, 224), normalized video tensor
        patch_shape (tuple): (T, H_p, W_p) = (8, 7, 7)
        alpha (float): Blending factor

    Returns:
        list of np.ndarray: List of 32 overlaid frames (every 2nd frame from all 4 windows)
    """
    def unnormalize(tensor, mean, std):
        for t, m, s in zip(tensor, mean, std):
            t.mul_(s).add_(m)
        return tensor

    overlaid_frames = []
    frame_tensor = frame_tensor.squeeze(0)  # shape: (3, 64, H, W)
    frame_tensor = unnormalize(frame_tensor.clone(), mean=[0.45]*3, std=[0.225]*3)

    for w_idx, cam in enumerate(cam_list):  # 4 windows
        cam = cam[1:]  # Remove CLS token
        T, H_p, W_p = patch_shape
        cam_reshaped = cam.reshape(T, H_p, W_p)

        frame_start = w_idx * 16  # Start frame for this window

        for i in range(0, 16, 2):  # every 2nd frame in 16-frame window
            frame_idx = frame_start + i
            if frame_idx >= 64:
                continue  # safeguard

            frame = frame_tensor[:, frame_idx, :, :]  # (3, H, W)
            cam_resized = cv2.resize(cam_reshaped[i // 2].cpu().numpy(), (frame.shape[1], frame.shape[2]))
            cam_resized = (cam_resized - cam_resized.min()) / (cam_resized.max() + 1e-6)

            heatmap = cv2.applyColorMap(np.uint8(255 * cam_resized), cv2.COLORMAP_JET)
            heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)

            frame_np = np.array(to_pil_image(frame.cpu()))
            overlay = cv2.addWeighted(frame_np, 1 - alpha, heatmap, alpha, 0)
            overlaid_frames.append(overlay)

    return overlaid_frames


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MViTSlidingClassifier().cuda()
model.eval()

MViTSlidingClassifier(
  (model): MViT(
    (conv_proj): Conv3d(3, 96, kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3))
    (pos_encoding): PositionalEncoding()
    (blocks): ModuleList(
      (0): MultiscaleBlock(
        (norm1): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
        (norm2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
        (attn): MultiscaleAttention(
          (qkv): Linear(in_features=96, out_features=288, bias=True)
          (project): Sequential(
            (0): Linear(in_features=96, out_features=96, bias=True)
          )
          (pool_q): Pool(
            (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=96, bias=False)
            (norm_act): Sequential(
              (0): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
            )
          )
          (pool_k): Pool(
            (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 8, 8), padding=(1, 1, 1), groups=96, bias=

In [7]:
import matplotlib.pyplot as plt
import torchvision.transforms as T
import torch.nn as nn
import gc


input_tensor = torch.tensor(transforms(video_tensor)).permute(0, 2, 1, 3,4).cuda()
logits, cams = model(input_tensor, return_cam=True)

print("Predicted class:", logits.argmax(dim=1))
print("CAM shape per window:", [c.shape for c in cams])


Predicted class: tensor([20], device='cuda:0')
CAM shape per window: [torch.Size([1569]), torch.Size([1569]), torch.Size([1569]), torch.Size([1569])]


In [None]:
import imageio
import os

# Create output folder
os.makedirs("gradcam_gif", exist_ok=True)

# List to hold overlay frames
overlay_frames = []

overlay_imgs  = overlay_cam_on_frame_all(cams, input_tensor, patch_shape=(8, 14, 14)) # chnage the patch shape as per scale 
print(overlay_imgs)

# # Loop over 4 windows × 8 frames = 32 frames
# for w in range(4):
#     cam = cams[w].squeeze()       # (393,)
#     cam_wo_cls = cam[1:]          # (392,)
#     # cam_volume = cam_wo_cls.reshape(8, 7, 7)  # (8, 7, 7)
#     overlay_img = overlay_cam_on_frame(cam, input_tensor,patch_shape=(8, 14, 14))
#     for f in range(8):
#         overlay_frames.append(overlay_img[f])

for overlay_img in overlay_imgs:
    overlay_frames.append(overlay_img)


# Save as animated GIF
gif_path = "gradcam_gif/mvit_gradcam2.gif"
imageio.mimsave(gif_path, overlay_frames, fps=4)  # adjust fps as needed
print(f"✅ GIF saved at: {gif_path}")


[array([[[ 58,  54, 117],
        [ 59,  54, 118],
        [ 60,  54, 118],
        ...,
        [ 12,  12, 110],
        [ 12,  12, 110],
        [ 12,  12, 110]],

       [[ 41,  36, 100],
        [ 42,  38, 101],
        [ 44,  38, 102],
        ...,
        [ 13,  12, 110],
        [ 13,  12, 110],
        [ 13,  12, 110]],

       [[ 38,  32,  96],
        [ 37,  31,  95],
        [ 36,  30,  94],
        ...,
        [ 13,  12, 110],
        [ 13,  12, 110],
        [ 13,  12, 110]],

       ...,

       [[ 42,  43, 170],
        [ 44,  44, 172],
        [ 43,  44, 172],
        ...,
        [ 35,  74, 178],
        [ 34,  74, 178],
        [ 34,  74, 178]],

       [[ 41,  42, 168],
        [ 42,  42, 170],
        [ 42,  44, 172],
        ...,
        [ 30,  68, 174],
        [ 34,  72, 177],
        [ 35,  74, 179]],

       [[ 42,  44, 170],
        [ 42,  44, 170],
        [ 49,  52, 178],
        ...,
        [ 26,  64, 170],
        [ 30,  68, 174],
        [ 34,  71, 178]