In [1]:
%matplotlib inline

import math
import json
import glob

import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Allow processing of videos and of images with different input sizes

In [7]:
# Load a pre-trained ViT model
size = "small"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Maps a human-readable model size to the suffix of the DINOv2 hub entry point
backbones = {
    "small": "vits14_reg",
    "base": "vitb14_reg",
    "large": "vitl14_reg",
    "giant": "vitg14_reg",
}

# We will use this encoder in the rest of the notebook
encoder = torch.hub.load(repo_or_dir="facebookresearch/dinov2", model=f"dinov2_{backbones[size]}")
# The notebook only runs inference, so put the module in eval mode explicitly
# instead of relying on whatever mode the hub entry point returns it in.
encoder = encoder.to(device).eval()

Using cache found in /home/rob/.cache/torch/hub/facebookresearch_dinov2_main


In [None]:
# Load an example video
def extract_frames(video_path) -> list[np.ndarray]:
    """Decode every frame of a video into RGB arrays.

    Args:
        video_path: Location of the video, handed to ``cv2.VideoCapture``.

    Returns:
        The frames in decoding order, each converted from BGR to RGB.
        The list is empty when no frame could be read.
    """
    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = video.read()
            if not ret:
                # No further frame available (end of stream or unreadable input)
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture handle even when reading/conversion raises
        video.release()
    return frames

def closest_multiple(val, multiple):
    """Round ``val`` to the nearest multiple of ``multiple`` and return it as an int.

    Rounding uses Python's built-in ``round``, so exact halves go to the
    even step count (e.g. 1.5 steps -> 2, 2.5 steps -> 2).
    """
    step_count = round(val / multiple)
    return int(step_count * multiple)
    
# Preprocess all the input image(s)
def transform(img, patch_size):
    """Resize, zero-pad and normalize one image for the ViT encoder.

    Args:
        img: Image array of shape (height, width, channels). The
            normalization constants below have three entries, so three
            channels with values on the 0-255 scale are expected.
        patch_size: Patch size of the ViT; both sides of the image are
            resized to a multiple of it.

    Returns:
        A tuple ``(image, (div_height, div_width))`` where ``image`` is the
        square, normalized, channel-first array and the second item is the
        size of the resized image *before* square padding was added.
    """
    height, width, _ = img.shape

    # Conform to the patch size of the ViT. Rounding a side shorter than half
    # a patch would give 0, which cv2.resize rejects, so keep at least one patch.
    div_height = max(patch_size, closest_multiple(height, patch_size))
    div_width = max(patch_size, closest_multiple(width, patch_size))
    img = cv2.resize(img, (div_width, div_height))

    # Zero-pad the shorter side (split over both ends) to get a square input
    max_dim = max(div_height, div_width)
    top = (max_dim - div_height) // 2
    bot = max_dim - div_height - top
    left = (max_dim - div_width) // 2
    right = max_dim - div_width - left
    img = cv2.copyMakeBorder(img, top, bot, left, right, cv2.BORDER_CONSTANT, value=[0, 0, 0])

    # Normalize the input. The mean/std constants are expressed on a [0, 1]
    # scale, so the pixels must be rescaled from 0-255 *before* they are
    # applied (the previous order subtracted them from raw 0-255 values).
    img = img / 255.0
    img = (img - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])

    # Move the channel dim to the front: (H, W, C) -> (C, H, W)
    img = img.transpose(2, 0, 1)
    return img, (div_height, div_width)
    
file_path = "koala.mp4"
frames = extract_frames(file_path)
if not frames:
    # extract_frames returns an empty list when nothing could be decoded;
    # fail here with a clear message instead of an IndexError further down.
    raise RuntimeError(f"No frames could be read from {file_path!r}")

original_height, original_width, _ = frames[0].shape
print(original_height, original_width)

# Preprocess all the frames (each frame is transformed exactly once)
transformed = [transform(frame, encoder.patch_size) for frame in frames]
processed_frames = [image for image, _ in transformed]

# To resize the patches with: size of the first frame before square padding
frame_dim = transformed[0][1]
frame_dim

720 1280


In [None]:
def visualize_features(features, encoder, frame_dim, seperate_foreground, threshold=0.5):
    """Project ViT patch features onto three PCA components for RGB display.

    Args:
        features: Patch-token features. They are reshaped to
            ``(1, patches, patches, -1)``, so a single image on a square
            patch grid is expected.
        encoder: Model whose ``patch_size`` converts pixel sizes to patches.
        frame_dim: ``(height, width)`` of the resized image before it was
            padded to a square (second return value of ``transform``).
        seperate_foreground: When true, patches whose first (min-max scaled)
            PCA component exceeds ``threshold`` are painted black and PCA is
            refitted on the remaining patches only.
        threshold: Cut-off in [0, 1] used for that split. The original code
            read an undefined global here; 0.5 is only a neutral default and
            may need tuning per input (and the sign of a PCA component is
            arbitrary, so the "background" side can flip).

    Returns:
        Array of shape ``(rows, cols, 3)`` with values in [0, 1], where
        ``rows``/``cols`` is the patch grid after cropping the padding.
    """
    patches = math.isqrt(features.shape[1])
    features = features.reshape(1, patches, patches, -1)

    # Crop the feature patches if the input was not square
    start_x, end_x = 0, patches
    start_y, end_y = 0, patches
    if frame_dim[0] > frame_dim[1]:
        # Taller than wide: transform padded left/right, so crop columns (x)
        excess_patches = (frame_dim[0] - frame_dim[1]) / encoder.patch_size
        excess_patch = math.ceil(excess_patches / 2)
        start_x, end_x = excess_patch, patches - excess_patch
    elif frame_dim[0] < frame_dim[1]:
        # Wider than tall: transform padded top/bottom, so crop rows (y)
        excess_patches = (frame_dim[1] - frame_dim[0]) / encoder.patch_size
        excess_patch = math.ceil(excess_patches / 2)
        start_y, end_y = excess_patch, patches - excess_patch
    cropped_features = features[:, start_y:end_y, start_x:end_x]

    # Continue with the cropped features, one row per patch
    rows, cols = end_y - start_y, end_x - start_x
    patch_features = cropped_features.reshape(rows * cols, -1)

    # Apply PCA and MinMaxScaler
    pca = PCA(n_components=3)
    scaler = MinMaxScaler(clip=True)
    pca_features = scaler.fit_transform(pca.fit_transform(patch_features))

    # Separate background and foreground
    if seperate_foreground:
        pca_background = pca_features[:, 0] > threshold
        pca_foreground = ~pca_background

        # Background patches stay at 0 (black)
        pca_features = np.zeros((rows * cols, 3))

        # Refit PCA for foreground. PCA with 3 components needs at least
        # 3 samples; with fewer foreground patches everything stays black.
        if np.count_nonzero(pca_foreground) >= 3:
            foreground_features = patch_features[pca_foreground]
            pca_features_rem = scaler.fit_transform(pca.fit_transform(foreground_features))
            pca_features[pca_foreground] = pca_features_rem

    # Convert the PCA features to an image
    return pca_features.reshape(rows, cols, 3)

def visualize_frames(
    frames,
    encoder,
    image_dim=(),
    frame_dim=(),
    seperate_foreground=True
):
    """Turn preprocessed frames into PCA feature visualizations.

    Runs on the notebook-level ``device``.

    Args:
        frames: Iterable of preprocessed, channel-first frames (first return
            value of ``transform``).
        encoder: ViT exposing ``forward_features``; its result must contain
            the key ``"x_norm_patchtokens"``.
        image_dim: ``(height, width)`` every visualization is resized to.
            When left empty the visualization keeps its patch-grid size.
        frame_dim: ``(height, width)`` of the frame before square padding,
            used to crop the padded patches. When left empty the frame's own
            spatial size is used, i.e. nothing is cropped.
        seperate_foreground: Forwarded to ``visualize_features``.

    Returns:
        List of ``uint8`` arrays, one RGB visualization per input frame.
    """
    output_frames = []

    # Inference only: without this, autograd would record a graph for
    # every frame that is never used.
    with torch.no_grad():
        for frame in frames:
            # Cast to float32 before the transfer so less data is copied
            x = torch.as_tensor(frame, dtype=torch.float32).unsqueeze(0).to(device)

            # Extract features using the ViT model
            features_dict = encoder.forward_features(x)
            features = features_dict["x_norm_patchtokens"].cpu().numpy()

            crop_dim = frame_dim if len(frame_dim) else tuple(x.shape[-2:])
            feature_frame = visualize_features(
                features,
                encoder,
                crop_dim,
                seperate_foreground
            )

            # Scale [0, 1] features to 8-bit colour values
            feature_frame = (feature_frame * 255).astype(np.uint8)

            # Reshape to the original frame size
            if len(image_dim):
                # Important, cv2 expects (width, height), not (height, width)
                height, width = image_dim
                feature_frame = cv2.resize(feature_frame, (width, height))
            output_frames.append(feature_frame)
    return output_frames

# Visualize every preprocessed frame, resized back to the source resolution
output_frames = visualize_frames(
    frames=processed_frames,
    encoder=encoder,
    image_dim=(original_height, original_width),
    frame_dim=frame_dim,
    seperate_foreground=False,
)

In [None]:
# Now convert the processed frames to video
# For this I used an extra package (moviepy).
# Import from the submodule path: the `moviepy.editor` namespace was removed
# in moviepy 2.x, while this location is available in 1.x and 2.x alike.
from moviepy.video.io.ImageSequenceClip import ImageSequenceClip

clip = ImageSequenceClip(output_frames, fps=20)
clip.write_videofile('output.mp4')