In [None]:
import sys
sys.path.append('../')

import cv2
import os
import multiprocessing
import math
import json

import random
import pandas as pd
import numpy as np
import torch

import albumentations as A
from albumentations.pytorch import ToTensorV2
from pathlib import Path
from tqdm.auto import tqdm
from dataclasses import dataclass
from collections import defaultdict
from torchvision import transforms
from scipy.spatial.distance import cdist
from transformers import pipeline

from sklearn.cluster import KMeans
from sklearn.metrics import (silhouette_score,
                             davies_bouldin_score)
from MulticoreTSNE import MulticoreTSNE as TSNE
from keybert import KeyBERT

from src.model import (get_model,
                       generate_video_embeddings)
from src.dataset import (VideoFrameDataset,
                         load_MSRVTT_dataset)
from src.visualization import (visualize_random_files,
                               get_numpy_images,
                               visualize_frames,
                               plot_clusters,
                               plot_count_plot)
from src.utils import (get_video_infos,
                       show_dataset_stats,
                       read_embeddings_and_aggregate)

## Config

In [None]:
@dataclass
class Config:
    """Tunable settings for the whole notebook, gathered in one place.

    Every attribute carries a type annotation so that it is a real dataclass
    field: it appears in ``repr(config)`` and can be overridden per run, e.g.
    ``Config(model_type=..., n_clusters=...)``.
    """
    # Seed for the `random`, NumPy and PyTorch generators (applied below);
    # also passed as `random_state` to KMeans and t-SNE.
    random_seed: int = 28
    # Sample-visualization settings: number of videos, grid columns and the
    # `width` argument handed to the visualization helper.
    vis_n_samples: int = 9
    vis_n_cols: int = 3
    vis_width: int = 320
    # Target frame size for the resize transform.
    img_h: int = 224
    img_w: int = 224
    # Passed as `frame_interval` to the frame dataset / embedding generator.
    frame_interval: int = 10
    # Per-channel normalization statistics. Stored as tuples because a
    # dataclass rejects mutable (list) defaults on annotated fields.
    # NOTE(review): these look like the standard CLIP preprocessing
    # statistics — confirm against the chosen model's preprocessing.
    norm_mean: tuple = (0.48145466, 0.4578275, 0.40821073)
    norm_std: tuple = (0.26862954, 0.26130258, 0.27577711)
    # Identifier handed to `get_model`.
    model_type: str = 'openclip_b32'
    batch_size: int = 128
    # Evaluated once, when the class body is executed.
    n_workers: int = multiprocessing.cpu_count()
    # First CUDA device when available, CPU otherwise.
    device: torch.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Strategy used to aggregate per-frame embeddings into one video vector.
    agg_strategy: str = 'mean'
    n_clusters: int = 20
    tsne_n_components: int = 2
    # Length bounds passed to the summarization pipeline.
    summarizer_max_length: int = 100
    summarizer_min_length: int = 20
    # Keyword extraction: videos nearest to each cluster center, minimum
    # keyword score to keep, and number of keywords requested.
    kw_k_nearest: int = 20
    kw_score_threshold: float = 0.2
    kw_top_n: int = 10

config = Config()

# Seed every random number generator the notebook may touch, not only the
# stdlib one, so that Restart & Run All gives reproducible results.
random.seed(config.random_seed)
np.random.seed(config.random_seed)
torch.manual_seed(config.random_seed)

## Load dataset

In [None]:
# Dataset was downloaded from https://cove.thecvf.com/datasets/839
# NOTE(review): absolute, machine-specific path — adjust it for your environment.
dataset_path = Path('/datasets/video_clustering/MSRVTT/')
msrvtt_dataset = load_MSRVTT_dataset(dataset_path)

# Output directory for the artifacts written later in the notebook
# (video-to-cluster CSV mapping and cluster descriptions JSON).
save_dir = dataset_path / 'results'
save_dir.mkdir(exist_ok=True)

# EDA

## Dataset analysis

### Train set

In [None]:
# Collect per-video info for the train videos; the result is indexed by video
# file name further below. `show_dataset_stats` presumably reports summary
# statistics over it — see src.utils for details.
train_video_infos = get_video_infos(msrvtt_dataset.train_video_paths)
show_dataset_stats(train_video_infos)

### Test set

In [None]:
# Same analysis as for the train set, applied to the test video paths.
test_video_infos = get_video_infos(msrvtt_dataset.test_video_paths)
show_dataset_stats(test_video_infos)

## Samples visualization

In [None]:
# Preview a sample of train videos together with their annotations; the sample
# count, grid columns and width come from the config.
visualize_random_files(
    video_paths=msrvtt_dataset.train_video_paths,
    video_annos=msrvtt_dataset.train_annos,
    n_samples=config.vis_n_samples,
    n_cols=config.vis_n_cols,
    width=config.vis_width
)

# Embeddings generation

Transform applied to each video frame

In [None]:
# Per-frame preprocessing pipeline: resize to the configured size, normalize
# with the configured per-channel mean/std, then convert to a torch tensor.
transform = A.Compose([
    A.Resize(
        height=config.img_h, 
        width=config.img_w
    ),
    A.Normalize(
        mean=config.norm_mean,
        std=config.norm_std,
    ),
    ToTensorV2()
])

Here is an example of how a video is split into individual frames at a fixed frame interval, with a transformation applied to each frame (here I applied only resizing and normalization).

In [None]:
# Pick one train video (index 28 is an arbitrary example) and look up its info
# by file name.
video_path = msrvtt_dataset.train_video_paths[28]
video_name = video_path.name
video_info = train_video_infos[video_name]
print(video_info)

# Wrap the single video as a dataset of frames sampled with the configured
# frame interval, with `transform` passed in for per-frame preprocessing.
video_dataset = VideoFrameDataset(
    video_path=video_path, 
    frame_interval=config.frame_interval, 
    transform=transform,
)
print('Single video torch dataset len: ', len(video_dataset))

# Convert the dataset's frames to numpy images for plotting.
# NOTE(review): whether `.frames` holds raw or transformed frames is not
# visible here — confirm in src.dataset.
frames_np = get_numpy_images(video_dataset.frames)

visualize_frames(
    frames_np,
    n_cols=5,
)

## Embeddings generation example

I decided to use the CLIP ViT-B-32 vision encoder from OpenCLIP to generate frame embeddings, and then aggregate the frame embeddings into a single feature vector for each video.

I generated embeddings for the whole train dataset using the **generate_embeddings.py** script, because the process takes several hours and is therefore not convenient to run in a Jupyter notebook. Here I show the embedding generation for only one video. I saved the embeddings separately for each video file, preserving temporal information, so that the frame-embedding aggregation strategy can be changed without recomputing the embeddings.

In [None]:
# Build the embedding model selected by `config.model_type`, passing the
# configured device to the factory.
model = get_model(
    model_type=config.model_type,
    device=config.device
)

In [None]:
# Demonstration only: embeddings are generated for the first train video and
# the loop exits right away (see the `break` at the end of the body).
for video_path in tqdm(msrvtt_dataset.train_video_paths):
    # Returns the frame embeddings together with the indexes of the frames
    # they were computed from.
    video_embeddings, video_frame_indexes = generate_video_embeddings(
        video_path,
        model,
        transform,
        batch_size=config.batch_size,
        n_workers=config.n_workers,
        frame_interval=config.frame_interval,
        device=config.device
    )
    print('Embeddings shape: ', video_embeddings.shape)
    print('Frame indexes shape: ', video_frame_indexes.shape) 
    break

# Load computed embeddings

Pre-calculated embeddings for videos are loaded here and then they are aggregated along the time dimension. 

In [None]:
embeddings_dir = dataset_path / 'embeddings'
# Sort the paths: glob() yields files in a filesystem-dependent order, and the
# row order of `all_embeddings` feeds KMeans and the saved CSV, so it has to be
# stable for the notebook to be reproducible.
embeddings_paths_list = sorted(embeddings_dir.glob('*.npz'))
print('Number of embeddings files: ', len(embeddings_paths_list))

# Fail early with an actionable message instead of the opaque
# "need at least one array to concatenate" raised by np.concatenate.
if not embeddings_paths_list:
    raise FileNotFoundError(
        f'No .npz embedding files found in {embeddings_dir}; '
        'run generate_embeddings.py first.'
    )

all_embeddings = []
all_video_names = []

for embeddings_file_path in tqdm(embeddings_paths_list):
    # Embedding files are named after the video they were computed from.
    video_name = embeddings_file_path.stem + '.mp4'
    # Load the per-frame embeddings and aggregate them with the configured
    # strategy.
    # NOTE(review): np.concatenate below assumes each result has a leading
    # axis of length 1 — confirm in src.utils.
    embeddings = read_embeddings_and_aggregate(
        embeddings_file_path,
        agg_strategy=config.agg_strategy
    )
    all_embeddings.append(embeddings)
    all_video_names.append(video_name)

# Row i of `all_embeddings` corresponds to `all_video_names[i]`.
all_embeddings = np.concatenate(all_embeddings)
all_video_names = np.array(all_video_names)
print('All embeddings array shape: ', all_embeddings.shape)

# Clustering

I decided to choose number of clusters equal to 20 because the paper describing the dataset says:

*"MSR-VTT provides 10K web video clips with 41.2 hours and 200K clip-sentence pairs in total, covering a comprehensive list of 20 categories and a wide variety of video content."*

However, it would be easy to run a study based on clustering metrics (silhouette score, Davies-Bouldin index, etc.) to select the optimal number of clusters.

In [None]:
# Partition the aggregated video embeddings into `config.n_clusters` groups.
kmeans = KMeans(
    n_clusters=config.n_clusters,
    random_state=config.random_seed,
    n_init="auto",
)
kmeans.fit(all_embeddings)

labels, cluster_centers = kmeans.labels_, kmeans.cluster_centers_

# Internal quality metrics of the resulting partition.
silhouette_avg = silhouette_score(all_embeddings, labels)
print('Average silhouette score: ', silhouette_avg)

davies_bouldin_avg = davies_bouldin_score(all_embeddings, labels)
print('Average Davies-Bouldin score: ', davies_bouldin_avg)

# distances[i, j] is the Euclidean distance from video i to cluster center j;
# the cluster description step uses it to find the most central videos.
distances = cdist(all_embeddings, cluster_centers, metric='euclidean')

# Persist the video -> cluster assignment.
result_dict = {
    'video_id': all_video_names,
    'cluster_id': labels,
}
df = pd.DataFrame(result_dict)
df.to_csv(save_dir / 'video_cluster_mapping.csv', index=False)

# Cluster description generation

The idea behind cluster description generation is as follows:

1) Select "kw_k_nearest" number of cluster samples nearest to the cluster center
2) Each selected video has 20 text captions, let's make only one using summarization
3) Group summarized video captions into one sentence and make keywords extraction using KeyBERT
4) Filter resulting keywords based on score threshold

In [None]:
# Summarization pipeline used to condense the captions of one video into a
# single summary; it runs on the configured device.
summarizer = pipeline(
    "summarization", 
    device=config.device,
    model="facebook/bart-large-cnn"
)

# Keyword extractor, created with KeyBERT's default settings.
kw_model = KeyBERT()

In [None]:
def geterate_cluster_tags(
    kw_model,
    summarizer,
    video_names,
    video_annotations,
    labels,
    kw_k_nearest=20,
    kw_top_n=10,
    kw_score_threshold=0.2,
    summarizer_max_length=100,
    summarizer_min_length=20,
    center_distances=None,
):
    """Describe every cluster with keywords taken from its most central videos.

    For each cluster id present in ``labels`` the function:

    1. selects up to ``kw_k_nearest`` member videos closest to the center,
    2. summarizes the captions of each selected video into one text,
    3. joins the summaries and extracts keywords / key phrases from them,
    4. keeps the keywords whose score is at least ``kw_score_threshold``.

    Args:
        kw_model: Object exposing ``extract_keywords(text, top_n=...,
            keyphrase_ngram_range=...)`` that returns ``(keyword, score)``
            pairs, e.g. a ``KeyBERT`` instance.
        summarizer: Callable invoked as ``summarizer(text, max_length=...,
            min_length=..., do_sample=False)``; the first item of its result
            must provide a ``'summary_text'`` entry.
        video_names: Sequence of video names aligned with ``labels``.
        video_annotations: Mapping from video name to its list of captions.
        labels: Cluster id of every video.
        kw_k_nearest: Number of most central videos used per cluster.
        kw_top_n: Number of keywords requested from ``kw_model``.
        kw_score_threshold: Minimum score for a keyword to be kept.
        summarizer_max_length: ``max_length`` forwarded to ``summarizer``.
        summarizer_min_length: ``min_length`` forwarded to ``summarizer``.
        center_distances: Array indexed as ``[sample, cluster_id]`` holding
            the distance of every video to every cluster center. When omitted,
            the notebook-level ``distances`` array is used, which preserves
            the behaviour existing callers rely on.

    Returns:
        dict mapping each cluster id (a plain ``int``, so the result can be
        dumped to JSON) to the list of keywords that passed the threshold.
    """
    if center_distances is None:
        # Backward-compatible fallback to the matrix computed in the
        # clustering cell; prefer passing it explicitly.
        center_distances = distances

    # Accept plain lists as well: the fancy indexing below needs arrays.
    labels = np.asarray(labels)
    video_names = np.asarray(video_names)

    cluster_keywords = {}
    # Iterate over the ids that actually occur rather than range(n): this
    # stays correct when the ids are not the contiguous range 0..n-1.
    for cluster_id in tqdm(np.unique(labels)):
        cluster_id = int(cluster_id)  # numpy integers are not valid JSON keys

        member_indices = np.where(labels == cluster_id)[0]
        member_distances = center_distances[member_indices, cluster_id]
        # Members ordered by increasing distance to their own center.
        closest_first = np.argsort(member_distances)[:kw_k_nearest]
        nearest_video_names = video_names[member_indices[closest_first]]

        video_summaries = []
        for video_name in nearest_video_names:
            # Collapse all captions of one video into a single summary.
            video_combined_caption = '\n'.join(video_annotations[video_name])
            video_summary = summarizer(
                video_combined_caption,
                max_length=summarizer_max_length,
                min_length=summarizer_min_length,
                do_sample=False
            )[0]['summary_text']
            video_summaries.append(video_summary)

        cluster_combined_caption = '\n'.join(video_summaries)
        keywords = kw_model.extract_keywords(
            cluster_combined_caption,
            top_n=kw_top_n,
            keyphrase_ngram_range=(1, 2)
        )

        cluster_keywords[cluster_id] = [
            kw for kw, kw_score in keywords if kw_score >= kw_score_threshold
        ]
        print(f'Cluster {cluster_id} keywords: ', cluster_keywords[cluster_id])

    return cluster_keywords


# Correctly spelled alias; the original (misspelled) name is kept for callers.
generate_cluster_tags = geterate_cluster_tags

In [None]:
# Captions of the train videos, looked up by video name inside the function.
video_annotations = msrvtt_dataset.train_annos

# Generate keyword descriptions for every cluster using the configured
# thresholds. Note that the function also reads the notebook-level `distances`
# array computed in the clustering cell, so that cell must be run first.
cluster_keywords = geterate_cluster_tags(
    kw_model,
    summarizer,
    video_names=all_video_names,
    video_annotations=video_annotations,
    labels=labels,
    kw_k_nearest=config.kw_k_nearest,
    kw_top_n=config.kw_top_n,
    kw_score_threshold=config.kw_score_threshold,
    summarizer_max_length=config.summarizer_max_length,
    summarizer_min_length=config.summarizer_min_length
)

In [None]:
# Persist the cluster descriptions. JSON object keys are always strings, so
# the integer cluster ids are written as "0", "1", ... in the file.
with open(save_dir / 'cluster_descriptions.json', 'w') as fp:
    json.dump(cluster_keywords, fp)

# Visualization

In order to visually assess the quality of our clusters, let's project our embeddings onto a two-dimensional space using t-SNE.

In [None]:
# Dimensionality comes from the config instead of a hard-coded literal, so the
# `tsne_n_components` setting is actually honoured.
tsne = TSNE(
    n_components=config.tsne_n_components,
    n_jobs=config.n_workers,
    random_state=config.random_seed
)

# Videos and cluster centers are projected in a single fit so that they end up
# in the same low-dimensional space.
embeddings_to_project = np.concatenate([
    all_embeddings,
    cluster_centers
])

embeddings_projected = tsne.fit_transform(embeddings_to_project)

# Split by the actual number of video rows rather than by negative slicing on
# config.n_clusters: this stays correct even if the number of centers differs
# from the configured value.
n_videos = len(all_embeddings)
embeddings_projected_videos = embeddings_projected[:n_videos, :]
embeddings_projected_cluster_centers = embeddings_projected[n_videos:, :]

print('projected embeddings shape: ', embeddings_projected_videos.shape)
print('projected cluster centers shape: ', embeddings_projected_cluster_centers.shape)

## Clusters TSNE plot

Let's visualize our 2D embedding projections

In [None]:
# Plot the projected videos and cluster centers, passing the cluster labels,
# video names, annotations and generated keywords to the plotting helper.
# NOTE(review): `save_dir` suggests the figure is also written to disk —
# confirm in src.visualization.
plot_clusters(
    embeddings_projected_videos,
    embeddings_projected_cluster_centers,
    labels,
    video_names=all_video_names,
    annotations=msrvtt_dataset.train_annos,
    show_only_first_annotation=True,
    cluster_keywords=cluster_keywords,
    save_dir=save_dir
)

## Clusters count plot

Let's also look at the distribution of the number of videos in each cluster.

In [None]:
# Show how many videos were assigned to each cluster.
plot_count_plot(labels)

# Example videos from cluster

In [None]:
# Cluster to inspect; change this value to browse another cluster.
cluster_id = 0
# Boolean mask selecting the videos assigned to `cluster_id`.
label_mask = np.array(labels) == cluster_id
cluster_video_names = all_video_names[label_mask]
# NOTE(review): assumes every video file lives under <dataset_path>/videos/all/.
cluster_video_paths = [dataset_path / 'videos/all/' / video_name for video_name in cluster_video_names]

print('Cluster keywords: ', cluster_keywords[cluster_id])

# Preview a sample of the videos belonging to the selected cluster.
visualize_random_files(
    video_paths=cluster_video_paths,
    video_annos=msrvtt_dataset.train_annos,
    n_samples=config.vis_n_samples,
    n_cols=config.vis_n_cols,
    width=config.vis_width
)