In [72]:
import glob
import math

# warnings.simplefilter(action='ignore')
import os
import shutil
# import warnings
from pathlib import Path
from typing import Any, List, Optional, Tuple, Union
from uuid import uuid4

# import av
import cv2
import librosa
import numpy as np
import torch
from moviepy import editor
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from PIL import Image
from pyAudioAnalysis import MidTermFeatures as mtf
from pyAudioAnalysis import audioTrainTest as at
from pydub import AudioSegment, silence
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics.pairwise import cosine_distances
from sklearn.neighbors import LocalOutlierFactor
from torchvision import models, transforms
from yt_dlp import YoutubeDL

# from pytorchvideo.data.encoded_video import EncodedVideo


# Fix NumPy's global random seed so NumPy-based randomness is repeatable across runs.
np.random.seed(0)

In [73]:
# Directory the downloaded videos are written to and later read from.
save_videos_to = "data"
# Length, in seconds, of each segment that is scored as a highlight candidate.
window = 10
# Directory the generated summary videos are written to.
summary_output = "videos_summary"
# Number of top-scoring segments kept for each summary.
num_highlights = 10

## **Helper Functions**

In [74]:
class ImageHighlightsFinder:
    """Score sampled video frames by how much they differ from the other frames.

    One frame is sampled every ``window`` seconds, embedded with a pretrained
    ResNet-18 whose last child module is removed, and scored with the median
    cosine distance between its embedding and the embeddings of all sampled
    frames.
    """

    def __init__(self, batch_size: int = 32) -> None:
        """Load the feature extractor on the GPU when available, else on the CPU.

        Args:
            batch_size: Number of frames embedded per forward pass.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.batch_size = batch_size
        model = models.resnet18(pretrained=True)
        # Keep every child module except the last one, so the network returns
        # features instead of class scores.
        self.model = torch.nn.Sequential(*(list(model.children())[:-1])).to(self.device)
        self.model.eval()

    def _get_transformations(self, will_be_saved: bool) -> List[Any]:
        """Return the preprocessing steps applied to each frame.

        Args:
            will_be_saved: When True, a final step converts the normalized
                tensor back into a PIL image.
        """
        transformations = [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
        if will_be_saved:
            transformations.append(transforms.ToPILImage())
        return transformations

    def _preprocess_image(
        self, image: "Image.Image", will_be_saved: bool = False
    ) -> Union[torch.Tensor, "Image.Image"]:
        """Preprocess one frame.

        Returns:
            A tensor with a leading batch dimension of 1, or, when
            ``will_be_saved`` is True, the transformed PIL image.
        """
        transform = transforms.Compose(
            self._get_transformations(will_be_saved=will_be_saved)
        )
        image = transform(image)
        if will_be_saved:
            return image
        return image.unsqueeze(0)

    def _chunks(self, lst, n):
        """
        Yield successive n-sized chunks from lst.
        """
        for i in range(0, len(lst), n):
            yield lst[i : i + n]

    @staticmethod
    def _window_label(frame_path: str, start_at_sec: int, window: int) -> str:
        """Return the ``"start_end"`` label of the window a frame file belongs to.

        Frame files are named ``sec_<second>.jpg`` by ``_extract_frames``; the
        window index is recovered from that second.
        """
        stem = os.path.splitext(os.path.basename(frame_path))[0]
        second = int(float(stem.split("_", 1)[1]))
        window_idx = (second - start_at_sec) // window
        return f"{window_idx * window}_{(window_idx + 1) * window}"

    def _create_feature_vectors(self, file_paths: List[str]) -> np.ndarray:
        """Embed the given image files, one row per file, in the given order.

        Raises:
            ValueError: If ``file_paths`` is empty.
        """
        if not file_paths:
            raise ValueError("No frames were provided for feature extraction.")

        batches = []
        for file_paths_chunk in self._chunks(file_paths, n=self.batch_size):
            tensors = []
            for path in file_paths_chunk:
                # Close each file as soon as its pixels have been converted.
                with Image.open(path) as img:
                    tensors.append(self._preprocess_image(img.convert("RGB")))
            batch = torch.cat(tensors, dim=0).to(self.device)

            # Convert them to features.
            with torch.no_grad():
                batches.append(self.model(batch))

        # Concatenate once instead of re-copying the accumulated tensor per batch.
        features = torch.cat(batches, dim=0)
        # Drop the two trailing singleton dimensions (no-op if they are not size 1).
        features = features.squeeze(dim=-1).squeeze(dim=-1)
        return features.cpu().detach().numpy()

    def _extract_frames(
        self,
        video_path: str,
        images_folder: str,
        start_at_sec: int = 5,
        window: int = 10,
    ):
        """Save one frame every ``window`` seconds as ``sec_<second>.jpg``.

        Sampling starts at ``start_at_sec`` and stops at the first position
        where no frame can be read.

        Raises:
            ValueError: If ``window`` is not positive (the position would never advance).
        """
        if window <= 0:
            raise ValueError("window must be a positive number of seconds.")

        os.makedirs(images_folder, exist_ok=True)
        vidcap = cv2.VideoCapture(video_path)
        try:
            while True:
                # Seek to the sampling position: one frame per window.
                vidcap.set(cv2.CAP_PROP_POS_MSEC, (start_at_sec * 1000))
                success, image = vidcap.read()
                if not success:
                    break
                cv2.imwrite(
                    os.path.join(images_folder, f"sec_{start_at_sec}.jpg"), image
                )  # save frame as JPEG file.
                start_at_sec += window
        finally:
            # Always free the decoder, even when reading or writing fails.
            vidcap.release()

    def get_distances(
        self,
        video_path: str,
        temp_path: str,
        window: int = 10,
    ) -> np.ndarray:
        """Return one visual distinctiveness score per sampled frame.

        Scores are ordered by the ``"start_end"`` label of each frame's window,
        sorted as strings. This is the order used for the audio segment files
        and for the timestamp lookup in ``HighlightsFinder``, so index ``i``
        refers to the same window in every place.

        NOTE(review): a frame may also be sampled from a trailing partial
        window, which would give one more score here than there are full
        windows -- confirm against the audio segmentation.

        Args:
            video_path: Path of the video to analyze.
            temp_path: Folder under which the temporary ``image`` folder is created.
            window: Seconds between sampled frames; the first frame is taken
                at half a window.

        Returns:
            Median cosine distance of each frame to all sampled frames.

        Raises:
            ValueError: If no frame could be extracted from the video.
        """
        temp_file_path = os.path.join(temp_path, "image")
        start_at_sec = int(window * 0.5)
        try:
            self._extract_frames(
                video_path=video_path,
                images_folder=temp_file_path,
                start_at_sec=start_at_sec,
                window=window,
            )
            # glob returns files in arbitrary order; impose the shared ordering.
            file_paths = sorted(
                glob.glob(os.path.join(temp_file_path, "*.jpg")),
                key=lambda p: self._window_label(p, start_at_sec, window),
            )
            features = self._create_feature_vectors(file_paths=file_paths)
            distances = cosine_distances(features, features)
            del features
            median_distances = np.median(distances, axis=1)
            del distances
            assert median_distances.shape[0] == len(file_paths)
            return median_distances
        finally:
            # Remove the extracted frames whether or not scoring succeeded.
            shutil.rmtree(temp_file_path, ignore_errors=True)

In [75]:
class AudioHighlightsFinder:
    """Score fixed-length audio segments by how much they differ from the rest.

    The audio track is exported to mp3, cut into ``window``-second segments,
    described with pyAudioAnalysis mid-term features, and each segment is
    scored with its median cosine distance to all segments. Segments that are
    mostly silent get a score of 0.
    """

    def __init__(self) -> None:
        pass

    def _audio_seg(
        self, path: str, output_path: str, window: int = 10
    ) -> Tuple[List[int], List[str]]:
        """Cut an mp3 file into ``window``-second segments and flag the silent ones.

        Only complete windows are exported; each is written to ``output_path``
        as ``sec_<start>_<end>.mp3``.

        Returns:
            ``(is_silence, file_names)`` in chronological order, where
            ``is_silence[i]`` is 1 when at least half of segment ``i`` is
            silent and 0 otherwise.
        """
        is_silence = []
        file_names = []
        os.makedirs(output_path, exist_ok=True)
        duration = librosa.get_duration(path=path)
        song = AudioSegment.from_file(path, format="mp3")
        window_ms = 1000 * window
        for j in range(1, math.floor(duration / window) + 1):
            start_ = (j - 1) * window_ms
            end_ = j * window_ms
            seg_ = song[start_:end_]
            seg_name = f"sec_{int(start_ / 1000)}_{int(end_ / 1000)}.mp3"
            seg_.export(
                os.path.join(output_path, seg_name),
                format="mp3",
            )
            file_names.append(seg_name)
            # Silence is measured relative to the segment's own loudness.
            silence_results = silence.detect_silence(
                seg_, min_silence_len=1000, silence_thresh=seg_.dBFS - 16
            )
            silent_secs = np.sum(
                [(stop - start) / 1000 for start, stop in silence_results]
            )
            is_silence.append(1 if silent_secs >= 0.5 * window else 0)
        return is_silence, file_names

    @staticmethod
    def _standardize(x: np.ndarray) -> np.ndarray:
        """Scale each column to zero mean and unit standard deviation.

        Constant columns have a standard deviation of 0; they are mapped to 0
        rather than divided by zero, which would fill them with NaN.
        """
        x = np.asarray(x, dtype=float)
        mean = x.mean(axis=0)
        std = np.std(x, axis=0)
        safe_std = np.where(std == 0, 1.0, std)
        return (x - mean) / safe_std

    def _feature_extraction(self, directory: str) -> Tuple[np.ndarray, List[str]]:
        """Extract standardized pyAudioAnalysis features for the files in ``directory``.

        Returns:
            ``(X, feature_names)`` with one standardized row per processed file.
        """
        f1, _, feature_names = mtf.directory_feature_extraction(
            directory, 1, 1, 0.1, 0.1
        )
        mid_term_features = [f1]
        # convert list of feature matrices to x, y format:
        x, _ = at.features_to_matrix(mid_term_features)
        return self._standardize(x), feature_names

    def _feature_selection(
        self, X: np.ndarray, feature_names: List[str]
    ) -> Tuple[np.ndarray, List[str]]:
        """Drop the features that do not vary across segments.

        Returns:
            ``(X_selected, selected_feature_names)``.
        """
        # X is standardized, so every non-constant column has a variance of
        # about 1. Cutting at 1 would keep or drop columns based on rounding
        # error alone; cutting at 0 removes exactly the constant columns.
        selector = VarianceThreshold(threshold=0.0)
        X_selected = selector.fit_transform(X)
        selected_feature_indices = selector.get_support(indices=True)
        selected_feature_names = [feature_names[i] for i in selected_feature_indices]
        return X_selected, selected_feature_names

    def _outlier_detection(self, X: np.ndarray, num_high: int):
        """Return the indices of the ``num_high`` rows with the lowest
        ``negative_outlier_factor_`` from a cosine-metric Local Outlier Factor."""
        clf = LocalOutlierFactor(n_neighbors=20, metric="cosine")
        clf.fit(X)
        outlier_scores = clf.negative_outlier_factor_
        sorted_indices = np.argsort(outlier_scores)
        return sorted_indices[:num_high]

    def get_distances(
        self,
        video_path: str,
        temp_path: str,
        window: int = 10,
    ) -> np.ndarray:
        """Return one audio distinctiveness score per complete window.

        Scores follow the segment file names sorted as strings.

        Args:
            video_path: Path of the video whose audio track is analyzed.
            temp_path: Folder under which the temporary ``audio`` folder is created.
            window: Segment length in seconds.

        Returns:
            Median cosine distance of each segment to all segments, with
            mostly-silent segments set to 0.

        Raises:
            ValueError: If the video has no audio track, or if the number of
                feature rows differs from the number of exported segments.
        """
        temp_file_path = os.path.join(temp_path, "audio")
        # The full track lives in a sub-folder so it is kept apart from the segments.
        audio_path = os.path.join(temp_file_path, "all_audio")
        audio_file = os.path.join(audio_path, "audio.mp3")
        try:
            os.makedirs(audio_path, exist_ok=True)
            video = editor.VideoFileClip(video_path)
            try:
                if video.audio is None:
                    raise ValueError(f"Video has no audio track: {video_path}")
                video.audio.write_audiofile(audio_file)
            finally:
                # Release the readers held by the clip.
                video.close()

            is_silence, file_names = self._audio_seg(
                path=audio_file,
                output_path=temp_file_path,
                window=window,
            )
            # Make sure that we sort the results in the same way as they will get processed.
            is_silence = [
                flag
                for _, flag in sorted(
                    zip(file_names, is_silence), key=lambda pair: pair[0]
                )
            ]

            X, feature_names = self._feature_extraction(directory=temp_file_path)
            X, feature_names = self._feature_selection(X=X, feature_names=feature_names)
            if X.shape[0] != len(is_silence):
                raise ValueError(
                    "Number of feature rows does not match the number of audio segments."
                )

            distances = cosine_distances(X, X)
            del X
            median_distances = np.median(distances, axis=1)
            # Make distance equal to 0 for the silences to be ignored.
            median_distances = np.where(np.array(is_silence) == 1, 0, median_distances)
            return median_distances
        finally:
            # Remove the exported audio whether or not scoring succeeded.
            shutil.rmtree(temp_file_path, ignore_errors=True)

In [76]:
class HighlightsFinder:
    """Build a summary video from the windows with the highest combined
    image and audio distinctiveness scores."""

    def __init__(self, batch_size: int = 32) -> None:
        """Create the audio and image finders.

        Args:
            batch_size: Batch size forwarded to the image finder.
        """
        self.ahf = AudioHighlightsFinder()
        self.ihf = ImageHighlightsFinder(batch_size=batch_size)

    def _str_to_int_tuple(self, s: str) -> Tuple[int, int]:
        """Parse a ``"start_end"`` label into ``(start, end)`` integers."""
        start, end = s.split("_")
        return int(start), int(end)

    def _window_labels(self, num_windows: int, window: int) -> List[str]:
        """Return the ``"start_end"`` labels of the first ``num_windows`` windows,
        sorted as strings (the order in which the segment files are processed)."""
        return sorted(
            f"{(j - 1) * window}_{j * window}" for j in range(1, num_windows + 1)
        )

    def _merge_timestamps(self, timestamps: List[Tuple]) -> List[Tuple]:
        """Merge ``(start, end)`` spans that touch or overlap.

        The input list is left unmodified.

        Returns:
            Merged spans sorted by start time.
        """
        merged_timestamps = []
        for start, end in sorted(timestamps, key=lambda x: x[0]):
            if merged_timestamps and start <= merged_timestamps[-1][1]:
                # Extend the previous span.
                previous_start, previous_end = merged_timestamps[-1]
                merged_timestamps[-1] = (previous_start, max(previous_end, end))
            else:
                merged_timestamps.append((start, end))
        return merged_timestamps

    def _convert_str_to_timestamps(self, highlights: List[str]) -> List[Tuple]:
        """Convert ``"start_end"`` labels into merged ``(start, end)`` spans."""
        timestamps = [self._str_to_int_tuple(s=timestamp) for timestamp in highlights]
        return self._merge_timestamps(timestamps=timestamps)

    def create_video_summary(
        self, video_path: str, summary_output: str, num_highlights: int, window: int
    ):
        """Write ``<video name>_summary.mp4`` with the top-scoring windows.

        Both finders are expected to return one score per window, ordered like
        the ``"start_end"`` labels sorted as strings.

        Args:
            video_path: Path of the video to summarize.
            summary_output: Folder the summary is written to (created if missing).
            num_highlights: Number of windows to keep before adjacent ones are merged.
            window: Window length in seconds.

        Raises:
            ValueError: If ``num_highlights`` is not positive.
        """
        # A slice of [-0:] would select every window, so reject 0 up front.
        if num_highlights <= 0:
            raise ValueError("num_highlights must be a positive integer.")

        temp_file_dir = "temp"
        temp_file_path = os.path.join(temp_file_dir, str(uuid4()))
        try:
            # Get the distances from each modality.
            image_distances = self.ihf.get_distances(
                video_path=video_path, temp_path=temp_file_path, window=window
            )
            audio_distances = self.ahf.get_distances(
                video_path=video_path, temp_path=temp_file_path, window=window
            )
            assert image_distances.shape[0] == audio_distances.shape[0], (
                "image and audio finders returned a different number of windows"
            )

            # TODO: Add weight to each modality.
            distances = np.add(image_distances, audio_distances)

            # Get the idx of the segments with the greater distance.
            idx = np.argsort(distances)[-num_highlights:]

            # Build one label per score so an index can never fall outside the labels.
            labels = self._window_labels(num_windows=len(distances), window=window)
            highlights = [labels[i] for i in idx]
            timestamps = self._convert_str_to_timestamps(highlights=highlights)

            # Create the summary video.
            video = editor.VideoFileClip(video_path)
            try:
                clips = [
                    video.subclip(start_time, end_time)
                    for start_time, end_time in timestamps
                ]
                final = editor.concatenate_videoclips(clips)

                # Get video's name.
                video_name = Path(os.path.basename(video_path)).stem
                os.makedirs(summary_output, exist_ok=True)
                final.write_videofile(
                    os.path.join(summary_output, f"{video_name}_summary.mp4")
                )
            finally:
                # Release the readers held by the source clip.
                video.close()
        finally:
            # Delete only this run's folder, then the shared folder if it is now empty.
            shutil.rmtree(temp_file_path, ignore_errors=True)
            try:
                os.rmdir(temp_file_dir)
            except OSError:
                pass

## **Download Videos**

In [72]:
# Videos to fetch; commented-out entries are skipped.
links = [
    # "https://www.youtube.com/watch?v=d0r0vzvqeoc&ab_channel=LubenTV",
    "https://www.youtube.com/watch?v=SvV6aUki6LU&list=PLCGIzmTE4d0iCqSmha1X7F-_AqB3jjo26&index=7&ab_channel=FIFA",
]

# Downloader options: single video only, files named after the video id.
ydl_opts = {
    "noplaylist": True,
    "outtmpl": os.path.join(save_videos_to, "%(id)s"),
    "format": "bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4] / bv*+ba/b",
}

with YoutubeDL(ydl_opts) as ydl:
    ydl.download(links)

[youtube:tab] Extracting URL: https://www.youtube.com/watch?v=SvV6aUki6LU&list=PLCGIzmTE4d0iCqSmha1X7F-_AqB3jjo26&index=7&ab_channel=FIFA
[youtube:tab] Downloading just the video SvV6aUki6LU because of --no-playlist
[youtube] Extracting URL: https://www.youtube.com/watch?v=SvV6aUki6LU
[youtube] SvV6aUki6LU: Downloading webpage
[youtube] SvV6aUki6LU: Downloading ios player API JSON
[youtube] SvV6aUki6LU: Downloading android player API JSON
[youtube] SvV6aUki6LU: Downloading m3u8 information
[info] SvV6aUki6LU: Downloading 1 format(s): 614+140
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 1207
[download] Destination: data\SvV6aUki6LU.f614.mp4
[download] 100% of    1.70GiB in 00:04:57 at 5.84MiB/s                        
[download] Destination: data\SvV6aUki6LU.f140.m4a
[download] 100% of   94.38MiB in 00:00:08 at 10.59MiB/s    
[Merger] Merging formats into "data\SvV6aUki6LU.mp4"
Deleting original file data\SvV6aUki6LU.f140.m4a (pass -k to keep)
Deleting original fil

## **Create Video Summary**

In [77]:
# Build the finder once; its image model is created in the constructor and reused for every video.
hf = HighlightsFinder(batch_size=32)

In [78]:
# Downloaded videos to summarize: generated summaries and notebook
# checkpoint copies are left out.
videos_output_path = [
    f
    for f in glob.glob(os.path.join(save_videos_to, "*.mp4"))
    if not ("_summary" in f or "ipynb_checkpoints" in f)
]

In [79]:
# Summarize every collected video; none is skipped, so a single download is still processed.
for video_path in videos_output_path:
    hf.create_video_summary(
        video_path=video_path,
        summary_output=summary_output,
        num_highlights=num_highlights,
        window=window,
    )