In [7]:
import glob
import math

# warnings.simplefilter(action='ignore')
import os
import shutil
# import warnings
from pathlib import Path
from typing import Any, List, Optional, Tuple, Union
from uuid import uuid4

# import av
import cv2
import librosa
import numpy as np
import torch
from moviepy import editor
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from PIL import Image
from pyAudioAnalysis import MidTermFeatures as mtf
from pyAudioAnalysis import audioTrainTest as at
from pydub import AudioSegment
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics.pairwise import cosine_distances
from sklearn.neighbors import LocalOutlierFactor
from torchvision import models, transforms
from yt_dlp import YoutubeDL

# from pytorchvideo.data.encoded_video import EncodedVideo


np.random.seed(0)

In [8]:
save_videos_to = "data"
window = 10
summary_output = "videos_summary"
num_highlights = 10

## **Helper Functions**

In [10]:
class ImageHighlightsFinder:
    def __init__(self, batch_size: int = 32) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.batch_size = batch_size
        model = models.resnet18(pretrained=True)
        self.model = torch.nn.Sequential(*(list(model.children())[:-1])).to(self.device)
        self.model.eval()

    def _get_transformations(self, will_be_saved: bool) -> List[Any]:
        transformations = [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
        if will_be_saved:
            transformations.append(transforms.ToPILImage())
        return transformations

    def _preprocess_image(
        self, image: Image.Image, will_be_saved: bool = False
    ) -> Union[torch.Tensor, Image.Image]:
        transformations = self._get_transformations(will_be_saved=will_be_saved)

        transform = transforms.Compose(transformations)
        image = transform(image)
        if will_be_saved:
            return image

        image = image.unsqueeze(0)
        # print(image.shape)
        return image

    def _chunks(self, lst, n):
        """
        Yield successive n-sized chunks from lst.
        """
        for i in range(0, len(lst), n):
            yield lst[i : i + n]

    def _create_feature_vectors(self, file_paths: List[str]) -> np.ndarray:
        features = None
        for file_paths_chunk in self._chunks(file_paths, n=self.batch_size):
            # Get the data for this batch.
            imgs = [Image.open(img).convert("RGB") for img in file_paths_chunk]
            imgs = [self._preprocess_image(img) for img in imgs]
            imgs = torch.cat(imgs, dim=0).to(self.device)

            # Convert them to features.
            with torch.no_grad():
                f = self.model(imgs)

            if features is None:
                features = f.clone()
            else:
                features = torch.cat((features, f), dim=0)

        features = features.squeeze(dim=-1)
        features = features.squeeze(dim=-1)
        features = features.cpu().detach().numpy()
        return features

    def _extract_frames(
        self,
        video_path: str,
        images_folder: str,
        start_at_sec: int = 5,
        window: int = 10,
    ):
        os.makedirs(images_folder, exist_ok=True)
        vidcap = cv2.VideoCapture(video_path)
        success, image = vidcap.read()
        success = True
        while success:
            vidcap.set(
                cv2.CAP_PROP_POS_MSEC, (start_at_sec * 1000)
            )  # One frame per second.
            success, image = vidcap.read()
            # print("Read a new frame: ", success)
            if success:
                cv2.imwrite(
                    os.path.join(images_folder, f"sec_{start_at_sec}.jpg"), image
                )  # save frame as JPEG file.
            start_at_sec += window

    def get_distances(
        self,
        video_path: str,
        temp_path: str,
        window: int = 10,
    ) -> np.ndarray:
        temp_file_path = os.path.join(temp_path, "image")
        start_at_sec = int(window * 0.5)
        self._extract_frames(
            video_path=video_path,
            images_folder=temp_file_path,
            start_at_sec=start_at_sec,
            window=window,
        )
        file_paths = glob.glob(os.path.join(temp_file_path, "*.jpg"))
        features = self._create_feature_vectors(file_paths=file_paths)
        distances = cosine_distances(features, features)
        del features
        median_distances = np.median(distances, axis=1)
        del distances
        assert median_distances.shape[0] == len(file_paths)
        shutil.rmtree(temp_file_path, ignore_errors=True)
        return median_distances

In [19]:
VideoFileClip

NameError: name 'VideoFileClip' is not defined

In [24]:
from moviepy.editor import VideoFileClip
video_file = './data/SvV6aUki6LU.mp4'
filename, ext = os.path.splitext(video_file)
clip = VideoFileClip(video_file)
clip.audio.write_audiofile(f"{filename}.mp3")


MoviePy - Writing audio in ./data/SvV6aUki6LU.mp3


                                                                          

MoviePy - Done.


In [21]:
filename, ext

('./data/SvV6aUki6LU', '.mp4')

In [None]:
from moviepy.editor import VideoFileClip


def convert_video_to_audio_moviepy(video_file, output_ext="mp3"):

    filename, ext = os.path.splitext(video_file)
    clip = VideoFileClip(video_file)
    clip.audio.write_audiofile(f"{filename}.{output_ext}")

In [None]:
filename, ext = os.path.splitext(video_file)
    subprocess.call(["ffmpeg", "-y", "-i", video_file, f"{filename}.{output_ext}"], 
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)

In [18]:
def _audio_seg(path: str, output_path: str, window: int = 10):
    os.makedirs(output_path, exist_ok=True)
    duration = librosa.get_duration(path=path)
    song = AudioSegment.from_file(path, format="mp3")
    window_ms = 1000 * window
    for j in range(1, math.floor(duration / window) + 1):
        start_ = (j - 1) * window_ms
        end_ = j * window_ms
        seg_ = song[start_:end_]
        seg_.export(
            os.path.join(
                output_path, f"sec_{int(start_ / 1000)}_{int(end_ / 1000)}.mp3"
            ),
            format="mp3",
        )

_audio_seg('./data/SvV6aUki6LU.mp4', './data/audio/', 10)

In [32]:
class AudioHighlightsFinder:
    def __init__(self) -> None:
        pass

    def _audio_seg(self, path: str, output_path: str, window: int = 10):
        os.makedirs(output_path, exist_ok=True)
        duration = librosa.get_duration(path=path)
        song = AudioSegment.from_file(path, format="mp3")
        window_ms = 1000 * window
        for j in range(1, math.floor(duration / window) + 1):
            start_ = (j - 1) * window_ms
            end_ = j * window_ms
            seg_ = song[start_:end_]
            seg_.export(
                os.path.join(
                    output_path, f"sec_{int(start_ / 1000)}_{int(end_ / 1000)}.mp3"
                ),
                format="mp3",
            )

    def _feature_extraction(self, directory: str) -> Tuple[np.ndarray, List[str]]:
        f1, _, feature_names = mtf.directory_feature_extraction(
            directory, 1, 1, 0.1, 0.1
        )
        mid_term_features = [f1]
        # convert list of feature matrices to x, y format:
        x, y = at.features_to_matrix(mid_term_features)
        m = x.mean(axis=0)
        s = np.std(x, axis=0)
        X = (x - m) / s
        return X, feature_names

    def _feature_selection(
        self, X: np.ndarray, feature_names: List[str]
    ) -> Tuple[np.ndarray, List[str]]:
        # Choose Features that have some variability
        threshold = 1
        selector = VarianceThreshold(threshold=threshold)
        X_selected = selector.fit_transform(X)
        selected_feature_indices = selector.get_support(indices=True)
        selected_feature_names = [feature_names[i] for i in selected_feature_indices]
        return X_selected, selected_feature_names

    def _outlier_detection(self, X: np.ndarray, num_high: int):
        clf = LocalOutlierFactor(n_neighbors=20, metric="cosine")
        clf.fit(X)
        outlier_scores = clf.negative_outlier_factor_
        sorted_indices = np.argsort(outlier_scores)
        highlight_indices = sorted_indices[:num_high]
        # print(outlier_detection(X_new, 10))
        return highlight_indices

    def get_distances(
        self,
        video_path: str,
        temp_path: str, 
        window: int = 10,
    ) -> np.ndarray:
        temp_file_path = os.path.join(temp_path, "audio")
        # Load the Video
        video = editor.VideoFileClip(video_path)
        # Extract the Audio
        audio = video.audio
        # Export the Audio
        audio_path = os.path.join(temp_file_path, "all_audio")
        os.makedirs(audio_path, exist_ok=True)
        audio.write_audiofile(os.path.join(audio_path, "audio.mp3"))
        del video, audio

        self._audio_seg(
            path=os.path.join(audio_path, "audio.mp3"),
            output_path=temp_file_path,
            window=window,
        )

        X, feature_names = self._feature_extraction(directory=temp_file_path)
        X, feature_names = self._feature_selection(X=X, feature_names=feature_names)

        distances = cosine_distances(X, X)
        del X
        median_distances = np.median(distances, axis=1)

        # highlights = self._outlier_detection(X=X, num_high=num_highlights)

        # shutil.rmtree(temp_file_path, ignore_errors=True)
        return median_distances

In [35]:
class HighlightsFinder:
    def __init__(self, batch_size: int = 32) -> None:
        self.ahf = AudioHighlightsFinder()
        self.ihf = ImageHighlightsFinder(batch_size=batch_size)
    
    def _str_to_int_tuple(self, s: str) -> Tuple[int, int]:
        start, end = s.split("_")
        start = int(start)
        end = int(end)
        return start, end

    def _merge_timestamps(self, timestamps: List[Tuple]) -> List[Tuple]:
        merged_timestamps = []
        timestamps.sort(key=lambda x: x[0])  # Sort the timestamps based on start time

        for timestamp in timestamps:
            if merged_timestamps and timestamp[0] == merged_timestamps[-1][1]:
                merged_timestamps[-1] = (
                    merged_timestamps[-1][0],
                    timestamp[1],
                )  # Extend the previous timestamp
            else:
                merged_timestamps.append(timestamp)  # Add a new timestamp

        return merged_timestamps

    def _convert_str_to_timestamps(self, highlights: List[str]) -> List[Tuple]:
        timestamps = [
            self._str_to_int_tuple(s=timestamp)
            for timestamp in highlights
        ]
        timestamps = self._merge_timestamps(timestamps=timestamps)
        return timestamps

    def create_video_summary(
        self, video_path: str, summary_output: str, num_highlights: int, window: int
    ):
        temp_file_dir = "temp"
        temp_file_path = os.path.join(temp_file_dir, str(uuid4()))

        # Get the distances from each modality.
        # image_distances = self.ihf.get_distances(
        #     video_path=video_path, temp_path=temp_file_path, window=window
        # )
        audio_distances = self.ahf.get_distances(
            video_path=video_path, temp_path=temp_file_path, window=window
        )
        # assert image_distances.shape[0] == audio_distances.shape[0]

        # # TODO: Add weight to each modality.
        # distances = np.add(image_distances, audio_distances)

        # # Get the idx of the segments with the greater distance.
        # idx = np.argsort(distances)[-num_highlights:]

        # # Load the video.
        # video = editor.VideoFileClip(video_path)
        # # Get the duration of the video in secs.
        # duration = video.duration

        # # Create the timestamps in the same way as they will get processed.
        # timestamps = [
        #     f"{(j - 1) * window}_{j * window}"
        #     for j in range(1, math.floor(duration / window) + 1)
        # ]
        # timestamps = list(sorted(timestamps))

        # # # Get the timestamps of the segments with the greater distance.
        # highlights = np.array(timestamps)[idx].tolist()

        # timestamps = self._convert_str_to_timestamps(highlights=highlights)
        
        # # Create the summary video.
        # clips = []
        # for start_time , end_time in timestamps:
        #     clip = video.subclip(start_time, end_time)
        #     clips.append(clip)
        
        # final = editor.concatenate_videoclips(clips)
        
        # # Get video's name.
        # video_name = os.path.basename(video_path)
        # video_name = Path(video_name).stem
        # os.makedirs(summary_output, exist_ok=True)
        # final.write_videofile(os.path.join(summary_output, f"{video_name}_summary.mp4"))
        # # Delete the temp dir.
        # shutil.rmtree(temp_file_dir, ignore_errors=True)

## **Download Videos**

In [13]:
# links = [
#     # "https://www.youtube.com/watch?v=d0r0vzvqeoc&ab_channel=LubenTV",
#     "https://www.youtube.com/watch?v=SvV6aUki6LU&list=PLCGIzmTE4d0iCqSmha1X7F-_AqB3jjo26&index=7&ab_channel=FIFA",
# ]

# ydl_opts = {"noplaylist": True, "outtmpl": os.path.join(save_videos_to, "%(id)s"), "format": "bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4] / bv*+ba/b"}

# with YoutubeDL(ydl_opts) as ydl:
#     ydl.download(links)

## **Create Video Summary**

In [36]:
hf = HighlightsFinder(batch_size=32)

In [28]:
videos_output_path = [f for f in glob.glob(os.path.join(save_videos_to, "*.mp4")) if "_summary" not in f and "ipynb_checkpoints" not in f]

In [29]:
videos_output_path

['data\\SvV6aUki6LU.mp4']

In [38]:
for video_path in videos_output_path:
    hf.create_video_summary(video_path=video_path, summary_output=summary_output, num_highlights=num_highlights, window=window)

In [43]:

X, feature_names = hf.ahf._feature_extraction(directory='./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio/')
X, feature_names = hf.ahf._feature_selection(X=X, feature_names=feature_names)

Analyzing file 1 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_1030_1040.mp3
Analyzing file 2 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_1130_1140.mp3
Analyzing file 3 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_1200_1210.mp3
Analyzing file 4 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_130_140.mp3
Analyzing file 5 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_1340_1350.mp3
Analyzing file 6 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_1440_1450.mp3
Analyzing file 7 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_1510_1520.mp3
Analyzing file 8 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_1610_1620.mp3
Analyzing file 9 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_1730_1740.mp3
Analyzing file 10 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_1740_1750.mp3
Analyzing file 11 of 21: ./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio\sec_19

In [46]:
import whisper
model_whisper = whisper.load_model("base.en")

100%|███████████████████████████████████████| 139M/139M [00:55<00:00, 2.64MiB/s]


In [60]:
prediction = model_whisper.transcribe('./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio/sec_1080_1090.mp3')

In [61]:
prediction

{'text': " Flicking by Greece but I'm flicked on and frauds take the lead in the World Cup final",
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 6.12,
   'text': " Flicking by Greece but I'm flicked on and frauds take the lead in the World Cup",
   'tokens': [50363,
    1610,
    7958,
    416,
    10315,
    475,
    314,
    1101,
    781,
    9484,
    319,
    290,
    7394,
    82,
    1011,
    262,
    1085,
    287,
    262,
    2159,
    5454,
    50669],
   'temperature': 0.0,
   'avg_logprob': -0.5892892250647912,
   'compression_ratio': 1.0493827160493827,
   'no_speech_prob': 0.11426424980163574},
  {'id': 1,
   'seek': 0,
   'start': 6.12,
   'end': 8.68,
   'text': ' final',
   'tokens': [50669, 2457, 50797],
   'temperature': 0.0,
   'avg_logprob': -0.5892892250647912,
   'compression_ratio': 1.0493827160493827,
   'no_speech_prob': 0.11426424980163574}],
 'language': 'en'}

In [54]:
med_model_whisper = whisper.load_model("medium.en")

100%|█████████████████████████████████████| 1.42G/1.42G [05:16<00:00, 4.82MiB/s]


In [59]:
med_model_whisper.transcribe('./temp/51a74d3a-907c-45a7-91cc-62a6a6acd1f1/audio/sec_1080_1090.mp3')

{'text': ' Flicked in by Griezmann and flicked on! And France take the lead in the World Cup Final!',
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 3.0,
   'text': ' Flicked in by Griezmann and flicked on!',
   'tokens': [50363,
    1610,
    9484,
    287,
    416,
    402,
    5034,
    89,
    9038,
    290,
    781,
    9484,
    319,
    0,
    50513],
   'temperature': 0.0,
   'avg_logprob': -0.3637124094469794,
   'compression_ratio': 1.0731707317073171,
   'no_speech_prob': 0.12039558589458466},
  {'id': 1,
   'seek': 0,
   'start': 3.0,
   'end': 7.0,
   'text': ' And France take the lead in the World Cup Final!',
   'tokens': [50513,
    843,
    4881,
    1011,
    262,
    1085,
    287,
    262,
    2159,
    5454,
    8125,
    0,
    50713],
   'temperature': 0.0,
   'avg_logprob': -0.3637124094469794,
   'compression_ratio': 1.0731707317073171,
   'no_speech_prob': 0.12039558589458466}],
 'language': 'en'}

In [62]:
from nrclex import NRCLex  

In [2]:
r"C:\Program Files\Tesseract-OCR\tesseract.exe"

'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

In [6]:
# Import required packages
import cv2
import pytesseract

# Mention the installed location of Tesseract-OCR in your system
pytesseract.pytesseract.tesseract_cmd = "C:/Program Files/Tesseract-OCR/tesseract.exe"

# Read image from which text needs to be extracted
img = cv2.imread("data\SvV6aUki6LU\sec_5975.jpg")

# Preprocessing the image starts

# Convert the image to gray scale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Performing OTSU threshold
ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)

# Specify structure shape and kernel size.
# Kernel size increases or decreases the area
# of the rectangle to be detected.
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 10))

# Applying dilation on the threshold image
dilation = cv2.dilate(thresh1, rect_kernel, iterations = 1)

# Finding contours
contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL,
												cv2.CHAIN_APPROX_NONE)

# Creating a copy of image
im2 = img.copy()

# A text file is created and flushed
file = open("recognized.txt", "w+")
file.write("")
file.close()

# Looping through the identified contours
# Then rectangular part is cropped and passed on
# to pytesseract for extracting text from it
# Extracted text is then written into the text file
for cnt in contours:
	x, y, w, h = cv2.boundingRect(cnt)
	
	# Drawing a rectangle on copied image
	rect = cv2.rectangle(im2, (x, y), (x + w, y + h), (0, 255, 0), 2)
	
	# Cropping the text block for giving input to OCR
	cropped = im2[y:y + h, x:x + w]
	
	# Open the file in append mode
	file = open("recognized.txt", "a")
	
	# Apply OCR on the cropped image
	text = pytesseract.image_to_string(cropped)
	
	# Appending the text into file
	file.write(text)
	file.write("\n")
	
	# Close the file
	file.close
