# Downloading the video

In [2]:
import yt_dlp

def download_video(url, output_path):
    """
    Fetch a video with yt-dlp and store it as ``input_vid.<ext>`` inside ``output_path``.

    Parameters:
    url (str): Address of the video to fetch.
    output_path (str): Directory that receives the downloaded file.

    Returns:
    dict or None: A dictionary with the keys "Author", "Title" and "Views" read from
    the info dict returned by yt-dlp, or None when any exception was raised
    (the error is printed instead of propagated).
    """
    try:
        options = dict(
            outtmpl=f'{output_path}/input_vid.%(ext)s',
            format='bestvideo+bestaudio/best',
        )
        with yt_dlp.YoutubeDL(options) as downloader:
            video_info = downloader.extract_info(url, download=True)
            # Map the public metadata labels to the corresponding info-dict keys.
            return {
                label: video_info.get(info_key)
                for label, info_key in (
                    ("Author", "uploader"),
                    ("Title", "title"),
                    ("Views", "view_count"),
                )
            }
    except Exception as err:
        print(f"An error occurred: {err}")
        return None


# Downloading the Video, the information and metadata

In [None]:
import yt_dlp
# Address of the video to download, defined here so the cell runs on a fresh kernel.
# It is the URL reported by the "[youtube] Extracting URL" line of this cell's log.
VIDEO_URL = "https://www.youtube.com/watch?v=dARr3lGKwk8"

# Define output template with metadata
# NOTE(review): the log below shows the default "<title> [<id>].<ext>" file name, which is
# also the name later cells open — confirm whether this template is the one actually wanted.
ydl_opts = {
    'outtmpl': '%(title).100s - %(uploader)s - %(view_count)s views.%(ext)s',
    # Limit title to 100 characters to avoid file system issues
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([VIDEO_URL])

Deprecated Feature: Support for Python version 3.8 has been deprecated. Please update to Python 3.9 or above


Defaulting to user installation because normal site-packages is not writeable
[youtube] Extracting URL: https://www.youtube.com/watch?v=dARr3lGKwk8
[youtube] dARr3lGKwk8: Downloading webpage
[youtube] dARr3lGKwk8: Downloading ios player API JSON
[youtube] dARr3lGKwk8: Downloading mweb player API JSON
[youtube] dARr3lGKwk8: Downloading player 8102da6c


         player = https://www.youtube.com/s/player/8102da6c/player_ias.vflset/en_US/base.js
         n = UTdH8TtAOjSGACiR ; player = https://www.youtube.com/s/player/8102da6c/player_ias.vflset/en_US/base.js
         player = https://www.youtube.com/s/player/8102da6c/player_ias.vflset/en_US/base.js
         n = 7n_UP1EDa_R4laJo ; player = https://www.youtube.com/s/player/8102da6c/player_ias.vflset/en_US/base.js


[youtube] dARr3lGKwk8: Downloading m3u8 information
[info] dARr3lGKwk8: Downloading 1 format(s): 137+251
[download] Destination: Parameterized Complexity of token sliding, token jumping - Amer Mouawad [dARr3lGKwk8].f137.mp4
[download] 100% of   77.76MiB in 00:00:23 at 3.24MiB/s     
[download] Destination: Parameterized Complexity of token sliding, token jumping - Amer Mouawad [dARr3lGKwk8].f251.webm
[download] 100% of   45.34MiB in 00:00:14 at 3.14MiB/s   
[Merger] Merging formats into "Parameterized Complexity of token sliding, token jumping - Amer Mouawad [dARr3lGKwk8].mkv"
Deleting original file Parameterized Complexity of token sliding, token jumping - Amer Mouawad [dARr3lGKwk8].f251.webm (pass -k to keep)
Deleting original file Parameterized Complexity of token sliding, token jumping - Amer Mouawad [dARr3lGKwk8].f137.mp4 (pass -k to keep)


# Video Processing

## Extract frame changes and their respective timestamps using PySceneDetect's SceneManager

In [None]:
import os
import cv2
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from scenedetect.video_splitter import split_video_ffmpeg

def save_scene_frames_with_timestamps(video_path, output_folder, threshold=5.0):
    """
    Detect scene changes in a video and save the first frame of every scene.

    Scenes are detected with a ContentDetector; for each scene the start frame is
    written to ``scene_NNNN.png`` and a line ``Scene NNNN: <seconds> seconds`` is
    appended to ``timestamps.txt``, both inside ``output_folder``.

    Parameters:
    - video_path: Path to the input video file.
    - output_folder: Folder that receives the frames and timestamps.txt (created if missing).
    - threshold: Threshold passed to ContentDetector.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Setup SceneDetect
    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=threshold))

    try:
        video_manager.set_downscale_factor()
        video_manager.start()

        # Detect scenes
        scene_manager.detect_scenes(frame_source=video_manager)
        scene_list = scene_manager.get_scene_list()
    finally:
        # Release the decoder even when detection fails.
        video_manager.release()

    print(f"{len(scene_list)} scenes detected.")

    # Open original video with OpenCV for frame extraction
    cap = cv2.VideoCapture(video_path)
    try:
        timestamp_path = os.path.join(output_folder, "timestamps.txt")
        with open(timestamp_path, "w") as timestamp_file:
            for scene_number, (start_time, _) in enumerate(scene_list, start=1):
                # Seek to the first frame of the scene
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(start_time.get_frames()))
                ret, frame = cap.read()
                if not ret:
                    # Keep going, but make the gap in the numbering visible.
                    print(f"Warning: could not read the start frame of scene {scene_number}; skipped.")
                    continue

                frame_filename = os.path.join(output_folder, f"scene_{scene_number:04d}.png")
                cv2.imwrite(frame_filename, frame)

                timestamp_file.write(f"Scene {scene_number:04d}: {start_time.get_seconds():.2f} seconds\n")
                print(f"Saved scene {scene_number} at {start_time.get_timecode()}")
    finally:
        cap.release()

    print(f"Scene frames and timestamps saved in: {output_folder}")

# Run scene detection on the downloaded video; the frames and timestamps.txt
# are written to the 'scene_changes_output' folder.
if __name__ == "__main__":
    video_path = 'Parameterized Complexity of token sliding, token jumping - Amer Mouawad [dARr3lGKwk8].mkv'
    output_folder = 'scene_changes_output'
    save_scene_frames_with_timestamps(video_path, output_folder)


  from .autonotebook import tqdm as notebook_tqdm
VideoManager is deprecated and will be removed.


57 scenes detected.
Saved scene 1 at 00:00:00.000
Saved scene 2 at 00:00:00.680
Saved scene 3 at 00:01:53.400
Saved scene 4 at 00:03:16.880
Saved scene 5 at 00:03:20.520
Saved scene 6 at 00:04:50.120
Saved scene 7 at 00:06:16.800
Saved scene 8 at 00:08:08.440
Saved scene 9 at 00:09:26.800
Saved scene 10 at 00:09:46.840
Saved scene 11 at 00:11:12.480
Saved scene 12 at 00:12:00.680
Saved scene 13 at 00:12:12.880
Saved scene 14 at 00:12:48.240
Saved scene 15 at 00:14:57.400
Saved scene 16 at 00:16:18.040
Saved scene 17 at 00:16:48.680
Saved scene 18 at 00:17:40.200
Saved scene 19 at 00:18:16.640
Saved scene 20 at 00:18:55.440
Saved scene 21 at 00:21:56.240
Saved scene 22 at 00:22:48.680
Saved scene 23 at 00:23:24.680
Saved scene 24 at 00:23:45.840
Saved scene 25 at 00:23:53.560
Saved scene 26 at 00:25:05.600
Saved scene 27 at 00:28:42.960
Saved scene 28 at 00:29:10.120
Saved scene 29 at 00:29:24.160
Saved scene 30 at 00:30:00.680
Saved scene 31 at 00:30:40.880
Saved scene 32 at 00:30:54.2

## Extract frame changes and their respective time-stamps by considering the frame contrast change

In [None]:
import cv2
import os
import numpy as np

def calculate_frame_difference(prev_frame, current_frame):
    """
    Score how strongly two frames differ.

    Both frames are converted from BGR to grayscale and the score is the sum of
    the per-pixel absolute differences (it is not a mean squared error).
    """
    gray_pair = [cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in (prev_frame, current_frame)]
    return np.sum(cv2.absdiff(gray_pair[0], gray_pair[1]))

def save_frames_with_timestamps(video_path, output_folder, threshold=1000000):
    """
    Save every frame whose difference to the previous frame exceeds ``threshold``.

    Each saved frame is written to ``frame_NNNN.png`` and a matching line
    ``Frame NNNN: <seconds> seconds`` is written to ``timestamps.txt``, both
    inside ``output_folder``. NNNN is the index of the saved frame in the video
    and the timestamp is that index divided by the reported frame rate.

    Parameters:
    - video_path: The path to the input video file.
    - output_folder: The folder where the frames and timestamps will be saved.
    - threshold: The difference threshold to detect scene changes (higher = less sensitivity).

    Returns None. Errors opening or reading the video are printed, not raised.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error: Couldn't open video.")
            return

        fps = cap.get(cv2.CAP_PROP_FPS)  # Frames per second of the video
        if not fps or fps <= 0:
            # Without a frame rate no timestamp can be computed (avoids division by zero).
            print("Error: Couldn't determine the video frame rate.")
            return

        # Read the first frame; it only serves as the initial reference
        ret, prev_frame = cap.read()
        if not ret:
            print("Error: Couldn't read the first frame.")
            return

        # Frame 0 was consumed above, so the first frame read in the loop is frame 1.
        frame_index = 1

        timestamp_filename = os.path.join(output_folder, "timestamps.txt")
        with open(timestamp_filename, 'w') as timestamp_file:
            while True:
                ret, current_frame = cap.read()
                if not ret:
                    break  # End of video

                frame_diff = calculate_frame_difference(prev_frame, current_frame)

                # If the difference exceeds the threshold, consider it a scene change
                if frame_diff > threshold:
                    timestamp = frame_index / fps  # Time in seconds

                    frame_filename = os.path.join(output_folder, f"frame_{frame_index:04d}.png")
                    cv2.imwrite(frame_filename, current_frame)

                    timestamp_file.write(f"Frame {frame_index:04d}: {timestamp:.4f} seconds\n")

                prev_frame = current_frame
                frame_index += 1
    finally:
        # Release the capture on every exit path, including the early returns.
        cap.release()

    print(f"Frames extraction completed. Timestamps saved to {timestamp_filename}.")

# Example usage:
# Runs the frame-difference extraction on the downloaded video with the default
# threshold; output goes to 'frames_output_folder'.
video_path = 'Parameterized Complexity of token sliding, token jumping - Amer Mouawad [dARr3lGKwk8].mkv'
output_folder = 'frames_output_folder'
save_frames_with_timestamps(video_path, output_folder)


## Separating audio from the video

In [9]:
from moviepy import VideoFileClip
import os
import speech_recognition as sr
filepath = 'Parameterized Complexity of token sliding, token jumping - Amer Mouawad [dARr3lGKwk8].mkv'
output_audio_path = 'audio.wav'


def extract_audio_from_video(video_path, output_audio_path):
    """
    Extract audio from a video file and save it to the specified path.

    Parameters:
    video_path (str): The path to the video file.
    output_audio_path (str): The path to save the extracted audio.

    Raises:
    ValueError: If the clip exposes no audio track.
    """
    clip = VideoFileClip(video_path)
    try:
        audio = clip.audio
        if audio is None:
            raise ValueError(f"No audio track found in {video_path}")
        audio.write_audiofile(output_audio_path)
    finally:
        # Close the clip even when writing the audio fails.
        clip.close()
    print(f"Audio extracted and saved to {output_audio_path}")


extract_audio_from_video(filepath, output_audio_path)

{'video_found': True, 'audio_found': True, 'metadata': {'COMPATIBLE_BRANDS': 'iso6avc1mp41', 'MAJOR_BRAND': 'dash', 'MINOR_VERSION': '0', 'ENCODER': 'Lavf62.0.102'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1920, 1080], 'bitrate': None, 'fps': 25.0, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'HANDLER_NAME': 'ISO Media file produced by Google Inc.', 'VENDOR_ID': '[0][0][0][0]', 'DURATION': '01:02:37.600000000'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 48000, 'bitrate': None, 'metadata': {'Metadata': '', 'DURATION': '01:02:37.628000000'}}], 'input_number': 0}], 'duration': 3757.63, 'bitrate': 273, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(High)', 'video_size': [1920, 1080], 'video_bitrate': None, 'video_fps': 25.0, 'de

                                                                        

MoviePy - Done.
Audio extracted and saved to audio.wav




## Transcribing the audio

In [None]:
import whisper
import torch
from faster_whisper import WhisperModel

def _group_segments_into_paragraphs(segments, gap_threshold):
    """Merge (start, end, text) segments into (start, text) paragraphs, splitting on pauses longer than gap_threshold."""
    paragraphs = []
    current_paragraph = ""
    current_start = None
    prev_end = None

    for start, end, text in segments:
        text = text.strip()

        # If this is the first segment or there's a long pause, start a new paragraph
        if current_start is None or (start - prev_end) > gap_threshold:
            if current_paragraph:
                paragraphs.append((current_start, current_paragraph.strip()))
            current_paragraph = text
            current_start = start
        else:
            current_paragraph += " " + text

        prev_end = end

    # Append last paragraph
    if current_paragraph:
        paragraphs.append((current_start, current_paragraph.strip()))

    return paragraphs


def _write_paragraphs(paragraphs, output_text_file):
    """Write (start, text) paragraphs as '[MM:SS] Paragraph n:' entries separated by blank lines."""
    with open(output_text_file, "w", encoding="utf-8") as f:
        for i, (timestamp, paragraph) in enumerate(paragraphs, 1):
            minutes = int(timestamp // 60)
            seconds = int(timestamp % 60)
            time_str = f"{minutes:02d}:{seconds:02d}"
            f.write(f"[{time_str}] Paragraph {i}:\n{paragraph}\n\n")


def audio_to_text_whisper(audio_path, output_text_file, gap_threshold=2.5):
    """
    Transcribes audio using Whisper, chunks it into paragraphs based on silence gaps,
    and saves each paragraph with the starting timestamp.

    Parameters:
    - audio_path (str): Path to the input audio file.
    - output_text_file (str): Path to save the paragraph-formatted transcription text.
    - gap_threshold (float): Minimum gap (in seconds) to define a new paragraph.

    Returns:
    - List of (start_seconds, text) tuples, one per paragraph.
    """
    # Fall back to CPU when no GPU is present, like the Faster-Whisper variant does.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("base", device=device)

    print("Transcribing audio with Whisper...")
    result = model.transcribe(audio_path, verbose=False, word_timestamps=False)

    paragraphs = _group_segments_into_paragraphs(
        ((segment["start"], segment["end"], segment["text"]) for segment in result["segments"]),
        gap_threshold,
    )
    _write_paragraphs(paragraphs, output_text_file)

    print(f"Paragraph transcription saved to {output_text_file}")
    return paragraphs


def audio_to_text_faster_whisper(audio_path, output_text_file, gap_threshold=2.5, model_size="base"):
    """
    Transcribes audio using Faster-Whisper, chunks it into paragraphs based on silence gaps,
    and saves each paragraph with the starting timestamp.

    Parameters:
    - audio_path (str): Path to the input audio file.
    - output_text_file (str): Path to save the paragraph-formatted transcription text.
    - gap_threshold (float): Minimum gap (in seconds) to define a new paragraph.
    - model_size (str): Model variant to use, e.g., "base", "small", "medium", "large-v2".

    Returns:
    - List of (start_seconds, text) tuples, one per paragraph.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float16" if device == "cuda" else "int8"

    print(f"Loading Faster-Whisper model ({model_size}) on {device}...")
    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    print("Transcribing audio with Faster-Whisper...")
    segments, _ = model.transcribe(audio_path, word_timestamps=False)

    # The generator keeps `segments` lazily consumed, exactly one item at a time.
    paragraphs = _group_segments_into_paragraphs(
        ((segment.start, segment.end, segment.text) for segment in segments),
        gap_threshold,
    )
    _write_paragraphs(paragraphs, output_text_file)

    print(f"Paragraph transcription saved to {output_text_file}")
    return paragraphs

# Transcribe the extracted audio with both back ends so their outputs can be compared:
# Whisper writes transcription.txt, Faster-Whisper writes transcription2.txt.
if __name__ == "__main__":
    audio_path = "audio.wav"
    output_text_file = "transcription.txt"
    paragraphs = audio_to_text_whisper(audio_path, output_text_file)

    # `output_text_file` and `paragraphs` are reused, so only the Faster-Whisper result remains bound afterwards.
    output_text_file = "transcription2.txt"
    paragraphs = audio_to_text_faster_whisper(audio_path, output_text_file)



Transcribing from file...


RequestError: recognition request failed: Service Unavailable

## Embedding the frames

In [1]:
import os
import json
import faiss
import numpy as np
from PIL import Image
import open_clip
import torch
from torchvision import transforms


def load_timestamps(timestamp_file):
    """
    Read a timestamps.txt file and map image file names to timestamps in seconds.

    Two line formats are understood:
    - ``Frame NNNN: <seconds> seconds`` -> key ``frame_NNNN.png``
    - ``Scene NNNN: <seconds> seconds`` -> key ``scene_NNNN.png``

    Numbers are left-padded with zeros to four digits. Blank lines and lines
    that do not split into exactly two parts on ": " are ignored.

    Returns:
        dict mapping image file name (str) to timestamp (float).
    """
    timestamps = {}
    with open(timestamp_file, "r") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            parts = line.split(": ")
            if len(parts) != 2:
                continue
            label, value = parts
            if label.startswith("Scene "):
                # Scene lines belong to images named scene_NNNN.png, not frame_NNNN.png.
                stem, number = "scene", label[len("Scene "):]
            else:
                stem, number = "frame", label.replace("Frame ", "")
            timestamps[f"{stem}_{number.zfill(4)}.png"] = float(value.split()[0])
    return timestamps


def preprocess_image(image_path, transform):
    """
    Load an image as RGB, apply ``transform`` and add a leading batch dimension.

    The file is opened in a ``with`` block so its handle is closed as soon as the
    RGB copy has been created.
    """
    with Image.open(image_path) as image:
        rgb_image = image.convert("RGB")
    return transform(rgb_image).unsqueeze(0)


def embed_images(image_folder, timestamps):
    """
    Embed every ``.png`` file in ``image_folder`` with the OpenCLIP ViT-B-32 image encoder.

    Files are processed in sorted name order. A file that raises during loading or
    encoding is reported and skipped. Each embedded file gets a metadata record with
    its file name and the timestamp looked up in ``timestamps`` (-1 when missing).

    Returns:
        (embeddings, metadata): a float32 array with one row per image and the
        list of metadata dicts in the same order.

    Raises:
        ValueError: If no image could be embedded.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    clip_model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='openai')
    clip_model = clip_model.to(device)
    clip_model.eval()

    vectors, records = [], []
    png_names = sorted(name for name in os.listdir(image_folder) if name.endswith(".png"))

    for png_name in png_names:
        png_path = os.path.join(image_folder, png_name)
        try:
            batch = preprocess_image(png_path, preprocess).to(device)
            with torch.no_grad():
                vector = clip_model.encode_image(batch).cpu().numpy().flatten()

            records.append({"filename": png_name, "timestamp": timestamps.get(png_name, -1)})
            vectors.append(vector)
        except Exception as err:
            print(f"Skipping {png_name}: {err}")

    if len(vectors) == 0:
        raise ValueError("No image embeddings were generated. Check image input and model compatibility.")

    return np.vstack(vectors).astype("float32"), records


def save_faiss_index(embeddings, index_path):
    """Build a flat L2 FAISS index over the rows of ``embeddings`` and write it to ``index_path``."""
    dimension = embeddings.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(embeddings)
    faiss.write_index(flat_index, index_path)


def save_metadata(metadata, metadata_path):
    """Serialize ``metadata`` as UTF-8 JSON with a two-space indent into ``metadata_path``."""
    with open(metadata_path, "w", encoding="utf-8") as json_file:
        json.dump(metadata, fp=json_file, indent=2)


def build_image_rag_database(image_folder, timestamp_file, index_output_path="image_rag_index.faiss", metadata_output_path="image_metadata.json"):
    """
    Run the image pipeline end to end: load timestamps, embed the images,
    then write the FAISS index and the JSON metadata, printing progress along the way.
    """
    print("Loading timestamps...")
    frame_times = load_timestamps(timestamp_file)

    print("Embedding images...")
    vectors, records = embed_images(image_folder, frame_times)

    print("Saving FAISS index...")
    save_faiss_index(vectors, index_output_path)

    print("Saving metadata...")
    save_metadata(records, metadata_output_path)

    processed = len(records)
    print(f"Done. {processed} images processed.")


# Example usage
# Builds the image index from the scene-detection output folder and the timestamps.txt stored inside it.
if __name__ == "__main__":
    image_folder = "scene_changes_output"
    timestamp_file = os.path.join(image_folder, "timestamps.txt")
    build_image_rag_database(image_folder, timestamp_file)


  from .autonotebook import tqdm as notebook_tqdm


Loading timestamps...
Embedding images...




Saving FAISS index...
Saving metadata...
Done. 57 images processed.


## Embedding the text by chunking each paragraph

In [11]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer
import re

def load_transcription_paragraphs(filepath):
    """
    Loads transcription paragraphs from a file with the format:
    [MM:SS] Paragraph n:\n<text>\n\n

    The minutes field may have more than two digits (recordings of 100 minutes
    or longer), e.g. ``[100:05]``.

    Returns:
        List of (timestamp_str, text) tuples.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    # `\d{2,}`: minutes are written zero-padded to two digits but are not capped at 99.
    entries = re.findall(r"\[(\d{2,}:\d{2})\] Paragraph \d+:\n(.+?)(?=\n\n|\Z)", content, re.DOTALL)
    return [(timestamp.strip(), text.strip()) for timestamp, text in entries]


def embed_with_overlap(paragraphs, model_name="sentence-transformers/all-MiniLM-L6-v2", window_size=3, stride=1):
    """
    Embeds text with overlapping windows.

    When there are fewer paragraphs than ``window_size``, a single shorter window
    containing all of them is embedded instead of returning nothing.

    Parameters:
    - paragraphs (list): List of (timestamp, text) tuples.
    - model_name (str): SentenceTransformer model name.
    - window_size (int): Number of paragraphs per window.
    - stride (int): Number of paragraphs to slide forward each step.

    Returns:
    - List of dictionaries with 'start_time', 'end_time', 'text', 'embedding'.
      'end_time' is the timestamp of the last paragraph in the window.
    """
    if not paragraphs:
        # Nothing to embed; skip loading the model.
        return []

    model = SentenceTransformer(model_name)
    embeddings = []

    # Index of the last full window; 0 when the input is shorter than one window.
    last_start = max(len(paragraphs) - window_size, 0)

    for i in range(0, last_start + 1, stride):
        chunk = paragraphs[i:i + window_size]
        full_text = " ".join(p[1] for p in chunk)

        embeddings.append({
            "start_time": chunk[0][0],
            "end_time": chunk[-1][0],
            "text": full_text,
            "embedding": model.encode(full_text)
        })

    return embeddings

def embed_with_phrase_overlap(paragraphs, model_name="sentence-transformers/all-MiniLM-L6-v2", window_size=5, overlap=2):
    """
    Embed the transcription in sliding windows that share entries with their neighbours.

    Consecutive windows start ``window_size - overlap`` entries apart, so each
    window repeats the last ``overlap`` entries of the previous one. Windows at
    the end of the input may be shorter than ``window_size``.

    Parameters:
    - paragraphs (list): List of (timestamp, text) tuples.
    - model_name (str): SentenceTransformer model name.
    - window_size (int): Total number of phrases per window.
    - overlap (int): Number of overlapping phrases between windows.

    Returns:
    - List of dicts with 'start_time', 'end_time', 'text', 'embedding'.
    """
    assert overlap < window_size, "Overlap must be smaller than window size"

    encoder = SentenceTransformer(model_name)
    windows = []

    for start in range(0, len(paragraphs), window_size - overlap):
        window = paragraphs[start:start + window_size]
        joined_text = " ".join(entry[1] for entry in window)
        vector = encoder.encode(joined_text)
        windows.append({
            "start_time": window[0][0],
            "end_time": window[-1][0],
            "text": joined_text,
            "embedding": vector
        })

    return windows
# Embed the Whisper transcription in windows of 5 paragraphs that overlap by 2.
# Note: `filepath` and `paragraphs` rebind names used by earlier cells.
filepath = "transcription.txt"
paragraphs = load_transcription_paragraphs(filepath)
overlap_embeddings = embed_with_phrase_overlap(paragraphs, window_size=5, overlap=2)

# Sanity check: start time and embedding shape of the first window.
print(overlap_embeddings[0]["start_time"], overlap_embeddings[0]["embedding"].shape)


00:00 (384,)


# Building the search engine

In [None]:
import re
import numpy as np
import hnswlib
import faiss
from sentence_transformers import SentenceTransformer

def load_and_group_srt_with_overlap(srt_file, group_size=5, overlap_size=2):
    """
    Parse an SRT file and join its cues into overlapping groups.

    Parameters:
    - srt_file (str): Path to the .srt file (read as UTF-8).
    - group_size (int): Number of cues joined into one paragraph.
    - overlap_size (int): Number of cues shared by consecutive paragraphs.

    Returns:
    - (paragraphs, paragraph_timestamps): the joined text of every group and, in
      the same order, a (start, end) tuple holding the start time of the group's
      first cue and the end time of its last cue, as SRT time strings.

    Raises:
    - ValueError: If overlap_size is not smaller than group_size (the window
      would never advance).
    """
    if overlap_size >= group_size:
        raise ValueError("overlap_size must be smaller than group_size")

    pattern = re.compile(r'\d+\s*\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\s*\n(.*?)\n(?=\d+\n|\Z)', re.DOTALL)
    with open(srt_file, 'r', encoding='utf-8') as f:
        srt_content = f.read()

    # The pattern needs a newline after every cue; add one so the last cue of a
    # file without a trailing newline is not dropped.
    if not srt_content.endswith("\n"):
        srt_content += "\n"
    matches = pattern.findall(srt_content)

    # Extract and clean text lines + timestamps
    timestamps = [(start, end) for start, end, _ in matches]
    phrases = [text.replace('\n', ' ').strip() for _, _, text in matches]

    # Group with overlap
    paragraphs = []
    paragraph_timestamps = []
    step = group_size - overlap_size  # slide forward with overlap
    for i in range(0, len(phrases), step):
        group_timestamps = timestamps[i:i + group_size]
        paragraphs.append(" ".join(phrases[i:i + group_size]))
        paragraph_timestamps.append((group_timestamps[0][0], group_timestamps[-1][1]))

    return paragraphs, paragraph_timestamps

def embed_and_save(paragraphs, timestamps=None, model_name='all-MiniLM-L6-v2',
                   index_path='hnsw_index.bin', embeddings_path='embeddings.npy',
                   timestamps_path='timestamps.npy', faiss_index_path='index.faiss'):
    """
    Embed ``paragraphs`` and persist the vectors in several forms.

    The embeddings are L2-normalized in place, then written as an HNSWlib cosine
    index, a raw ``.npy`` array and a FAISS inner-product index. ``timestamps``
    are saved as a ``.npy`` array only when provided.

    Returns:
        (index, embeddings): the HNSWlib index and the normalized float32 array.
    """
    encoder = SentenceTransformer(model_name)

    print("Generating embeddings...")
    vectors = np.array(encoder.encode(paragraphs, show_progress_bar=True), dtype=np.float32)

    # Unit-length vectors make inner product equal to cosine similarity.
    faiss.normalize_L2(vectors)

    dim, num_elements = vectors.shape[1], vectors.shape[0]

    # HNSWlib index
    hnsw_index = hnswlib.Index(space='cosine', dim=dim)
    hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16)
    hnsw_index.add_items(vectors, np.arange(num_elements))
    hnsw_index.save_index(index_path)
    print(f"HNSWlib index saved to: {index_path}")

    # Raw embeddings
    np.save(embeddings_path, vectors)
    print(f"Embeddings saved to: {embeddings_path}")

    # Timestamps (optional)
    if timestamps is not None:
        np.save(timestamps_path, np.array(timestamps))
        print(f"Timestamps saved to: {timestamps_path}")

    # FAISS index (inner product, equivalent to cosine after normalization)
    flat_index = faiss.IndexFlatIP(dim)
    flat_index.add(vectors)
    faiss.write_index(flat_index, faiss_index_path)
    print(f"FAISS index saved to: {faiss_index_path}")

    return hnsw_index, vectors

# Build one set of indexes per (transcript, embedding model) combination:
# two SRT transcripts x two SentenceTransformer models = four runs.
# NOTE(review): the HNSW file names of the multi-qa runs carry an extra "_embeddings"
# suffix that the all-MiniLM runs do not have — confirm before loading them elsewhere.

# Process Whisper + all-MiniLM-L6-v2
paragraphs1, timestamps1 = load_and_group_srt_with_overlap('Whisper.srt',  group_size=5, overlap_size=2)
embed_and_save(paragraphs1, timestamps1, model_name='all-MiniLM-L6-v2',
               index_path='hnsw_Whisper_all-MiniLM-L6-v2.bin',
               embeddings_path='Whisper_all-MiniLM-L6-v2_embeddings.npy',
               timestamps_path='Whisper_all-MiniLM-L6-v2_timestamps.npy',
               faiss_index_path='faiss_Whisper_all-MiniLM-L6-v2.index')

# Process Faster Whisper + all-MiniLM-L6-v2
paragraphs2, timestamps2 = load_and_group_srt_with_overlap('Faster_Whisper.srt',  group_size=5, overlap_size=2)
embed_and_save(paragraphs2, timestamps2, model_name='all-MiniLM-L6-v2',
               index_path='hnsw_Faster_Whisper_all-MiniLM-L6-v2.bin',
               embeddings_path='Faster_Whisper_all-MiniLM-L6-v2_embeddings.npy',
               timestamps_path='Faster_Whisper_all-MiniLM-L6-v2_timestamps.npy',
               faiss_index_path='faiss_Faster_Whisper_all-MiniLM-L6-v2.index')

# Process Whisper + multi-qa-MiniLM-L6-cos-v1
# (the SRT file is parsed again with the same arguments as above)
paragraphs1, timestamps1 = load_and_group_srt_with_overlap('Whisper.srt',  group_size=5, overlap_size=2)
embed_and_save(paragraphs1, timestamps1, model_name='multi-qa-MiniLM-L6-cos-v1',
               index_path='hnsw_Whisper_multi-qa-MiniLM-L6-cos-v1_embeddings.bin',
               embeddings_path='Whisper_multi-qa-MiniLM-L6-cos-v1_embeddings.npy',
               timestamps_path='Whisper_multi-qa-MiniLM-L6-cos-v1_timestamps.npy',
               faiss_index_path='faiss_Whisper_multi-qa-MiniLM-L6-cos-v1.index')

# Process Faster Whisper + multi-qa-MiniLM-L6-cos-v1
# (the SRT file is parsed again with the same arguments as above)
paragraphs2, timestamps2 = load_and_group_srt_with_overlap('Faster_Whisper.srt',  group_size=5, overlap_size=2)
embed_and_save(paragraphs2, timestamps2, model_name='multi-qa-MiniLM-L6-cos-v1',
               index_path='hnsw_Faster_Whisper_multi-qa-MiniLM-L6-cos-v1_embeddings.bin',
               embeddings_path='Faster_Whisper_multi-qa-MiniLM-L6-cos-v1_embeddings.npy',
               timestamps_path='Faster_Whisper_multi-qa-MiniLM-L6-cos-v1_timestamps.npy',
               faiss_index_path='faiss_Faster_Whisper_multi-qa-MiniLM-L6-cos-v1.index')




## 

# Fine-tuning a small LLM (distilgpt2)

In [1]:
import json
from datasets import Dataset
import torch
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the pretrained distilgpt2 checkpoint and move it to the GPU when one is available.
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load your JSON file
# The loaded value is iterated item by item further down, so it is presumably a list of records.
with open("Data_fine_tuning.json") as f:
    data = json.load(f)

# Format into prompt-completion pairs
def format_example(example):
    """
    Turn one record into a prompt/completion pair.

    The prompt is ``Q: <question>`` followed by a new line and ``A:``; the
    completion is the answer and the timestamp, each preceded by a space.
    """
    return {
        "prompt": f"Q: {example['question']}\nA:",
        "completion": f" {example['answer']} {example['timestamp']}",
    }

# Convert every loaded record into a prompt/completion pair.
formatted_data = [format_example(item) for item in data]

# Create a Hugging Face dataset
dataset = Dataset.from_list(formatted_data)
from transformers import AutoTokenizer

# Tokenizer matching the distilgpt2 checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT2 doesn't have pad token by default

def tokenize(batch):
    """Concatenate each prompt with its completion and tokenize the result, padded/truncated to 256 tokens."""
    joined_texts = [
        prompt_text + completion_text
        for prompt_text, completion_text in zip(batch["prompt"], batch["completion"])
    ]
    return tokenizer(joined_texts, padding="max_length", truncation=True, max_length=256)

# Tokenize the whole dataset in batches.
tokenized_dataset = dataset.map(tokenize, batched=True)


training_args = TrainingArguments(
    output_dir="./finetuned-distilgpt2",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    save_total_limit=2,
    # Mixed precision only when a GPU is present, matching the CPU fallback used for `device`.
    fp16=torch.cuda.is_available(),
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    report_to="none"
)

# mlm=False selects causal language modeling (labels are derived from the inputs).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# NOTE(review): the cell output shows a warning pointing at this Trainer(...) call —
# presumably about the `tokenizer=` argument; confirm against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()
def ask_question(question):
    """
    Generate and print a sampled answer to a question.

    The question is wrapped in the "Q: ...\\nA:" prompt format, passed to the
    module-level ``model``, and the decoded text (prompt included) is printed.

    Parameters:
    question (str): The question to ask.

    Returns:
    None: The generated text is printed, not returned.
    """
    # generate() does not switch the module to evaluation mode itself; do it
    # here so dropout is not left active if the model was last used for training.
    model.eval()
    input_text = f"Q: {question}\nA:"
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_length=100,  # total length: prompt tokens count towards this limit
        do_sample=True,  # sampling, so repeated calls give different text
        # Passing the pad token explicitly avoids the per-call
        # "Setting `pad_token_id` to `eos_token_id`" warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))





Map:   0%|          | 0/38 [00:00<?, ? examples/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,3.9682
20,3.1622
30,2.8211


In [3]:
# Probe questions for the fine-tuned model, asked in this order. Some are
# intentionally repeated or reworded; ask_question samples its output, so
# identical questions can produce different answers.
demo_questions = [
    "Who is the Speaker?",
    "Which university is he from?",
    "does the video mention any details about lightsabers?",
    "Who is the emperor?",
    "What is the main topic of the video?",
    "If unsurem answer with I don't know, What is the main topic of the video?",
    "What is the topic of discussion?",
    "Who let the dogs out?",
    "What is Token sliding?",
    "What is Token jumping?",
    "What is the main topic of the video?",
    "Who is the speaker, and from which university does he come from?",
    "If unsure, answer with 'I don't know'. What is the main topic of the video?",
    "does this video mention Batman's true Identity?",
    "Which puzzle did this video mention?",
    "Which character is stronger according to the video, bombardino crocodile or tralalero tralala",
]

for demo_question in demo_questions:
    ask_question(demo_question)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: Who is the Speaker?
A: First, the Speaker of Congress is from an institution where the speaker is elected. Second, the speaker is chosen by a vote in Congress within 8-10 minutes. Third, the speaker is elected in an hour by minutes by a vote or 40 minutes (depending on the length of the speaker). 4:40–5:30:30:00 C-tailed test: 2:15–5:30:55:00 C-tailed test


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: Which university is he from?
A: It is the University of Southern Illinois, University of Colorado and the University of Oregon. The research was conducted in 2010 at the University of Georgia and was conducted as part of an ongoing work on the topic of the role of cross-hairs in the problem problem.[7]
A: What is the role of cross-hairs in problem problems?
A: The result: Cross-hairs in problems is relatively similar to the problem


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: does the video mention any details about lightsabers?
A: Yes, we mentioned these features, but did not mention other details about it. For example, when moving a circle with a circle on the graph, the feature clearly states they have different objects and can be seen in 2–3 diagrams.
A: What was omitted with the image is that the same triangles are a bit larger and 3-yield triangles?
A: Yes, the idea of a two-


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: Who is the emperor?
A: Yes, it is known that the emperor is king (Q3:28–32). One could see in the book that the emperor was king of kingdoms of the king’s brother, as opposed to those two members and the queen itself, in which case the emperor was king. 53:58–55:13–43:40–56:00–57:15:55–58:10:00–53:00–


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is the main topic of the video?
A: The video can be used to teach about the problem solving: what is the problem? (A:00:06–4:59–4:30–4:30–4:40–4:50–4:45–4:50–4:50–4:50–4:50–4:50–4:50–4:50–4:50–4:50–4:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: If unsurem answer with I don't know, What is the main topic of the video?
A: No, yes, yes and no: 1:00–2:40:30-3:00:00–1:00:00:30–2:00:00:30–3:00:30–3:00:00:30–4:00:00:00–5:00:00:00:00:00:00–


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is the topic of discussion?
A: The topic of topic will focus on the question about how to make decisions according to what is important in complex problems. The topic will focus on a particular topic, or a common question, and the following topic will highlight a specific topic: what is related to a problem: why does "mathematical reduction" be one of the main areas of problem solving problem?
A: No, the topic will focus on several topics: a more


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: Who let the dogs out?
A: The most common problem is that dogs may not do the job due to a fact that the dog is unable to move at least 50 yards or 100 yards. This is solved by a number of problems using simple trigonometric calculations or by finding out if a second dog can move the same length and it is easy to find out that the second dog has not moved from the same distance. 3:40–4:55–5:00:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is Token sliding?
A: Token slide is a very fast, long, and easily obtainable (often 10–40–25-minute). Token slide slides are relatively portable, and allow the use of a simple table (40–50 kHz).
A: What is the time period of token slide slides being shown in FIGU 12–39–46?
A: Token slide is a very fast, full-time, short-lived, and short-lived,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is Token jumping?
A: Token Jump describes jumping which is a jumping move, which means that token jumping is only allowed for movement on a given field. More on Token Jump at http://github.com/miller2hq/token-jump-pigging-over-lucky-step-0/3/ (Zanovsky)
A: Token Jump describes jumping and how tokens can be fixed and how to move on a space-space-free


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is the main topic of the video?
A: This is an answer to an important question. For example, for instance, this is a question on a computer screen that contains 4 bytes of memory. Similarly, for instance, in the video, this question has been presented to the audience, and thus has been a problem of solving the problem. Thus the problem is explained in the following sentence.
A: This question is related to the problem of solving the problem of problem with


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: Who is the speaker, and from which university does he come from?
A: This speaker is a student paper and can be used to estimate the figure size, and is a speaker. It will be helpful if the question arises. A: What is a student paper? A: It is a paper, and will be available for discussion. A: How does an audience size measure the figure size, and how can students describe it? A: Students can describe the figure size, and


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: If unsure, answer with 'I don't know'. What is the main topic of the video?
A: There is a discussion on the topic above where people can answer about whether the concept: [5](1) => [2](1) => [3] => [4] => [5] => [6] => [7] => [8] => [9] => [10] => [10] => [11] => [12] => [13


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: does this video mention Batman's true Identity?
A: Yes it. Yes, it was that Batman's identity, but not the two, was different.
A: Yes. This was a small-scale level design exercise, which was done in the presence of blacking out every sequence. (1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (13) (14


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: Which puzzle did this video mention?
A: The answer is: No–No–No–All–One–One–one–one–one–one–one–one–one– one–one–one–one–one–one–1–3–2–4–3—4–5–4–5–5–6/4:1–4:1–4:1–5:1–5:1–5:1–
Q: Which character is stronger according to the video, bombardino crocodile or tralalero tralala
A: No, there is strong resistance between the two characters and a fixed set of polygons (4–5:50–7:30–5:50:00–5:00 – 4:60:00:00– 4:00:00–4:00:00– 4:50:00:00– 4:50:00–4:


# Generating the .srt file (intended to be loaded as subtitles, though that did not work)

In [None]:

import whisper
import torch

# Format timestamps
def format_timestamp(seconds):
    """Render an offset in seconds as an SRT timestamp (HH:MM:SS,mmm).

    The value is rounded to whole milliseconds before it is split into
    fields, so an offset such as 59.9996 becomes 00:01:00,000 instead of the
    invalid 00:00:60,000.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    secs, millis = divmod(remainder_ms, 1_000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


# Load the model (tiny, base, small, medium, or large)
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on machines without CUDA.
model = whisper.load_model("base", device="cuda" if torch.cuda.is_available() else "cpu")

# Transcribe the audio file
result = model.transcribe("audio.wav", verbose=True)

# Save as SRT: each entry is a 1-based index line, a "start --> end" line,
# the text, and a blank separator line.
with open("transcription.srt", "w", encoding="utf-8") as srt_file:
    for index, segment in enumerate(result["segments"], start=1):
        start = format_timestamp(segment["start"])
        end = format_timestamp(segment["end"])
        text = segment["text"].strip()

        srt_file.write(f"{index}\n")
        srt_file.write(f"{start} --> {end}\n")
        srt_file.write(f"{text}\n\n")


100%|████████████████████████████████████████| 139M/139M [06:30<00:00, 372kiB/s]


Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:08.160]  So, hello everyone. Welcome to the PC Seminar. Today we have this Professor Amir Mohad from
[00:08.160 --> 00:13.040]  American University of Beirut and he'll be talking on the GERT and Parametri's complex
[00:13.040 --> 00:18.080]  city of token sliding and token jumping. Thank you for joining us, Professor. Over to you now.
[00:19.440 --> 00:23.760]  Thank you, President. Thank you for having me. It's a real pleasure to be here.
[00:23.760 --> 00:32.640]  So, all right, let's jump right into it. So, since I did not really know the audience too well,
[00:32.640 --> 00:39.920]  I made the assumption that many of you maybe have not seen this area of combinatorial
[00:39.920 --> 00:46.400]  reconfiguration problems. So, I decided what I'm going to do is I'm going to give a gentle introduction
[00:46.400 --> 00:51.920]  to the area just to sho

In [27]:
from faster_whisper import WhisperModel

# Format timestamps
def format_timestamp(seconds):
    """Render an offset in seconds as an SRT timestamp (HH:MM:SS,mmm).

    The value is rounded to whole milliseconds before it is split into
    fields, so an offset such as 59.9996 becomes 00:01:00,000 instead of the
    invalid 00:00:60,000.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    secs, millis = divmod(remainder_ms, 1_000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


# Load the model (options: 'tiny', 'base', 'small', 'medium', 'large-v2')
model = WhisperModel("base", device="cpu", compute_type="int8_float32")

# Transcribe the audio file
# NOTE(review): per the faster-whisper README, `segments` is a lazy generator,
# so the transcription work happens while the loop below iterates - confirm
# for the installed version.
segments, info = model.transcribe("audio.wav", beam_size=5, language="en")

# Save as SRT: each entry is a 1-based index line, a "start --> end" line,
# the text, and a blank separator line.
with open("transcription2.srt", "w", encoding="utf-8") as srt_file:
    for index, segment in enumerate(segments, start=1):
        start = format_timestamp(segment.start)
        end = format_timestamp(segment.end)
        text = segment.text.strip()

        srt_file.write(f"{index}\n")
        srt_file.write(f"{start} --> {end}\n")
        srt_file.write(f"{text}\n\n")
