# Read and Transform

In [1]:
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi

In [2]:
# Read one YouTube video ID per line. Blank lines are skipped so that an empty
# ID is never handed to the transcript API.
with open("videos.txt", "r") as file:
    videos = [line.strip() for line in file if line.strip()]

# Fetch the transcript of every video, in the same order as `videos`.
# Each transcript is a list of dicts; the cells below read their
# "text", "start" and "duration" keys.
transcripts = [YouTubeTranscriptApi.get_transcript(video_id) for video_id in videos]

In [3]:
from datetime import timedelta

import srt


# Accumulated caption character count at which merge_subtitles closes the
# current chunk and starts a new one.
TRIGGER_LENGTH = 750  # 30-60 seconds

def merge(subtitles, idx):
    """Collapse consecutive transcript entries into a single `srt.Subtitle`.

    Args:
        subtitles: non-empty list of transcript dicts with "text", "start"
            and "duration" keys ("start" and "duration" are in seconds).
        idx: index to assign to the resulting subtitle.

    Returns:
        An `srt.Subtitle` whose content is the joined text of all entries,
        which starts where the first entry starts and ends where the
        latest-ending entry ends.
    """
    new_content = combine_content(subtitles)

    # preserve start as timedelta
    new_start = seconds_float_to_timedelta(subtitles[0]["start"])

    # End at the latest point covered by any entry. Adding the summed
    # durations to the start would be wrong whenever captions overlap
    # (end too late) or have gaps between them (end too early).
    end_seconds = max(sub["start"] + sub["duration"] for sub in subtitles)
    new_end = seconds_float_to_timedelta(end_seconds)

    return srt.Subtitle(index=idx, start=new_start, end=new_end, content=new_content)


def combine_content(subtitles):
    """Join the stripped "text" of every entry with single spaces and append a blank line."""
    joined = " ".join(entry["text"].strip() for entry in subtitles)
    return f"{joined}\n\n"


def get_charcount(subtitle):
    """Return the number of characters in the entry's "text" field, whitespace included."""
    text = subtitle["text"]
    return len(text)


def seconds_float_to_timedelta(x_seconds):
    """Convert a number of seconds (int or float) into a `timedelta`."""
    as_timedelta = timedelta(days=0, seconds=x_seconds)
    return as_timedelta


def merge_subtitles(subtitles):
    """Group consecutive transcript entries into larger merged subtitles.

    Entries are accumulated in order; as soon as the accumulated character
    count reaches TRIGGER_LENGTH the group is merged into one subtitle and a
    new group is started. Any entries left over at the end form a final,
    shorter subtitle. Merged subtitles are indexed from 1.
    """
    merged = []
    pending = []
    pending_chars = 0

    for entry in subtitles:
        pending.append(entry)
        pending_chars += get_charcount(entry)
        if pending_chars < TRIGGER_LENGTH:
            continue

        # Threshold reached: close the group and start over.
        merged.append(merge(pending, len(merged) + 1))
        pending = []
        pending_chars = 0

    # Flush whatever did not reach the threshold.
    if pending:
        merged.append(merge(pending, len(merged) + 1))

    return merged


# One list of merged subtitles per transcript, in the same order as `transcripts`.
subtitle_collections = list(map(merge_subtitles, transcripts))

# SRT-formatted strings of the same data: handy for quick checks and easy to write to files.
subtitle_strings = list(map(srt.compose, subtitle_collections))

# Templates for a link to a video and for the start-time query parameter appended to it.
base_url_format = "https://www.youtube.com/watch?v={id}"
query_params_format = "&t={start}s"


def create_split_video_df(subtitles, base_url):
    """Build a frame with one row per subtitle and columns "text" and "source".

    "text" is the subtitle content with surrounding whitespace stripped.
    "source" is `base_url` followed by the start-time query parameter, using
    the subtitle's start in whole seconds.
    """
    records = [
        {
            "text": sub.content.strip(),
            "source": base_url
            + query_params_format.format(start=timestamp_from_timedelta(sub.start)),
        }
        for sub in subtitles
    ]
    return pd.DataFrame.from_records(records)


def timestamp_from_timedelta(td):
    """Return the duration in whole seconds, dropping any fractional part."""
    total = td.total_seconds()
    return int(total)


# One frame per video: each merged-subtitle list is paired with the ID of the
# video it came from, which is turned into that video's base URL.
split_video_dfs = [
    create_split_video_df(merged, base_url_format.format(id=vid))
    for vid, merged in zip(videos, subtitle_collections)
]

# Stack the per-video frames into a single frame with a fresh 0..n-1 index.
split_video_df = pd.concat(split_video_dfs, ignore_index=True)

In [4]:
# Serialise the frame as a JSON object keyed by row label, then save it to disk.
documents_json = split_video_df.to_json(orient="index")

with open("documents.json", mode="w") as json_file:
    json_file.write(documents_json)

# Create embeddings and load to vector db

In [1]:
import json

# Read the serialised documents back from disk.
with open("documents.json") as f:
    s = f.read()

# The file maps row labels to {"text", "source"} records; only the records are kept.
json_data = json.loads(s).values()

# The text of every record, in file order: this is what gets embedded.
sentences = [record["text"] for record in json_data]

In [4]:
from sentence_transformers import SentenceTransformer, util
# Load the sentence-embedding model by name.
# NOTE(review): the Qdrant collection below is created with size=384, so the
# vectors this model produces must have that width; re-check if the model changes.
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Embed every document, not just the first few: create_points pairs
# vectors[i] with the i-th document, so both must have the same length.
embs = model.encode(sentences, convert_to_tensor=True)

In [46]:
from qdrant_client import QdrantClient

# Client for a Qdrant server expected to be reachable on localhost, port 6333.
client = QdrantClient("localhost", port=6333)

In [127]:
from qdrant_client.http.models import Distance, VectorParams

# Name of the Qdrant collection used by the create, upsert and search cells.
collection = "MIT6.824"

# Create the collection for 384-dimensional vectors compared by cosine distance.
# NOTE(review): presumably this fails if the collection already exists, which
# would make the cell non-re-runnable; confirm against the qdrant-client docs.
client.create_collection(
    collection_name=collection,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

True

In [128]:
from qdrant_client.http.models import PointStruct
import uuid

def to_point(vector, payload):
    """Wrap one embedding and its payload in a `PointStruct` with a random UUID id.

    Args:
        vector: the embedding, either a plain sequence of floats or an
            array-like exposing `tolist()` (e.g. a NumPy array or tensor row).
        payload: the data stored alongside the vector.

    Returns:
        A `PointStruct` ready to be upserted.
    """
    # Hand the client plain Python floats rather than an array/tensor object;
    # inputs without `tolist()` (already lists) pass through untouched.
    if hasattr(vector, "tolist"):
        vector = vector.tolist()
    return PointStruct(id=str(uuid.uuid4()), vector=vector, payload=payload)

def create_points(vectors, data):
    """Return one point per item of `data`, pairing item i with `vectors[i]`."""
    return [to_point(vectors[position], item) for position, item in enumerate(data)]

# json_data is a dict view, so it is materialised as a list before pairing with the embeddings.
points = create_points(vectors=embs, data=list(json_data))

In [129]:

# Insert (or overwrite) all points in the collection.
# NOTE(review): wait=True presumably makes the call return only once the
# write has been applied; confirm against the qdrant-client docs.
operation_info = client.upsert(
    collection_name=collection,
    wait=True,
    points=points,
)

# Bare expression so the notebook displays the result of the upsert.
operation_info

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

# Querying data

In [135]:
# Embed the query with the same model that produced the stored vectors.
query = "mapreduce model"
query_embedding = model.encode(query, convert_to_tensor=True)

# The client is given a plain list of floats rather than the tensor object.
search_result = client.search(
    collection_name=collection, query_vector=query_embedding.tolist(), limit=3
)

# Show the stored payload (text and source link) of each hit.
for r in search_result:
    print(r.payload)

{'source': 'https://www.youtube.com/watch?v=cQP8WApzIQQ&t=3003s', 'text': "and the idea is that the programmer just write the application designer consumer of this distributed computation I'm just be able to write a simple map function and a simple reduce function that don't know anything about distribution and the MapReduce framework would take care of everything else so an abstract view of how what MapReduce is up to is it starts by assuming that there's some input and the input is split up into some a whole bunch of different files or chunks in some way so we're imagining that no yeah you know input file one and put file two etc you know these inputs are maybe you know web pages crawled from the web or more likely sort of big files that contain many web each of which contains many web files crawl from the web all right and the way Map Reduce"}
{'source': 'https://www.youtube.com/watch?v=cQP8WApzIQQ&t=3064s', 'text': "starts is that you're to find a map function and the MapReduce fra

In [133]:
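# Alternative kept for reference: brute-force cosine search over the in-memory
# embeddings, bypassing Qdrant. NOTE: uncommenting it requires `import torch`,
# and the "Top 5" label below does not match k=3.
# query = "mapreduce model"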
# query_embedding = model.encode(query, convert_to_tensor=True)

# cos_scores = util.cos_sim(query_embedding, embs).squeeze()
# top_results = torch.topk(cos_scores, k=3)
# print("Query:", query)
# print("\nTop 5 most similar sentences in corpus:")

# for score, idx in zip(top_results[0], top_results[1]):
#     print(list(json_data)[idx], "(Score: {:.4f})".format(score))

Query: mapreduce model

Top 5 most similar sentences in corpus:
{'text': "and the idea is that the programmer just write the application designer consumer of this distributed computation I'm just be able to write a simple map function and a simple reduce function that don't know anything about distribution and the MapReduce framework would take care of everything else so an abstract view of how what MapReduce is up to is it starts by assuming that there's some input and the input is split up into some a whole bunch of different files or chunks in some way so we're imagining that no yeah you know input file one and put file two etc you know these inputs are maybe you know web pages crawled from the web or more likely sort of big files that contain many web each of which contains many web files crawl from the web all right and the way Map Reduce", 'source': 'https://www.youtube.com/watch?v=cQP8WApzIQQ&t=3003s'} (Score: 0.6638)
{'text': "starts is that you're to find a map function and th