# Indexing

#### Imports

In [1]:
import re
import json
import subprocess

from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

from langchain.text_splitter import RecursiveCharacterTextSplitter

from youtube_transcript_api import YouTubeTranscriptApi

#### Components

In [2]:
# Settings a reader may want to tune: embedding model name and Chroma persistence directory
EMBEDDING_MODEL_NAME = 'text-embedding-3-large'
CHROMA_PERSIST_DIR = '../chroma_db'

# Embedding model handed to the vector store as its embedding function
embeddings = OpenAIEmbeddings(model = EMBEDDING_MODEL_NAME)

# Chroma vector store configured with the persistence directory above
vector_store = Chroma(persist_directory = CHROMA_PERSIST_DIR, embedding_function = embeddings)

#### Fetch YouTube Transcripts

In [3]:
# YouTube video URLs to index (short-link form, one per line; order determines document ids)

video_urls = [
    'https://youtu.be/0rHUDWjR5gg',
    'https://youtu.be/L-Wtlev6suc',
    'https://youtu.be/01QWC-rZcfE',
    'https://youtu.be/AQ5vty8f9Xc',
    'https://youtu.be/PRgua7xceDA',
    'https://youtu.be/mYhy7eaazIk',
    'https://youtu.be/TRAbZxQHlVw',
    'https://youtu.be/KlWpFLfLFBI',
    'https://youtu.be/TKM0P3XlMNA',
    'https://youtu.be/b22HKFMIfWo',
    'https://youtu.be/w-9gDALvMF4',
    'https://youtu.be/mCzchPx3yF8',
    'https://youtu.be/P3GkZe3nRQ0',
    'https://youtu.be/ZFUgy3crCYY',
    'https://youtu.be/I-88YWx71gE',
    'https://youtu.be/Xwn8fQSW7-8',
    'https://youtu.be/HaFaf7vbgpE',
    'https://youtu.be/E8GNde5nCSg',
    'https://youtu.be/1hIwD17Crko',
    'https://youtu.be/auxpcdQimCs',
    'https://youtu.be/yB9HHyPpKds',
    'https://youtu.be/ZJscxTyI__s',
    'https://youtu.be/TuDfZ2Md5x8',
    'https://youtu.be/jjy-eqWM38g',
    'https://youtu.be/CWMh61yutjU',
    'https://youtu.be/ld75W1dz-h0',
    'https://youtu.be/7ATtD8x7vV0',
    'https://youtu.be/4zKVx29_A1w',
    'https://youtu.be/jfvMtCHv1q4',
    'https://youtu.be/Mj06h8BeeOA',
    'https://youtu.be/PWx9DurgPn8',
    'https://youtu.be/RrMvUL8HFlM',
    'https://youtu.be/qZWPBKULkdQ',
    'https://youtu.be/pIFiCLhJmig',
    'https://youtu.be/an4rgJ3O21A',
    'https://youtu.be/W8UI7F43_Yk',
    'https://youtu.be/tj_QPnO8vpQ',
    'https://youtu.be/I82ADyJC7wE',
    'https://youtu.be/_O2sg-PGhEg',
    'https://youtu.be/Z2zA9nPFN5A',
    'https://youtu.be/9W3RsaWuCuE',
    'https://youtu.be/9B7Ix2VQEGo',
    'https://youtu.be/gzLM6ltw3l0',
    'https://youtu.be/IGCVTSQw7WU',
    'https://youtu.be/jDF-N3A60DE',
    'https://youtu.be/mgdq6DOTU3M',
    'https://youtu.be/0ytyMKa8aps',
]

In [4]:
# Extract Youtube Video IDs

def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Recognised forms:
      * short links:  ``https://youtu.be/<id>``
      * watch pages:  ``https://www.youtube.com/watch?v=<id>``
      * path style:   ``https://www.youtube.com/(embed|shorts|live)/<id>``

    Args:
        url: The YouTube URL as a string.

    Returns:
        The video ID, without any query string, fragment or trailing slash.

    Raises:
        ValueError: If no video ID can be found in ``url``.
    """
    # Dots are escaped so that only the literal host names match; the ID stops
    # at the first query/fragment/path separator.
    patterns = (
        r"youtu\.be/([^?&#/]+)",
        r"youtube\.com/watch\?(?:[^#]*&)?v=([^?&#]+)",
        r"youtube\.com/(?:embed|shorts|live)/([^?&#/]+)",
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match[1]
    # Fail with a descriptive error instead of subscripting None.
    raise ValueError(f"Could not extract a YouTube video ID from URL: {url!r}")

# One video ID per entry of video_urls, in the same order
video_ids = list(map(extract_video_id, video_urls))

In [5]:
# Extract Youtube Video Titles

def extract_video_title(url):
    """Return the part of a YouTube video's title that precedes the first colon.

    The video metadata is obtained by running ``yt-dlp --print-json
    --skip-download <url>`` and reading the ``title`` field of the JSON it
    prints on stdout.

    Args:
        url: URL of the video, passed to yt-dlp unchanged.

    Returns:
        The title text up to (not including) the first ``:``. If the title
        contains no colon, or begins with one, the full title is returned.

    Raises:
        RuntimeError: If yt-dlp exits with a non-zero status or prints nothing.
        ValueError: If the metadata has no (non-empty) ``title`` field.
    """
    command = ['yt-dlp', '--print-json', '--skip-download', url]
    result = subprocess.run(command, capture_output = True, text = True)

    # Surface yt-dlp's own error text rather than failing later while parsing empty stdout.
    if result.returncode != 0 or not result.stdout.strip():
        raise RuntimeError(
            f"yt-dlp failed for {url!r} (exit code {result.returncode}): {result.stderr.strip()}"
        )

    full_title = json.loads(result.stdout).get('title')
    if not full_title:
        raise ValueError(f"yt-dlp returned no title for {url!r}")

    # The pattern cannot match when the title starts with ':'; fall back to the full title.
    prefix = re.match(r"^[^:]+", full_title)
    return prefix.group() if prefix else full_title

# One title per entry of video_urls, in the same order (each call runs yt-dlp once)
video_titles = list(map(extract_video_title, video_urls))

In [6]:
# Fetch Youtube Video Transcripts

def get_youtube_transcript(video_id):
    """Return the English transcript of a video as one space-joined string.

    Args:
        video_id: YouTube video ID passed to ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``'text'`` field of every transcript entry, joined with single spaces.
    """
    # NOTE(review): this uses the static `get_transcript` API; newer releases of
    # youtube-transcript-api may expose a different entry point - confirm against
    # the installed version.
    segments = YouTubeTranscriptApi.get_transcript(video_id, languages = ['en'])
    return ' '.join(segment['text'] for segment in segments)

# Fetch every transcript and replace line breaks with single spaces
video_transcripts = [
    ' '.join(get_youtube_transcript(video_id).splitlines())
    for video_id in video_ids
]

#### Splitting Documents

In [7]:
# Corpus Initialization

# One record per video: 1-based id, title, and full transcript text.
# Titles and transcripts are looked up by position, so they must be in video_urls order.
corpus = [
    {'id': position + 1, 'title': video_titles[position], 'text': video_transcripts[position]}
    for position in range(len(video_urls))
]

In [8]:
# Splitting texts into chunks

# Chunking parameters passed to the splitter
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

splitter = RecursiveCharacterTextSplitter(chunk_size = CHUNK_SIZE, chunk_overlap = CHUNK_OVERLAP)

# Flatten every document into chunk records; chunk ids are "<doc id>-<1-based chunk number>"
split_corpus = [
    {'id': f"{doc['id']}-{idx+1}", 'title': doc['title'], 'text': chunk}
    for doc in corpus
    for idx, chunk in enumerate(splitter.split_text(doc['text']))
]

In [None]:
# Sanity check: show the first chunk record (id, title, text)
print(split_corpus[0])

In [None]:
# Sanity check: show the last chunk record (id, title, text)
print(split_corpus[-1])

In [None]:
# Total number of chunk records produced by the splitting step
len(split_corpus)

#### Embed & Store in ChromaDB

In [None]:
# Prepare parallel lists (same order as split_corpus) for insertion
texts = [doc['text'] for doc in split_corpus]
metadatas = [{'title': doc['title']} for doc in split_corpus]
ids = [str(doc['id']) for doc in split_corpus]

# Insert in fixed-size batches instead of one call for the whole corpus, so a large
# corpus does not exceed the vector store's per-call limit and a mid-run failure
# leaves the earlier batches stored.
# NOTE(review): Chroma enforces a maximum batch size per write - confirm the limit
# for the installed version and adjust BATCH_SIZE if needed.
BATCH_SIZE = 1000

for start in range(0, len(texts), BATCH_SIZE):
    end = start + BATCH_SIZE
    # Store in ChromaDB (auto-generate embeddings)
    vector_store.add_texts(
        texts = texts[start:end],
        metadatas = metadatas[start:end],
        ids = ids[start:end]
    )

print(f"✅ Successfully stored {len(split_corpus)} documents in ChromaDB with embeddings!")

#### Confirm Storage

In [None]:
# Retrieve a small sample of stored records through the vector store's public `get`,
# asking only for as many records as are printed below.
SAMPLE_SIZE = 3

stored_data = vector_store.get(
    limit = SAMPLE_SIZE,
    include = ['metadatas', 'documents', 'embeddings']
)

# Extract ids, metadata, texts, embeddings.
# Distinct `stored_*` names are used so this cell does not overwrite `embeddings`
# (the embedding model) or the `ids` / `texts` lists defined in earlier cells.
stored_ids = stored_data.get('ids')
stored_metadatas = stored_data.get('metadatas')
stored_texts = stored_data.get('documents')
stored_embeddings = stored_data.get('embeddings')

# zip stops at the shortest sequence, so a collection with fewer than SAMPLE_SIZE
# records prints what exists instead of raising IndexError.
for record_id, metadata, text, embedding in zip(stored_ids, stored_metadatas, stored_texts, stored_embeddings):
    print(f"ID: {record_id}")
    print(f"Title: {metadata.get('title')}")
    print(f"Text: {text}")
    print(f"Embedding: {embedding}\n")