In [None]:
!pip install -q nltk pandas chromadb scikit-learn openai-whisper gradio sentence-transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/800.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/800.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import sqlite3
import pandas as pd
import zipfile
import io
import re
import nltk
import torch
import numpy as np
import whisper
import gradio as gr
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import chromadb

# Fetch the NLTK resources used for tokenising and stop-word filtering.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# English stop words kept as a set for fast membership checks.
stop_words = set(stopwords.words('english'))


Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# ----- DATA PROCESSING FUNCTIONS -----

def extract_subtitles(db_path, sample_fraction=0.3):
    """Load a reproducible random sample of subtitle files from SQLite.

    Reads every row of the ``zipfiles`` table, draws a sample with
    ``random_state=42`` and unpacks the ZIP archive stored in the ``content``
    column of each sampled row into text.

    Args:
        db_path: Path of the SQLite database file.
        sample_fraction: Fraction of the rows to keep; the sample size is
            ``int(len(rows) * sample_fraction)``.

    Returns:
        A new DataFrame with the columns ``num``, ``name`` and
        ``file_content``. ``file_content`` is an empty string for rows whose
        archive could not be read.
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query("SELECT * FROM zipfiles", conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()

    def decode_method(binary_data):
        """Return the first archive member decoded as latin-1, or "" on failure."""
        try:
            with zipfile.ZipFile(io.BytesIO(binary_data), 'r') as zip_file:
                subtitle_content = zip_file.read(zip_file.namelist()[0])
            return subtitle_content.decode('latin-1')
        except Exception:
            # Best effort on purpose: an unreadable archive yields empty text
            # instead of aborting the whole extraction.
            return ""

    # Sample first and decode afterwards: the rows drawn depend only on the
    # row count and the seed, so the result is the same while the archives of
    # discarded rows are never decompressed.
    sampled = df.sample(n=int(len(df) * sample_fraction), random_state=42)
    sampled = sampled.assign(file_content=sampled['content'].apply(decode_method))

    # Return an independent copy so callers can add columns without
    # triggering pandas' SettingWithCopyWarning.
    return sampled[['num', 'name', 'file_content']].copy()

def clean_text(text):
    """Normalise raw subtitle text before it is chunked and embedded.

    Lower-cases the text, drops markup tags such as ``<i>``, replaces every
    run of characters that are neither ASCII letters nor whitespace with a
    space, tokenises the result and removes English stop words.

    Args:
        text: Raw subtitle text. Anything that is not a string is treated as
            empty text.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Strip markup tags first so "<i>hello</i>" does not turn into "ihelloi".
    text = re.sub(r'</?[a-z][^<>]*>', ' ', text)
    # Substitute a space rather than nothing, otherwise words separated only
    # by punctuation or digits ("well...done") are fused into one token.
    text = re.sub(r'[^a-z\s]+', ' ', text)
    words = word_tokenize(text)
    return ' '.join(word for word in words if word not in stop_words)

def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of tokens.

    Neighbouring chunks share ``overlap`` tokens so that context spanning a
    chunk boundary is preserved. Chunking stops as soon as a chunk reaches the
    end of the text, so no trailing chunk consisting only of tokens already
    covered by the previous chunk is produced.

    Args:
        text: Text to split; tokenised with ``word_tokenize``.
        chunk_size: Maximum number of tokens per chunk (must be > 0).
        overlap: Number of tokens shared by neighbouring chunks
            (``0 <= overlap < chunk_size``).

    Returns:
        List of chunks, each a space-joined string of tokens; an empty list
        for text without tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not in
            ``[0, chunk_size)`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = word_tokenize(text)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already contains the last token; another window
            # would only repeat the overlap.
            break
    return chunks


In [None]:
# ----- AUDIO PROCESSING FUNCTIONS -----

# Whisper models already loaded in this session, keyed by model name.
_WHISPER_MODEL_CACHE = {}


def _get_whisper_model(model_name):
    """Return the Whisper model ``model_name``, loading it on first use only."""
    if model_name not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[model_name] = whisper.load_model(model_name)
    return _WHISPER_MODEL_CACHE[model_name]


def audio_to_text(audio_path, model_name="base"):
    """Transcribe an audio file with Whisper.

    The model is loaded once per model name and reused by later calls instead
    of being loaded again for every transcription.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        The transcription with surrounding whitespace removed, or "" when
        loading or transcribing fails (the error is printed, not raised).
    """
    try:
        model = _get_whisper_model(model_name)
        result = model.transcribe(audio_path)
        return result["text"].strip()
    except Exception as e:
        print(f"Error in audio transcription: {e}")
        return ""

In [None]:
# ----- VECTORIZATION AND SEARCH FUNCTIONS -----

def compute_tfidf(subtitles_df):
    """Fit a TF-IDF model on the subtitle texts.

    A ``TfidfVectorizer`` configured with the built-in English stop-word list
    is fitted on the ``file_content`` column of ``subtitles_df``.

    Returns:
        Tuple ``(vectorizer, tfidf_matrix)``: the fitted vectorizer and the
        result of ``fit_transform`` on that column.
    """
    tfidf_model = TfidfVectorizer(stop_words='english')
    documents = subtitles_df['file_content']
    term_weights = tfidf_model.fit_transform(documents)
    return tfidf_model, term_weights

def store_embeddings_in_chromadb(subtitles_df):
    """Clean, chunk and embed the subtitles and store them in ChromaDB.

    The ``file_content`` of every row is cleaned with ``clean_text``, split
    with ``chunk_text`` and embedded with the ``all-MiniLM-L6-v2`` sentence
    transformer. Each chunk is stored in the persistent collection
    ``subtitle_embeddings`` (directory ``./chroma_subtitle_db``) under the id
    ``"<num>_<chunk index>"`` with the metadata keys ``name`` and ``content``.

    Args:
        subtitles_df: DataFrame with the columns ``num``, ``name`` and
            ``file_content``. It is not modified.

    Returns:
        The ChromaDB collection holding the embeddings.
    """
    print("Initializing ChromaDB...")
    client = chromadb.PersistentClient(path="./chroma_subtitle_db")
    collection = client.get_or_create_collection(name="subtitle_embeddings")

    print("Loading embedding model...")
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda' if torch.cuda.is_available() else 'cpu')

    print("Cleaning text...")
    # Keep the cleaned text in a local Series instead of adding a column to
    # the caller's DataFrame.
    cleaned_content = subtitles_df['file_content'].apply(clean_text)

    batch_size = 100
    total = len(subtitles_df)
    print(f"Processing {total} subtitles in batches of {batch_size}...")

    for start in range(0, total, batch_size):
        # Clamp the upper bound so the last message reports the real range.
        end = min(start + batch_size, total)
        print(f"Processing batch {start} to {end}...")

        all_chunks, all_ids, all_metadatas = [], [], []

        batch_rows = zip(
            subtitles_df['num'].iloc[start:end],
            subtitles_df['name'].iloc[start:end],
            cleaned_content.iloc[start:end],
        )
        for num, name, content in batch_rows:
            chunks = chunk_text(content)
            all_chunks.extend(chunks)
            all_ids.extend(f"{num}_{i}" for i in range(len(chunks)))
            all_metadatas.extend({"name": name, "content": chunk} for chunk in chunks)

        if not all_chunks:
            # Every subtitle in this batch was empty after cleaning; there is
            # nothing to encode or to send to the collection.
            continue

        print(f"Encoding {len(all_chunks)} chunks...")
        with torch.no_grad():
            embeddings = model.encode(all_chunks, batch_size=32, show_progress_bar=True).tolist()

        print("Adding embeddings to ChromaDB...")
        # upsert instead of add: the collection is persistent, so re-running
        # this function replaces existing ids rather than inserting them again.
        collection.upsert(
            ids=all_ids,
            embeddings=embeddings,
            metadatas=all_metadatas
        )

    print("Embedding storage complete.")
    return collection

# Sentence-embedding models already loaded in this session, keyed by (name, device).
_QUERY_MODEL_CACHE = {}


def _get_query_model(model_name="all-MiniLM-L6-v2"):
    """Return the sentence-embedding model, loading it on first use only."""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    cache_key = (model_name, device)
    if cache_key not in _QUERY_MODEL_CACHE:
        _QUERY_MODEL_CACHE[cache_key] = SentenceTransformer(model_name, device=device)
    return _QUERY_MODEL_CACHE[cache_key]


def search_subtitles(audio_path, collection, n_results=5):
    """Find the subtitle chunks that best match a spoken query.

    The audio is transcribed with ``audio_to_text``, normalised with
    ``clean_text`` (the same preprocessing the indexed chunks went through),
    embedded and used to query ``collection``.

    Args:
        audio_path: Path of the audio file containing the query.
        collection: ChromaDB collection whose metadata carries the keys
            ``name`` and ``content``.
        n_results: Maximum number of matches to return.

    Returns:
        One formatted entry per match (name, snippet of at most 200
        characters, score) joined by blank lines, or "" when the audio could
        not be transcribed or nothing matched. The score is the distance
        reported by the collection, so smaller values are closer matches.
    """
    raw_query = audio_to_text(audio_path)
    if not raw_query:
        # Nothing was transcribed: embedding an empty string would only
        # return arbitrary neighbours.
        return ""

    # Fall back to the raw transcription if cleaning removes every token
    # (for example a query made up of stop words only).
    query_text = clean_text(raw_query) or raw_query

    model = _get_query_model()
    query_embedding = model.encode([query_text]).tolist()
    results = collection.query(query_embeddings=query_embedding, n_results=n_results)

    formatted_results = []
    # The result lists hold one inner list per query embedding.
    for metadatas_list, distances_list in zip(results['metadatas'], results['distances']):
        for metadata, distance in zip(metadatas_list, distances_list):
            content = metadata['content']
            snippet = content[:200] + "..." if len(content) > 200 else content
            formatted_results.append(f"Result {len(formatted_results)+1}: {metadata['name']}\nSnippet: {snippet}\nScore: {distance:.4f}\n")

    return "\n".join(formatted_results[:n_results])

In [None]:
# ----- MAIN EXECUTION FUNCTIONS -----

def initialize_system(db_path):
    """Build the subtitle search index from the database at ``db_path``.

    Runs ``extract_subtitles`` on the database, passes the result to
    ``store_embeddings_in_chromadb`` and prints a progress line before each
    step and after completion.

    Returns:
        Whatever ``store_embeddings_in_chromadb`` returns (the collection).
    """
    print("Extracting subtitles from database...")
    sampled_subtitles = extract_subtitles(db_path)

    print("Storing embeddings in ChromaDB...")
    subtitle_index = store_embeddings_in_chromadb(sampled_subtitles)

    print("System initialized successfully!")
    return subtitle_index

def gradio_search_interface(audio_file, collection_obj):
    """Handle an audio search request coming from the Gradio interface.

    Args:
        audio_file: The uploaded audio, either as a path to a file on disk
            (``str`` / ``os.PathLike``) or as the raw file content
            (bytes-like). ``None`` means nothing was uploaded.
        collection_obj: Collection passed on to ``search_subtitles``.

    Returns:
        The formatted search results, or a short message when no audio was
        supplied or nothing matched.

    Raises:
        TypeError: If ``audio_file`` is neither a path nor bytes-like.
    """
    import os
    import tempfile

    if audio_file is None:
        return "Please upload an audio file."

    if isinstance(audio_file, (str, os.PathLike)):
        # Already a file on disk: search it directly, no copy needed.
        results = search_subtitles(os.fspath(audio_file), collection_obj)
    elif isinstance(audio_file, (bytes, bytearray, memoryview)):
        # A unique temporary file per request keeps concurrent searches from
        # overwriting each other's audio; it is removed once the search ends.
        fd, temp_audio_path = tempfile.mkstemp(suffix=".mp3")
        try:
            with os.fdopen(fd, "wb") as f:
                f.write(audio_file)
            results = search_subtitles(temp_audio_path, collection_obj)
        finally:
            os.remove(temp_audio_path)
    else:
        raise TypeError(
            f"audio_file must be a file path or bytes, got {type(audio_file).__name__}"
        )

    if not results:
        return "No matching subtitles found."
    return results

In [None]:
# ----- MAIN EXECUTION -----

# SQLite subtitle database, read from the Google Drive mounted under /content/drive.
db_path = "/content/drive/My Drive/Datasets and Models/datasets/eng_subtitles_database.db"

# Extract the subtitles, embed them and keep the returned ChromaDB collection.
collection = initialize_system(db_path)


Extracting subtitles from database...
Storing embeddings in ChromaDB...
Initializing ChromaDB...
Loading embedding model...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Cleaning text...
Processing 24749 subtitles in batches of 100...
Processing batch 0 to 100...
Encoding 675 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 100 to 200...
Encoding 615 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 200 to 300...
Encoding 608 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 300 to 400...
Encoding 645 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 400 to 500...
Encoding 677 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 500 to 600...
Encoding 726 chunks...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 600 to 700...
Encoding 639 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 700 to 800...
Encoding 657 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 800 to 900...
Encoding 663 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 900 to 1000...
Encoding 633 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1000 to 1100...
Encoding 647 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1100 to 1200...
Encoding 632 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1200 to 1300...
Encoding 658 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1300 to 1400...
Encoding 650 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1400 to 1500...
Encoding 668 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1500 to 1600...
Encoding 663 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1600 to 1700...
Encoding 587 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1700 to 1800...
Encoding 624 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1800 to 1900...
Encoding 635 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 1900 to 2000...
Encoding 598 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2000 to 2100...
Encoding 610 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2100 to 2200...
Encoding 602 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2200 to 2300...
Encoding 691 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2300 to 2400...
Encoding 629 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2400 to 2500...
Encoding 665 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2500 to 2600...
Encoding 613 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2600 to 2700...
Encoding 648 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2700 to 2800...
Encoding 649 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2800 to 2900...
Encoding 656 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 2900 to 3000...
Encoding 627 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3000 to 3100...
Encoding 630 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3100 to 3200...
Encoding 611 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3200 to 3300...
Encoding 640 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3300 to 3400...
Encoding 601 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3400 to 3500...
Encoding 655 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3500 to 3600...
Encoding 627 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3600 to 3700...
Encoding 615 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3700 to 3800...
Encoding 599 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3800 to 3900...
Encoding 589 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 3900 to 4000...
Encoding 654 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4000 to 4100...
Encoding 639 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4100 to 4200...
Encoding 635 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4200 to 4300...
Encoding 647 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4300 to 4400...
Encoding 649 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4400 to 4500...
Encoding 680 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4500 to 4600...
Encoding 666 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4600 to 4700...
Encoding 610 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4700 to 4800...
Encoding 676 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4800 to 4900...
Encoding 627 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 4900 to 5000...
Encoding 588 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5000 to 5100...
Encoding 671 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5100 to 5200...
Encoding 625 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5200 to 5300...
Encoding 664 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5300 to 5400...
Encoding 623 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5400 to 5500...
Encoding 645 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5500 to 5600...
Encoding 614 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5600 to 5700...
Encoding 696 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5700 to 5800...
Encoding 645 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5800 to 5900...
Encoding 644 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 5900 to 6000...
Encoding 645 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6000 to 6100...
Encoding 644 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6100 to 6200...
Encoding 623 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6200 to 6300...
Encoding 662 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6300 to 6400...
Encoding 595 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6400 to 6500...
Encoding 708 chunks...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6500 to 6600...
Encoding 595 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6600 to 6700...
Encoding 618 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6700 to 6800...
Encoding 592 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6800 to 6900...
Encoding 604 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 6900 to 7000...
Encoding 655 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7000 to 7100...
Encoding 628 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7100 to 7200...
Encoding 589 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7200 to 7300...
Encoding 578 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7300 to 7400...
Encoding 594 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7400 to 7500...
Encoding 619 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7500 to 7600...
Encoding 590 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7600 to 7700...
Encoding 637 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7700 to 7800...
Encoding 656 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7800 to 7900...
Encoding 594 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 7900 to 8000...
Encoding 676 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8000 to 8100...
Encoding 786 chunks...


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8100 to 8200...
Encoding 618 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8200 to 8300...
Encoding 652 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8300 to 8400...
Encoding 617 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8400 to 8500...
Encoding 630 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8500 to 8600...
Encoding 669 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8600 to 8700...
Encoding 661 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8700 to 8800...
Encoding 647 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8800 to 8900...
Encoding 664 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 8900 to 9000...
Encoding 632 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 9000 to 9100...
Encoding 603 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 9100 to 9200...
Encoding 627 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 9200 to 9300...
Encoding 658 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 9300 to 9400...
Encoding 682 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 9400 to 9500...
Encoding 638 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 9500 to 9600...
Encoding 632 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 9600 to 9700...
Encoding 626 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 9700 to 9800...
Encoding 668 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 9800 to 9900...
Encoding 683 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 9900 to 10000...
Encoding 642 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 10000 to 10100...
Encoding 680 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 10100 to 10200...
Encoding 612 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 10200 to 10300...
Encoding 559 chunks...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 10300 to 10400...
Encoding 638 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 10400 to 10500...
Encoding 670 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 10500 to 10600...
Encoding 655 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 10600 to 10700...
Encoding 633 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 10700 to 10800...
Encoding 601 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 10800 to 10900...
Encoding 642 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 10900 to 11000...
Encoding 636 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 11000 to 11100...
Encoding 615 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 11100 to 11200...
Encoding 595 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 11200 to 11300...
Encoding 693 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 11300 to 11400...
Encoding 705 chunks...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 11400 to 11500...
Encoding 673 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 11500 to 11600...
Encoding 628 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 11600 to 11700...
Encoding 639 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 11700 to 11800...
Encoding 634 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 11800 to 11900...
Encoding 691 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 11900 to 12000...
Encoding 610 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 12000 to 12100...
Encoding 644 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 12100 to 12200...
Encoding 667 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 12200 to 12300...
Encoding 649 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 12300 to 12400...
Encoding 633 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 12400 to 12500...
Encoding 629 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 12500 to 12600...
Encoding 684 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 12600 to 12700...
Encoding 634 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 12700 to 12800...
Encoding 601 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 12800 to 12900...
Encoding 616 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 12900 to 13000...
Encoding 635 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 13000 to 13100...
Encoding 643 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 13100 to 13200...
Encoding 621 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 13200 to 13300...
Encoding 660 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 13300 to 13400...
Encoding 608 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 13400 to 13500...
Encoding 656 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 13500 to 13600...
Encoding 611 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 13600 to 13700...
Encoding 588 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 13700 to 13800...
Encoding 570 chunks...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 13800 to 13900...
Encoding 678 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 13900 to 14000...
Encoding 636 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 14000 to 14100...
Encoding 652 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 14100 to 14200...
Encoding 551 chunks...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 14200 to 14300...
Encoding 614 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 14300 to 14400...
Encoding 606 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 14400 to 14500...
Encoding 629 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 14500 to 14600...
Encoding 625 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 14600 to 14700...
Encoding 647 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 14700 to 14800...
Encoding 650 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 14800 to 14900...
Encoding 715 chunks...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 14900 to 15000...
Encoding 698 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 15000 to 15100...
Encoding 635 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 15100 to 15200...
Encoding 651 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 15200 to 15300...
Encoding 694 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 15300 to 15400...
Encoding 636 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 15400 to 15500...
Encoding 638 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 15500 to 15600...
Encoding 641 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 15600 to 15700...
Encoding 683 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 15700 to 15800...
Encoding 699 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 15800 to 15900...
Encoding 596 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 15900 to 16000...
Encoding 618 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 16000 to 16100...
Encoding 586 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 16100 to 16200...
Encoding 672 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 16200 to 16300...
Encoding 621 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 16300 to 16400...
Encoding 657 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 16400 to 16500...
Encoding 667 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 16500 to 16600...
Encoding 613 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 16600 to 16700...
Encoding 616 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 16700 to 16800...
Encoding 628 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 16800 to 16900...
Encoding 622 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 16900 to 17000...
Encoding 663 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 17000 to 17100...
Encoding 627 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 17100 to 17200...
Encoding 553 chunks...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 17200 to 17300...
Encoding 636 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 17300 to 17400...
Encoding 650 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 17400 to 17500...
Encoding 617 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 17500 to 17600...
Encoding 578 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 17600 to 17700...
Encoding 667 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 17700 to 17800...
Encoding 682 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 17800 to 17900...
Encoding 593 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 17900 to 18000...
Encoding 573 chunks...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 18000 to 18100...
Encoding 691 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 18100 to 18200...
Encoding 683 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 18200 to 18300...
Encoding 610 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 18300 to 18400...
Encoding 573 chunks...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 18400 to 18500...
Encoding 625 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 18500 to 18600...
Encoding 674 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 18600 to 18700...
Encoding 579 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 18700 to 18800...
Encoding 617 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 18800 to 18900...
Encoding 644 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 18900 to 19000...
Encoding 669 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 19000 to 19100...
Encoding 622 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 19100 to 19200...
Encoding 691 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 19200 to 19300...
Encoding 620 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 19300 to 19400...
Encoding 640 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 19400 to 19500...
Encoding 665 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 19500 to 19600...
Encoding 572 chunks...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 19600 to 19700...
Encoding 630 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 19700 to 19800...
Encoding 666 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 19800 to 19900...
Encoding 623 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 19900 to 20000...
Encoding 582 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 20000 to 20100...
Encoding 626 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 20100 to 20200...
Encoding 613 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 20200 to 20300...
Encoding 586 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 20300 to 20400...
Encoding 623 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 20400 to 20500...
Encoding 632 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 20500 to 20600...
Encoding 685 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 20600 to 20700...
Encoding 648 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 20700 to 20800...
Encoding 687 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 20800 to 20900...
Encoding 609 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 20900 to 21000...
Encoding 653 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 21000 to 21100...
Encoding 625 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 21100 to 21200...
Encoding 612 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 21200 to 21300...
Encoding 614 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 21300 to 21400...
Encoding 557 chunks...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 21400 to 21500...
Encoding 637 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 21500 to 21600...
Encoding 623 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 21600 to 21700...
Encoding 661 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 21700 to 21800...
Encoding 680 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 21800 to 21900...
Encoding 627 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 21900 to 22000...
Encoding 607 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 22000 to 22100...
Encoding 598 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 22100 to 22200...
Encoding 662 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 22200 to 22300...
Encoding 652 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 22300 to 22400...
Encoding 613 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 22400 to 22500...
Encoding 674 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 22500 to 22600...
Encoding 568 chunks...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 22600 to 22700...
Encoding 667 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 22700 to 22800...
Encoding 609 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 22800 to 22900...
Encoding 642 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 22900 to 23000...
Encoding 634 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 23000 to 23100...
Encoding 688 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 23100 to 23200...
Encoding 680 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 23200 to 23300...
Encoding 595 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 23300 to 23400...
Encoding 693 chunks...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 23400 to 23500...
Encoding 658 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 23500 to 23600...
Encoding 657 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 23600 to 23700...
Encoding 606 chunks...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 23700 to 23800...
Encoding 659 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 23800 to 23900...
Encoding 612 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 23900 to 24000...
Encoding 651 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 24000 to 24100...
Encoding 628 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 24100 to 24200...
Encoding 627 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 24200 to 24300...
Encoding 618 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 24300 to 24400...
Encoding 570 chunks...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 24400 to 24500...
Encoding 733 chunks...


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 24500 to 24600...
Encoding 614 chunks...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 24600 to 24700...
Encoding 663 chunks...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Processing batch 24700 to 24800...
Encoding 322 chunks...


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Adding embeddings to ChromaDB...
Embedding storage complete.
System initialized successfully!
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3800f2778807feb4b7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
def launch_gradio_interface(collection):
    """Build the "Subtitle Shazam" Gradio Blocks UI and launch it.

    The page has two Markdown headers and a single row with two columns:
    an audio upload plus a search button on the left, and a 20-line
    results textbox on the right. Clicking the button passes the audio
    component's value to ``gradio_search_interface`` together with
    ``collection`` and writes the returned value into the textbox.

    Args:
        collection: Forwarded unchanged as the second argument of
            ``gradio_search_interface``. Presumably the ChromaDB
            collection populated earlier in the notebook -- TODO confirm.

    Returns:
        None. The app is launched with ``share=True``.
    """
    with gr.Blocks(title="Subtitle Shazam") as app:
        # Page heading and a one-line usage hint.
        gr.Markdown("# 🎬 Subtitle Shazam")
        gr.Markdown("Upload an audio clip to find matching movie or TV show subtitles")

        with gr.Row():
            # Left column: the audio clip input and the button that triggers the search.
            with gr.Column():
                clip_upload = gr.Audio(label="Upload Audio", type="filepath")
                run_search = gr.Button("Search for Matching Subtitles", variant="primary")

            # Right column: where the search handler's return value is shown.
            with gr.Column():
                matches_box = gr.Textbox(lines=20, label="Search Results")

        # The lambda closes over `collection`, so the click handler only
        # needs the single value supplied by the audio component.
        run_search.click(
            inputs=clip_upload,
            outputs=matches_box,
            fn=lambda x: gradio_search_interface(x, collection),
        )

    app.launch(share=True)

# Build and launch the Gradio UI. `collection` is not defined in this cell,
# so it must already exist in the kernel from an earlier cell.
# NOTE(review): the output of the preceding cell already shows a Gradio share
# link being created -- confirm that launching a second app here is intended.
launch_gradio_interface(collection)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://68396d35ea4409ce5c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
