In [2]:
# Mount Google Drive at /content/drive; DB_PATH and AUDIO_PATH used later in
# this notebook point to files under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install chromadb sentence-transformers openai-whisper faiss-cpu


Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Using cached faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [2]:
import sqlite3
import zipfile
import io
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import chromadb
import whisper
import faiss
import sqlite3


In [3]:
# File paths (both live on the mounted Google Drive)
DB_PATH = "/content/drive/MyDrive/Colab Notebooks/Major/eng_subtitles_database.db"  # SQLite subtitle database passed to load_subtitles
AUDIO_PATH = "/content/drive/MyDrive/Colab Notebooks/Major/query_audio.mp3"  # audio query passed to transcribe_audio

In [4]:
def load_subtitles(db_path, sample_fraction=0.3):
    """Load zipped subtitle files from a SQLite database and sample them.

    Reads ``num``, ``name`` and ``content`` from the ``zipfiles`` table. Each
    ``content`` blob is opened as a ZIP archive and every member of the
    archive is decoded as latin-1 text.

    Args:
        db_path: Path of the SQLite database file.
        sample_fraction: Fraction of the extracted subtitle files to keep.

    Returns:
        A DataFrame with the columns ``id``, ``name`` and ``content``. Rows
        are a random sample (fixed ``random_state=42``) of the extracted
        files. When nothing could be extracted the frame is empty but still
        has those three columns.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute("SELECT num, name, content FROM zipfiles").fetchall()
    finally:
        # Release the database handle even if the query itself fails.
        conn.close()

    subtitles = []
    for num, name, content in rows:
        try:
            # Decompress ZIP content
            with zipfile.ZipFile(io.BytesIO(content), 'r') as z:
                for filename in z.namelist():
                    with z.open(filename) as f:
                        text = f.read().decode('latin-1')  # Decode properly
                        subtitles.append({'id': num, 'name': name, 'content': text})
        except Exception as e:
            # Best effort: report the unreadable archive and move on.
            print(f"Error extracting {name}: {e}")
            continue

    # Fix the column set so an empty result still exposes 'content' & co.
    df = pd.DataFrame(subtitles, columns=['id', 'name', 'content'])
    if df.empty:
        return df
    return df.sample(frac=sample_fraction, random_state=42)  # Reduce dataset to the requested fraction


In [5]:
# Preprocessing subtitles
def clean_subtitles(text):
    """Reduce raw subtitle text to a single line of plain words.

    Steps, in order: drop ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timestamps, drop
    markup tags, drop every character that is not a letter, digit or
    whitespace, drop lines consisting only of digits (cue numbers), and
    collapse all whitespace runs to single spaces.

    Args:
        text: Raw subtitle file content.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    text = re.sub(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', '', text)  # Remove timestamps
    # Remove markup BEFORE stripping punctuation; otherwise "<i>Hi</i>" would
    # lose only its brackets and leave the tag letters glued to the dialogue.
    text = re.sub(r'</?[a-zA-Z][^<>]*>', '', text)  # HTML-style tags such as <i>, </font>
    text = re.sub(r'\{\\[^{}]*\}', '', text)  # brace override tags such as {\an8}
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    # Remove subtitle line numbers
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [6]:
# Document Chunking
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the final word, so no trailing chunk is
    emitted that is entirely contained in the chunk before it.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (empty when ``text`` contains no words).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either crash range() or yield nothing.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This chunk already covers the tail; a further one would only
            # repeat words from the overlap region.
            break
    return chunks

In [7]:
# Convert to TF-IDF
def tfidf_vectorization(docs):
    """Fit a default ``TfidfVectorizer`` on ``docs``.

    Args:
        docs: Iterable of document strings.

    Returns:
        A ``(vectorizer, vectors)`` tuple: the fitted vectorizer and the
        result of its ``fit_transform`` call on ``docs``.
    """
    tfidf = TfidfVectorizer()
    doc_term_matrix = tfidf.fit_transform(docs)
    return tfidf, doc_term_matrix

In [8]:
# Convert to SentenceTransformer Embeddings
def embed_sentences(sentences, model_name='all-MiniLM-L6-v2'):
    """Encode sentences with a SentenceTransformer model.

    Args:
        sentences: Sequence of strings to embed.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        A ``(model, embeddings)`` tuple: the loaded model and the output of
        ``model.encode(sentences, convert_to_tensor=True)``.
    """
    encoder = SentenceTransformer(model_name)
    return encoder, encoder.encode(sentences, convert_to_tensor=True)

In [9]:
# Store embeddings in FAISS
def store_embeddings_faiss(embeddings):
    """Build a flat L2 FAISS index over L2-normalised embeddings.

    The vectors are normalised on a private copy, so the array passed in by
    the caller is left untouched (``faiss.normalize_L2`` rewrites its argument
    in place, and an array obtained from a CPU tensor shares the tensor's
    memory).

    Args:
        embeddings: 2-D NumPy array, one embedding per row.

    Returns:
        A ``faiss.IndexFlatL2`` containing the normalised vectors.
    """
    # Private C-contiguous float32 copy for FAISS to normalise and ingest.
    vectors = embeddings.astype('float32', order='C', copy=True)
    index = faiss.IndexFlatL2(vectors.shape[1])
    faiss.normalize_L2(vectors)
    index.add(vectors)
    return index


In [10]:
# Process Audio Query
def transcribe_audio(audio_path):
    """Transcribe an audio file with the Whisper "base" model.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The ``'text'`` entry of the transcription result.
    """
    asr_model = whisper.load_model("base")
    transcription = asr_model.transcribe(audio_path)
    return transcription['text']

In [11]:
# Perform FAISS Search
def search_query_faiss(query, model, index, embeddings, docs, top_k=5):
    """Return the documents closest to a text query in a FAISS index.

    The query is encoded with ``model``, L2-normalised in place and searched
    against ``index``.

    Args:
        query: Query text.
        model: Object whose ``encode(list_of_str)`` returns a 2-D array.
        index: FAISS index to search.
        embeddings: Unused; kept so existing callers keep working.
        docs: Sequence of documents, positionally aligned with the vectors
            stored in ``index``.
        top_k: Maximum number of results to return.

    Returns:
        Up to ``top_k`` entries of ``docs``, in the order returned by the
        index search.
    """
    query_embedding = model.encode([query])
    faiss.normalize_L2(query_embedding)
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads missing hits with label -1 when the index holds fewer than
    # top_k vectors; indexing docs with -1 would wrongly return the last doc.
    return [docs[i] for i in indices[0] if i >= 0]

In [12]:
# Main Execution
if __name__ == "__main__":
    # Printing Output Headers
    def print_section(title, icon):
        """Print a banner with an icon and the title in bold (ANSI escape)."""
        print("\n==============================")
        print(f"{icon} \033[1m{title}\033[0m")
        print("==============================\n")

    # 1. Load the sampled subtitle files and clean their text.
    print_section("Loading Subtitles...", "📌")
    df = load_subtitles(DB_PATH)
    df['clean_content'] = df['content'].apply(clean_subtitles)

    # 2. Split every cleaned subtitle into chunks, one row per chunk.
    print_section("Applying Text Chunking...", "✂️")
    df['chunks'] = df['clean_content'].apply(chunk_text)
    # explode() turns an empty chunk list (subtitle with no words left after
    # cleaning) into NaN; drop those rows so only real text reaches the
    # embedding model and the search results.
    df = df.explode('chunks').dropna(subset=['chunks']).reset_index(drop=True)

    # 3. Embed every chunk.
    print_section("Generating BERT Embeddings...", "🧠")
    model, embeddings = embed_sentences(df['chunks'].tolist())

    # 4. Index the chunk embeddings with FAISS.
    print_section("Storing Embeddings in FAISS...", "💾")
    faiss_index = store_embeddings_faiss(embeddings.cpu().numpy())

    # 5. Transcribe the audio query to text.
    print_section("Processing Audio Query...", "🎙️")
    query_text = transcribe_audio(AUDIO_PATH)



📌 [1mLoading Subtitles...[0m


✂️ [1mApplying Text Chunking...[0m


🧠 [1mGenerating BERT Embeddings...[0m



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



💾 [1mStoring Embeddings in FAISS...[0m


🎙️ [1mProcessing Audio Query...[0m



In [13]:
 # Step 6: Perform Search
# Look up the transcribed query in the FAISS index and list the best chunks.
print_section("Performing Search...", "🔍")
chunk_vectors = embeddings.cpu().numpy()
chunk_texts = df['chunks'].tolist()
results = search_query_faiss(query_text, model, faiss_index, chunk_vectors, chunk_texts)
print("🔎 Search Completed! Top Results:\n")
for rank, hit in enumerate(results[:5], start=1):
    print(f"{rank}. {hit}\n")


🔍 [1mPerforming Search...[0m

🔎 Search Completed! Top Results:

1. our planet will vanish Ive known the future you people wont think of Soon your planet will be punished Ive seen things you people wont believe Soon our planet will vanish Ive known the future you people wont think of Soon your planet will be punished ThreeBody apiOpenSubtitlesorg is deprecated please implement REST API from OpenSubtitlescom

2. ideas and actions shape your future In ten minutes in 20 years it doesnt matter I only mean the determined form That is his greatest predisposition for life Yes I think so too I think were meant to meet certain people As in our case Yes It was kind of like fate yeah Yes exactly It was something like that Because it was supposed it was going to happen Its true There are many things we dont know do you understand We dont know what will happen tomorrow But it turns out It was already known that we would sit here and now when we were 21 years old It was necessary to move on But we