In [None]:
# Mount Google Drive at /content/drive (Colab only); DB_PATH and AUDIO_PATH
# defined later in this notebook point into this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install chromadb sentence-transformers openai-whisper faiss-cpu


In [None]:
import sqlite3
import zipfile
import io
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import chromadb
import whisper
import faiss
import sqlite3


In [None]:
# File paths (both live on the mounted Google Drive)
DB_PATH = "/content/drive/MyDrive/Colab Notebooks/Major/eng_subtitles_database.db"  # SQLite database read by load_subtitles
AUDIO_PATH = "/content/drive/MyDrive/Colab Notebooks/Major/query_audio.mp3"  # audio query passed to transcribe_audio

In [None]:
def load_subtitles(db_path, sample_fraction=0.3):
    """Load subtitle files from the SQLite database and return a random sample.

    Every row of the ``zipfiles`` table (``num``, ``name``, ``content``) is
    read; ``content`` is opened as an in-memory ZIP archive and each file in
    the archive is decoded as latin-1 text.

    Args:
        db_path: Path to the SQLite database file.
        sample_fraction: Fraction of the extracted subtitles to keep
            (sampling uses a fixed ``random_state`` of 42).

    Returns:
        pandas.DataFrame with the columns ``id``, ``name`` and ``content``.
        The columns are present even when nothing could be extracted, so
        callers can index them unconditionally.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute("SELECT num, name, content FROM zipfiles").fetchall()
    finally:
        # Release the connection even when the query fails (e.g. missing table).
        conn.close()

    subtitles = []
    for num, name, content in rows:
        try:
            # Decompress ZIP content
            with zipfile.ZipFile(io.BytesIO(content), 'r') as archive:
                for member in archive.infolist():
                    if member.is_dir():
                        # Directory entries carry no subtitle text.
                        continue
                    text = archive.read(member).decode('latin-1')
                    subtitles.append({'id': num, 'name': name, 'content': text})
        except Exception as e:
            # Best effort: one unreadable archive must not abort the whole load.
            print(f"Error extracting {name}: {e}")
            continue

    df = pd.DataFrame(subtitles, columns=['id', 'name', 'content'])
    if df.empty:
        return df
    return df.sample(frac=sample_fraction, random_state=42)  # Reduce dataset size


In [None]:
# Preprocessing subtitles
def clean_subtitles(text):
    """Reduce raw SRT subtitle text to a single line of plain words.

    Removes cue timestamps, markup tags, punctuation/special characters and
    cue-number lines, then collapses all whitespace to single spaces.

    Args:
        text: Raw subtitle file content.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    # Remove timestamps; accept ',' or '.' as the millisecond separator.
    text = re.sub(r'\d{2}:\d{2}:\d{2}[,.]\d{3} --> \d{2}:\d{2}:\d{2}[,.]\d{3}', '', text)
    # Remove markup BEFORE the special-character pass. Otherwise only the
    # brackets are stripped and the tag names leak into the text
    # ("<i>Hello</i>" -> "iHelloi", "{\an8}Hi" -> "an8Hi").
    text = re.sub(r'</?[a-zA-Z][^<>]*>', '', text)
    text = re.sub(r'\{\\[^{}]*\}', '', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove subtitle line numbers (lines consisting only of digits)
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [None]:
# Document Chunking
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk shares its first ``overlap`` words with the end of the previous one.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        List of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This window already reached the last word; a further window
            # would only repeat the tail of this chunk.
            break
    return chunks

In [None]:
# Convert to TF-IDF
def tfidf_vectorization(docs):
    """Fit a default TfidfVectorizer on ``docs`` and transform them.

    Args:
        docs: Iterable of document strings.

    Returns:
        Tuple ``(vectorizer, vectors)``: the fitted vectorizer and the result
        of ``fit_transform`` on ``docs``.
    """
    tfidf = TfidfVectorizer()
    doc_term_matrix = tfidf.fit_transform(docs)
    return tfidf, doc_term_matrix

In [None]:
# Convert to SentenceTransformer Embeddings
def embed_sentences(sentences, model_name='all-MiniLM-L6-v2'):
    """Encode sentences with a SentenceTransformer model.

    Args:
        sentences: List of strings to embed.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        Tuple ``(model, embeddings)``: the loaded model (so callers can encode
        queries with the same model) and the embeddings, requested as a tensor
        via ``convert_to_tensor=True``.
    """
    encoder = SentenceTransformer(model_name)
    return encoder, encoder.encode(sentences, convert_to_tensor=True)

In [None]:
# Store embeddings in FAISS
def store_embeddings_faiss(embeddings):
    """Build a flat L2 FAISS index over the given embedding matrix.

    Note: ``embeddings`` is passed to ``faiss.normalize_L2`` and the same
    array is then added to the index, i.e. the caller's array is normalized
    in place; no copy is made here.

    Args:
        embeddings: 2-D array whose second dimension is the vector size
            (presumably float32, as FAISS expects -- TODO confirm at callers).

    Returns:
        The populated ``faiss.IndexFlatL2`` index.
    """
    dim = embeddings.shape[1]
    faiss.normalize_L2(embeddings)
    flat_index = faiss.IndexFlatL2(dim)
    flat_index.add(embeddings)
    return flat_index


In [None]:
# Process Audio Query
def transcribe_audio(audio_path):
    """Transcribe an audio file with the Whisper "base" model.

    Args:
        audio_path: Path to the audio file to transcribe.

    Returns:
        The ``'text'`` entry of the Whisper transcription result.
    """
    transcription = whisper.load_model("base").transcribe(audio_path)
    return transcription['text']

In [None]:
# Perform FAISS Search
def search_query_faiss(query, model, index, embeddings, docs, top_k=5):
    """Return the documents whose embeddings are closest to the query.

    Args:
        query: Query text.
        model: Encoder exposing ``encode(list_of_str)``; should be the model
            that produced the indexed embeddings.
        index: FAISS index to search.
        embeddings: Unused; kept so existing callers keep working.
        docs: Sequence of documents, positionally aligned with the vectors
            stored in ``index``.
        top_k: Maximum number of results to return (default 5).

    Returns:
        List of at most ``top_k`` entries of ``docs``, best match first.
        Empty when ``docs`` is empty or ``top_k`` is not positive.
    """
    if top_k <= 0 or len(docs) == 0:
        return []

    query_embedding = model.encode([query])
    # Normalize the query the same way the indexed vectors were normalized.
    faiss.normalize_L2(query_embedding)
    _, indices = index.search(query_embedding, min(top_k, len(docs)))
    # FAISS pads the result with -1 when the index holds fewer vectors than
    # requested; without this filter docs[-1] would silently return the last
    # document as if it were a hit.
    return [docs[i] for i in indices[0] if 0 <= i < len(docs)]

In [None]:
# Main Execution
if __name__ == "__main__":
    # Printing Output Headers
    def print_section(title, icon):
        """Print a bold section title, prefixed by an icon, between separator lines."""
        print("\n==============================")
        print(f"{icon} \033[1m{title}\033[0m")
        print("==============================\n")

    print_section("Loading Subtitles...", "📌")
    df = load_subtitles(DB_PATH)
    if df.empty:
        # Fail early with a clear message instead of a KeyError on df['content'].
        raise RuntimeError(f"No subtitles could be loaded from {DB_PATH}")
    df['clean_content'] = df['content'].apply(clean_subtitles)

    print_section("Applying Text Chunking...", "✂️")
    df['chunks'] = df['clean_content'].apply(chunk_text)
    # explode() turns an empty chunk list (subtitle with no words left after
    # cleaning) into NaN; drop those rows so only real text reaches the encoder.
    df = df.explode('chunks').dropna(subset=['chunks']).reset_index(drop=True)

    print_section("Generating BERT Embeddings...", "🧠")
    model, embeddings = embed_sentences(df['chunks'].tolist())

    print_section("Storing Embeddings in FAISS...", "💾")
    faiss_index = store_embeddings_faiss(embeddings.cpu().numpy())

    print_section("Processing Audio Query...", "🎙️")
    query_text = transcribe_audio(AUDIO_PATH)


In [None]:
# Perform the search and print the top results
print_section("Performing Search...", "🔍")
# Search the FAISS index with the transcribed query; results are chunk texts.
chunk_texts = df['chunks'].tolist()
embedding_matrix = embeddings.cpu().numpy()
results = search_query_faiss(query_text, model, faiss_index, embedding_matrix, chunk_texts)
print("🔎 Search Completed! Top Results:\n")
# Print at most five results, numbered from 1.
for rank, res in enumerate(results[:5], start=1):
    print(f"{rank}. {res}\n")