# Enhancing Search Engine Relevance for Video Subtitles (Cloning Shazam)


---



In [1]:
!pip install chromadb sentence-transformers



In [2]:
import io
import re
import sqlite3
import zipfile

import chromadb
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Mount Google Drive so the SQLite file stored there is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Path of the subtitle database inside the mounted Drive.
db_path = "/content/drive/MyDrive/Colab Notebooks/eng_subtitles_database.db"
# Open the database; the connection is closed by the cell that loads the data.
conn = sqlite3.connect(db_path)
# NOTE(review): no cell visible in this notebook uses `cursor` - confirm whether it is still needed.
cursor = conn.cursor()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load subtitle data
# Pull the id, file name and raw (binary) content of every row of `zipfiles` into a DataFrame.
query = "SELECT num, name, content FROM zipfiles"
try:
    data = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even when the read fails, so the file is not left open.
    conn.close()

In [5]:
# Decode binary content
def decode_subtitle(blob):
    """Turn one raw `content` blob into subtitle text.

    Parameters
    ----------
    blob : bytes
        Raw value of the `content` column.

    Returns
    -------
    str
        If `blob` is a zip archive, the latin-1 decoded text of its members
        (joined with newlines, empty string for an empty archive). Otherwise
        the blob itself decoded as latin-1.
    """
    buffer = io.BytesIO(blob)
    # The rows come from a table named `zipfiles`; decoding a zip archive's raw bytes
    # would produce compressed garbage, so unpack it first when it really is one.
    if zipfile.is_zipfile(buffer):
        with zipfile.ZipFile(buffer) as archive:
            return '\n'.join(
                archive.read(member).decode('latin-1') for member in archive.namelist()
            )
    # Not an archive: decode the bytes directly.
    return blob.decode('latin-1')


data['content'] = data['content'].apply(decode_subtitle)


In [6]:
# Preprocessing: Remove cue numbers, timestamps and markup tags
# A cue header is an optional line holding only the cue number, followed by a
# "HH:MM:SS,mmm --> HH:MM:SS,mmm" timing line (plus anything trailing on that line).
_CUE_HEADER_RE = re.compile(
    r'^(?:[ \t]*\d+[ \t]*\r?\n)?'
    r'[ \t]*\d{1,2}:\d{2}:\d{2}[,.]\d{1,3}[ \t]*-->[ \t]*\d{1,2}:\d{2}:\d{2}[,.]\d{1,3}[^\r\n]*',
    re.MULTILINE,
)
# Inline formatting such as <i>...</i> or <font color="...">.
_TAG_RE = re.compile(r'</?[A-Za-z][^>]*>')
# A UTF-8 byte-order mark as it appears after latin-1 decoding.
_LATIN1_BOM = '\xef\xbb\xbf'


def clean_text(text):
    """Reduce raw subtitle text to its spoken words on a single line.

    Parameters
    ----------
    text : str
        Decoded subtitle file content.

    Returns
    -------
    str
        The dialogue with cue numbers, timing lines and markup tags removed
        and all whitespace (including newlines) collapsed to single spaces.
    """
    if text.startswith(_LATIN1_BOM):
        text = text[len(_LATIN1_BOM):]
    # Remove whole cue headers; filtering only the '-->' token would leave both
    # timestamps and the cue number behind in the text.
    text = _CUE_HEADER_RE.sub(' ', text)
    text = _TAG_RE.sub('', text)
    # split()/join collapses newlines and repeated blanks; the '-->' filter still
    # drops arrows from malformed timing lines the pattern above did not match.
    return ' '.join(word for word in text.split() if '-->' not in word)
# Run the cleaner over every decoded subtitle and keep the result next to the raw text.
data['cleaned_content'] = data['content'].map(clean_text)

In [7]:
# Sample 30% of data for resource efficiency
SAMPLE_FRACTION = 0.3  # share of subtitle files kept for indexing
SAMPLE_SEED = 42       # fixed seed so the same subset is drawn on every fresh run
# NOTE(review): this overwrites `data`, so re-running the cell without reloading
# samples the already-reduced frame again.
data = data.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)


In [8]:
# TF-IDF Vectorization for Keyword Search
# Vectorizer configured to drop scikit-learn's built-in English stop-word list.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from the cleaned subtitles and build the document-term matrix.
# Rows follow the row order of `data`; search_subtitles relies on that when it maps
# ranked positions back to rows with `data.iloc`.
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_content'])

In [9]:
# BERT Model for Semantic Search
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every document in one batched call instead of one model invocation per row;
# the result is a 2-D array with one embedding row per document, in `data` order.
document_embeddings = bert_model.encode(
    data['cleaned_content'].tolist(),
    convert_to_numpy=True,
)
# Store one 1-D vector per row, matching the per-row layout the column had before.
# NOTE(review): no cell visible in this notebook reads `data['embeddings']` (indexing
# re-encodes the chunks) - confirm whether whole-document embeddings are needed.
data['embeddings'] = list(document_embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
# Document Chunking to prevent information loss
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping windows of whitespace-separated words.

    Parameters
    ----------
    text : str
        Text to split.
    chunk_size : int, default 500
        Maximum number of words per chunk; must be positive.
    overlap : int, default 50
        Number of words shared by consecutive chunks; must satisfy
        ``0 <= overlap < chunk_size``.

    Returns
    -------
    list of str
        Chunks in document order; empty list when `text` contains no words.

    Raises
    ------
    ValueError
        If `chunk_size` or `overlap` is outside the ranges above.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would give a zero step; a larger overlap a negative one.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This window already reaches the last word; a further window would
            # contain only words that are in this chunk.
            break
    return chunks

# Split each cleaned subtitle into its list of overlapping word windows.
data['chunks'] = data['cleaned_content'].map(chunk_text)

In [None]:
# Store Embeddings in ChromaDB
client = chromadb.PersistentClient(path="/content/chromadb_store")
# The store is persistent, so the collection may already exist from an earlier run;
# get_or_create lets the cell be re-executed.
db = client.get_or_create_collection("subtitle_search")

for _, row in data.iterrows():
    chunks = row['chunks']
    if not chunks:
        continue  # nothing to index for an empty subtitle
    # Encode all chunks of this file in one batch.
    chunk_embeddings = bert_model.encode(chunks, convert_to_numpy=True).tolist()
    db.upsert(
        # IDs must identify a single chunk, so combine the file's `num` with the
        # chunk position instead of using `num` alone for every chunk.
        ids=[f"{row['num']}_{position}" for position in range(len(chunks))],
        embeddings=chunk_embeddings,
        documents=chunks,
        # Chunk text is also kept in the metadata under "content", next to the file name.
        metadatas=[{"name": row['name'], "content": chunk} for chunk in chunks],
    )





[1;30;43mStreaming output truncated to the last 5000 lines.[0m


KeyboardInterrupt: 

In [None]:
# Function for Search Query
def search_subtitles(query, method='semantic', n_results=5):
    """Return the best-matching subtitle entries for a free-text query.

    Parameters
    ----------
    query : str
        Search text.
    method : str, default 'semantic'
        'semantic' queries the ChromaDB collection `db` with the embedding of
        `query`; any other value ranks documents by TF-IDF cosine similarity.
    n_results : int, default 5
        Maximum number of hits to return; must be at least 1.

    Returns
    -------
    list of dict
        Semantic search: the metadata dict stored with each matching chunk.
        Keyword search: one dict per document with 'name' and 'cleaned_content'.
        Both are ordered best match first.

    Raises
    ------
    ValueError
        If `n_results` is smaller than 1.
    """
    if n_results < 1:
        raise ValueError("n_results must be at least 1")

    if method == 'semantic':
        # Only the semantic path needs the embedding, so compute it here.
        query_embedding = bert_model.encode(query, convert_to_numpy=True)
        results = db.query(query_embeddings=[query_embedding.tolist()], n_results=n_results)
        # The response holds one list of hits per query embedding; a single query was
        # sent, so its metadata dicts are at position 0 of "metadatas".
        return list(results["metadatas"][0])

    query_vec = tfidf_vectorizer.transform([query])
    # cosine_similarity yields a (1, n_documents) array; take its only row.
    scores = cosine_similarity(query_vec, tfidf_matrix)[0]
    # Ascending argsort reversed puts the highest-scoring positions first.
    top_indices = scores.argsort()[::-1][:n_results]
    return data.iloc[top_indices][['name', 'cleaned_content']].to_dict(orient='records')

In [None]:
# Example Usage
# Run one query through both retrieval paths and print each result list.
query = "A detective investigating a crime"

semantic_hits = search_subtitles(query, method='semantic')
print("Semantic Search Results:", semantic_hits)

keyword_hits = search_subtitles(query, method='keyword')
print("Keyword Search Results:", keyword_hits)

In [8]:
# Ad-hoc check: performs the same encode + collection query that search_subtitles runs
# on its semantic path, leaving the raw response in `results` for inspection.
# NOTE(review): depends on `query`, `bert_model` and `db` from earlier cells, and this
# cell's execution count (8) is out of sequence - confirm it still runs after a kernel restart.
query_embedding = bert_model.encode(query, convert_to_numpy=True)
results = db.query(query_embeddings=[query_embedding.tolist()], n_results=5)


Available tables: [('zipfiles',), ('subtitles',), ('sqlite_sequence',)]
