**Enhancing Search Engine Relevance for Video Subtitles**

In [None]:
# Mount Google Drive so files under /content/drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sqlite3
import zipfile
import io

# Connect to the subtitles database
db_path = r'/content/drive/MyDrive/Copy of eng_subtitles_database.db'
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Fetch a single subtitle record
    cursor.execute("SELECT num, name, content FROM zipfiles LIMIT 1")
    row = cursor.fetchone()
finally:
    # Release the database file handle even if the query fails.
    conn.close()

# fetchone() returns None when the table is empty; fail with a clear message
# instead of an opaque "cannot unpack NoneType" error.
if row is None:
    raise ValueError("No subtitle records found in table 'zipfiles'")
num, name, content = row

# Extract .srt from zipped binary; the context manager closes the archive.
with zipfile.ZipFile(io.BytesIO(content)) as zip_file:
    members = zip_file.namelist()
    if not members:
        raise ValueError(f"Subtitle archive {num} contains no files")
    srt_file = members[0]  # Assume 1 file per zip
    # Undecodable bytes are dropped rather than raising.
    text = zip_file.read(srt_file).decode('utf-8', errors='ignore')

print(f"Subtitle ID: {num}\nFilename: {name}\nText Preview:\n{text[:1000]}")


Subtitle ID: 9180533
Filename: the.message.(1976).eng.1cd
Text Preview:
1
00:00:06,000 --> 00:00:12,074
Watch any video online with Open-SUBTITLES
Free Browser extension: osdb.link/ext

2
00:02:26,198 --> 00:02:29,953
In the name of God, the most gracious, the most Merciful.

3
00:02:31,072 --> 00:02:33,370
From Muhammad, the Messenger of God

4
00:02:33,550 --> 00:02:36,047
to Heraclius, the emperor of Byzantium.

5
00:02:36,407 --> 00:02:39,464
greetings to him who is the
follower of righteous guidance.

6
00:02:39,783 --> 00:02:42,591
I bid you to hear the divine call.

7
00:02:43,160 --> 00:02:45,817
I am the messenger of God to the people;

8
00:02:46,337 --> 00:02:48,784
accept Islam for your salvation.

9
00:02:52,231 --> 00:02:54,709
He speaks of a new prophet in Arabia.

10
00:02:55,068 --> 00:02:57,825
Was it like this when John, the Baptist
came to king Herod

11
00:02:58,145 --> 00:03:01,272
out of the desert, crying about salvat

Data preprocessing

In [None]:
import re

def clean_subtitle_text(raw_text):
    """Strip SRT scaffolding from subtitle text and normalize it for embedding.

    Removes, in order: a leading byte-order mark, cue timestamp ranges,
    lines consisting only of digits (cue indices), inline formatting tags
    such as ``<i>`` or ``<font ...>``, and URLs. Whitespace is then collapsed
    to single spaces and the result is lower-cased.

    Args:
        raw_text: Subtitle (or query) text as a ``str``.

    Returns:
        The cleaned, lower-cased, single-line string ('' for empty input).
    """
    # A BOM in front of the first cue index would stop the digit-only-line
    # pattern below from matching it.
    text = raw_text.lstrip('\ufeff')
    # Timestamp ranges; tolerate '.' as the fraction separator and irregular
    # spacing around the arrow in addition to the canonical SRT form.
    text = re.sub(
        r'\d{1,2}:\d{2}:\d{2}[,.]\d{1,3}\s*-->\s*\d{1,2}:\d{2}:\d{2}[,.]\d{1,3}',
        '',
        text,
    )
    # Cue index lines (digits only).
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)
    # Inline markup. Requiring a letter after '<' leaves text such as
    # "a <3 b" untouched.
    text = re.sub(r'</?[a-zA-Z][^>]*>', '', text)
    # URLs, including the osdb.link advert domain.
    text = re.sub(r'http\S+|www\S+|osdb\.link\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

# Clean the extracted subtitle text and preview its first 1000 characters.
cleaned_text = clean_subtitle_text(text)
print(cleaned_text[:1000])


watch any video online with open-subtitles free browser extension: in the name of god, the most gracious, the most merciful. from muhammad, the messenger of god to heraclius, the emperor of byzantium. greetings to him who is the follower of righteous guidance. i bid you to hear the divine call. i am the messenger of god to the people; accept islam for your salvation. he speaks of a new prophet in arabia. was it like this when john, the baptist came to king herod out of the desert, crying about salvation? to muqawqis, patriarch of alexandria. kisra, emperor of persia. muhammad calls you with the call of god. accept islam for your salvation... embrace islam. you come out of the desert, smelling of camel and goat. to tell persia where he should kneel? muhammad, messenger of god. who gave him this authority? god sent muhammad as a mercy to mankind. the scholars and historians of islam - the university of al-azhar in cairo the high islamic congress of the shiat in lebanon the makers of this

In [None]:
def chunk_text(text, chunk_size=50, overlap=10, min_words=10):
    """Split text into overlapping windows of whitespace-separated words.

    Windows start every ``chunk_size - overlap`` words and hold up to
    ``chunk_size`` words each. Windows with fewer than ``min_words`` words
    (typically the tail of the text) are dropped, so a text shorter than
    ``min_words`` yields an empty list.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.
        min_words: Minimum number of words a chunk needs to be kept.

    Returns:
        A list of chunk strings, each with its words joined by single spaces.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A non-positive step would either make range() raise an obscure error
    # (step == 0) or silently produce no chunks at all (step < 0).
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        window = words[start:start + chunk_size]
        if len(window) >= min_words:
            chunks.append(' '.join(window))
    return chunks

# Split the cleaned subtitle text into word chunks (default sizes) and show the first one.
chunks = chunk_text(cleaned_text)
print(f"Chunks: {len(chunks)}\n\nSample:\n{chunks[0]}")

Chunks: 227

Sample:
watch any video online with open-subtitles free browser extension: in the name of god, the most gracious, the most merciful. from muhammad, the messenger of god to heraclius, the emperor of byzantium. greetings to him who is the follower of righteous guidance. i bid you to hear the divine call.


In [None]:
from sentence_transformers import SentenceTransformer

# Load the 'all-MiniLM-L6-v2' sentence-embedding model and embed every chunk,
# showing a progress bar while batches are encoded.
model = SentenceTransformer('all-MiniLM-L6-v2')
chunk_embeddings = model.encode(chunks, show_progress_bar=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
pip install chromadb



In [None]:
import chromadb

# Use the new default in-memory client (sufficient for local/dev)
client = chromadb.Client()

# Create collection.
# Chroma's default distance is squared L2, for which `1 - distance` is not a
# similarity and can be negative. Request cosine distance so that
# `1 - distance` is the cosine similarity between query and chunk.
# The distance space is fixed when a collection is first created: if a
# "subtitle_chunks" collection already exists in this client with another
# space, delete it first for this setting to take effect.
collection = client.get_or_create_collection(
    "subtitle_chunks",
    metadata={"hnsw:space": "cosine"},
)

# Add chunks and embeddings
ids = [f"chunk_{i}" for i in range(len(chunks))]

collection.add(
    documents=chunks,
    ids=ids,
    # Pass plain Python lists (as app5.py does) rather than a NumPy array.
    embeddings=chunk_embeddings.tolist(),
)

print("✅ Chunks added to in-memory ChromaDB")



✅ Chunks added to in-memory ChromaDB


In [None]:
!pip install whisper
!pip install openai-whisper

import whisper

# Load Whisper model
whisper_model = whisper.load_model("base")

# Transcribe audio file
result = whisper_model.transcribe(r"/content/Rags & Retrieval Systm.mp3")
transcribed_text = result['text']
print("🎧 Transcribed:\n", transcribed_text)





🎧 Transcribed:
  You know Marilyn by now. Nothing should surprise me around here, but someone seems to have left a step ladder standing out there in the middle of the street. One of those really tall ones like 12 feet tall maybe a backslash is just standing.


In [None]:
# Normalize the transcription with the same cleaner applied to the subtitle text.
query_cleaned = clean_subtitle_text(transcribed_text)
print("🧹 Cleaned Query:\n", query_cleaned)
# encode() receives a one-element list, so [0] selects the single query embedding.
query_embedding = model.encode([query_cleaned])[0]


🧹 Cleaned Query:
 you know marilyn by now. nothing should surprise me around here, but someone seems to have left a step ladder standing out there in the middle of the street. one of those really tall ones like 12 feet tall maybe a backslash is just standing.


In [None]:
# Retrieve the stored chunks closest to the query embedding.
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5  # Top 5 closest chunks
)

# Display results
print("🔍 Top Matching Subtitle Chunks:\n")
# Results are nested per query; index [0] selects the only query submitted.
# NOTE(review): `1 - dist` is a similarity score only when the collection's
# distance metric is cosine distance; with other metrics (e.g. squared L2)
# the value can be negative.
for doc, dist in zip(results['documents'][0], results['distances'][0]):
    print(f"[Score: {1 - dist:.4f}] {doc}\n")

🔍 Top Matching Subtitle Chunks:

[Score: -0.9673] Heraklius was a Byzantine Emperor who received the Prophet’s letter.

[Score: -0.9804] The letter to Heraklius spoke about the oneness of God and guidance.

[Score: -1.0103] The Prophet Muhammad sent letters to rulers inviting them to Islam.

[Score: -1.0110] Studying old letters helps us understand history better.

[Score: -1.0310] Islamic history contains many diplomatic correspondences.



In [None]:
! pip install streamlit



In [None]:
%%writefile app5.py
"""EchoScribe: transcribe an uploaded audio clip and list the closest subtitle lines."""
import html
import os
import tempfile

import streamlit as st
import whisper
import chromadb
from sentence_transformers import SentenceTransformer

# ---- Page Config ----
st.set_page_config(page_title="EchoScribe 🎙", page_icon="🎧", layout="wide")

# ---- Custom Title Styling ----
st.markdown(
    """
    <div style="text-align: center;">
        <h1 style="color: #FF4B4B; font-size: 48px; font-weight: bold; font-style: italic;">
            EchoScribe 🎙
        </h1>
    </div>
    """,
    unsafe_allow_html=True
)

# ---- Sidebar for Upload ----
st.sidebar.header("⚙️ Settings")
uploaded_file = st.sidebar.file_uploader("📂 Upload an Audio File", type=["wav", "mp3", "m4a"])

# ---- Load Whisper Model ----
@st.cache_resource
def load_whisper():
    """Load the Whisper "base" model once per server process."""
    return whisper.load_model("base")

whisper_model = load_whisper()

# ---- Load Sentence Transformer Model ----
@st.cache_resource
def load_embedding_model():
    """Load the sentence-embedding model once per server process."""
    return SentenceTransformer("all-MiniLM-L6-v2")

embedding_model = load_embedding_model()

# ---- Initialize ChromaDB ----
chroma_client = chromadb.PersistentClient(path="chroma_db")
# Use cosine distance so the "Score" shown below (1 - distance) is the cosine
# similarity. Under Chroma's default squared-L2 space, 1 - distance is not a
# similarity and can be negative.
# The space is fixed when a collection is first created: a "subtitles"
# collection already persisted under chroma_db/ keeps its original metric
# until it is deleted and rebuilt.
collection = chroma_client.get_or_create_collection(
    name="subtitles",
    metadata={"hnsw:space": "cosine"},
)

# ---- Sample Subtitle Data ----
subtitles = [
    "The Prophet Muhammad sent letters to rulers inviting them to Islam.",
    "Heraklius was a Byzantine Emperor who received the Prophet’s letter.",
    "The letter to Heraklius spoke about the oneness of God and guidance.",
    "Islamic history contains many diplomatic correspondences.",
    "Studying old letters helps us understand history better."
]

# Store subtitles in ChromaDB with embeddings (Avoid re-adding)
if collection.count() == 0:
    # Encode all subtitles in one batch instead of one model call per line.
    collection.add(
        ids=[str(i) for i in range(len(subtitles))],
        embeddings=embedding_model.encode(subtitles).tolist(),
        documents=subtitles,
    )

# ---- Process Uploaded File ----
if uploaded_file:
    # Use the upload's own MIME type; the uploader also accepts wav and m4a.
    st.audio(uploaded_file, format=uploaded_file.type or 'audio/mpeg')

    # Write the upload to a unique temporary file that keeps the original
    # extension. A fixed shared filename would let concurrent sessions
    # overwrite each other's audio and would never be cleaned up.
    suffix = os.path.splitext(uploaded_file.name)[1] or ".m4a"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_audio:
        tmp_audio.write(uploaded_file.getbuffer())
        audio_path = tmp_audio.name

    try:
        with st.spinner("⏳ Transcribing... Please wait!"):
            result = whisper_model.transcribe(audio_path)
            transcribed_text = result['text']
    finally:
        os.remove(audio_path)

    # ---- Display Transcribed Text in an Expander ----
    with st.expander("🎧 Transcribed Text", expanded=True):
        # The transcription comes from user-supplied audio; escape it before
        # embedding it in raw HTML.
        st.markdown(
            f"<p style='background:#f1f3f4; padding:10px; border-radius:10px;'>{html.escape(transcribed_text)}</p>",
            unsafe_allow_html=True,
        )

    # ---- Query Matching Subtitles ----
    query_embedding = embedding_model.encode([transcribed_text])[0].tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=5)

    # ---- Display Matched Subtitles in a Styled Table ----
    st.markdown("### 🎯 Best Matched Subtitle Segments")
    if results['documents'][0]:
        match_data = [
            {"Score": f"{1 - dist:.4f}", "Subtitle": doc}
            for doc, dist in zip(results['documents'][0], results['distances'][0])
        ]
        st.table(match_data)
    else:
        st.warning("No matching subtitles found.")


Writing app5.py


In [None]:
# Install the localtunnel CLI used to expose the app outside the Colab runtime.
!npm install localtunnel
# Start the Streamlit app in the background; its output is redirected to /content/logs.txt.
!streamlit run /content/app5.py &>/content/logs.txt &
# Open a tunnel to port 8501 and print this runtime's public IPv4 address.
# NOTE(review): 8501 is presumably Streamlit's default port, and the printed IP
# is presumably what the tunnel landing page asks for as a password; confirm.
!npx localtunnel --port 8501 & curl ipv4.icanhazip.com

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K
added 22 packages in 2s
[1G[0K⠴[1G[0K
[1G[0K⠦[1G[0K3 packages are looking for funding
[1G[0K⠦[1G[0K  run `npm fund` for details
[1G[0K⠦[1G[0K35.199.19.155
[1G[0K⠙[1G[0K⠹[1G[0Kyour url is: https://kind-mammals-taste.loca.lt
