In [1]:
import pandas as pd
import numpy as np
import sqlite3
import warnings
from warnings import filterwarnings

In [2]:
conn = sqlite3.connect('eng_subtitles_database.db')
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())

[('zipfiles',)]


In [4]:
cursor.execute("PRAGMA table_info('zipfiles')")
cols = cursor.fetchall()
for col in cols:
    print(col[1])

num
name
content


In [5]:
df = pd.read_sql_query("""SELECT * FROM zipfiles""", conn)
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


In [6]:
data= df.sample(n=25000, random_state=42)

In [7]:
import zipfile
import io

count = 0

def decode_method(binary_data):
    global count
    # Decompress the binary data using the zipfile module
    # print(count, end=" ")
    count += 1
    with io.BytesIO(binary_data) as f:
        with zipfile.ZipFile(f, 'r') as zip_file:
            # Assuming there's only one file in the ZIP archive
            subtitle_content = zip_file.read(zip_file.namelist()[0])

    # Now 'subtitle_content' should contain the extracted subtitle content
    return subtitle_content.decode('latin-1')

In [8]:
data['file_content'] = data['content'].apply(decode_method)

data.head()

Unnamed: 0,num,name,content,file_content
17262,9251120,maybe.this.time.(2014).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x89\x9a\x...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."
7294,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,b'PK\x03\x04\x14\x00\x00\x00\x08\x007\x8f\x99V...,"1\r\n00:00:09,275 --> 00:00:11,876\r\n¶ Oh, I ..."
47707,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x8f\x19\x...,"1\r\n00:00:07,140 --> 00:00:14,220\r\n<i>Timin..."
29914,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00[\xaa\x99V...,"1\r\n00:00:06,133 --> 00:00:08,900\r\n[etherea..."
54266,9408707,battlebots.(2015).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xf4<\x9aV...,"ï»¿1\r\n00:00:01,480 --> 00:00:03,570\r\n[Chri..."


In [9]:
import re


In [10]:

def clean(doc): 
  doc = doc.replace("\r\n","")
  doc = re.sub(r"\d{2}:\d{2}:\d{2},\d{3}|.[^a-zA-Z0-9\s.,!?-]", " ", doc)  
  return doc

data['cleaned_text'] = data['file_content'].apply(clean)

print(data['cleaned_text'])


17262     ¿1  -   Watch any video online with Open-SUBT...
7294     1  -   ¶ Oh, I know that i s getting late 2  -...
47707    1  -   < Timing and Subtitles by The Uncontrol...
29914    1  -   [ethereal musi 2  -   api.OpenSubtitles...
54266     ¿1  -   [Chri  Oh, no,not the Minibots!2  -  ...
                               ...                        
13664     ¿1  -   We have ran into another problem2  - ...
20329     ¿1  -   If anyone is to end her life,2  -   i...
21234    1  -   Advertise your product or brand herecon...
64710     ¿1  -   Use the free code JOINNOW at  www.pla...
10597    1  -   [intriguing hip-hop music playin 2  -  ...
Name: cleaned_text, Length: 25000, dtype: object


In [11]:
dataset = data[['name','cleaned_text']]

In [6]:
from sentence_transformers import SentenceTransformer
import pickle

In [7]:
# Initialize the "all-MiniLM-L6-v2" model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [16]:


# Function to create embeddings using "all-MiniLM-L6-v2"
def create_embeddings(texts):
    return model.encode(texts, convert_to_tensor=True)

# Connect to SQLite database
conn = sqlite3.connect('chromadb.db')
c = conn.cursor()

# Create a table to store subtitles and their embeddings
c.execute('''CREATE TABLE IF NOT EXISTS subtitles (name TEXT, content TEXT, embedding BLOB)''')

chunk_size = 1000  
number_of_chunks = len(dataset) // chunk_size + (1 if len(dataset) % chunk_size else 0)

for i in range(number_of_chunks):
    # Calculate the start and end indices of the current chunk
    start_index = i * chunk_size
    end_index = start_index + chunk_size
    
    # Process each chunk
    chunk = dataset.iloc[start_index:end_index]
    embeddings = create_embeddings(chunk['cleaned_text'].tolist())
    
    # Reset the index for the chunk to ensure it starts from 0
    chunk = chunk.reset_index(drop=True)
    
    # Store the name, content, and embedding in the database
    for index, row in chunk.iterrows():
        # Serialize the numpy array (embedding) to a bytes object
        serialized_embedding = pickle.dumps(embeddings[index].numpy())
        c.execute('INSERT INTO subtitles (name, content, embedding) VALUES (?, ?, ?)', 
                  (row['name'], row['cleaned_text'], serialized_embedding))
        conn.commit()

# Close the database connection
conn.close()


In [2]:
from sklearn.metrics.pairwise import cosine_similarity

In [3]:


# Function to retrieve the top N most similar subtitles
def retrieve_similar_subtitles(query, top_n=10):
    # Connect to SQLite database
    conn = sqlite3.connect('chromadb.db')
    c = conn.cursor()
    
    # Create embeddings for the query
    query_embedding = model.encode([query], convert_to_tensor=True).numpy()
    
    # Ensure the query_embedding is 2D
    query_embedding = np.array([query_embedding.flatten()])
    
    # Retrieve all subtitles and their embeddings from the database
    c.execute('SELECT name, content, embedding FROM subtitles')
    results = c.fetchall()
    
    # Calculate cosine similarity between query and each subtitle embedding
    similarities = []
    for name, content, stored_embedding in results:
        # Unpickle the stored embedding and ensure it is 2D
        stored_embedding = np.array([pickle.loads(stored_embedding).flatten()])
        
        # Calculate the cosine similarity
        similarity = cosine_similarity(query_embedding, stored_embedding)
        similarities.append((name, content, similarity[0][0]))
    
    # Sort by similarity score in descending order
    similarities.sort(key=lambda x: x[2], reverse=True)
    
    # Get the top N results
    top_results = similarities[:top_n]
    
    # Close the database connection
    conn.close()
    subtitles_dict = {}
    for name, content, similarity_score in top_results:
        # Check if the series name is already in the dictionary
        if name not in subtitles_dict:
            subtitles_dict[name] = []
        # Append the subtitle content to the list for this series name
        subtitles_dict[name].append(content)
    
    return subtitles_dict




In [11]:
# Example usage
query_text = "Joey loves food"
top_subtitles = retrieve_similar_subtitles(query_text)
for name, contents in top_subtitles.items():
    print(f"Movie/Series Name: {name}")
    for content in contents:
        print(f"Subtitle: {content}")

Movie/Series Name: once.again.s01.e22.episode.1.22.(2020).eng.1cd
Subtitle: 1  -   (Episode 2 2  -   Here comes the soup!3  -   Is this how i ll be from now on?4  -   Will we have to gather aroundto eat breakfast?5  -   Do t be ridiculous.The re part of the family now,6  -   so we should at leasthave breakfast together.7  -   I s Mr. Son s birthday,so I cooked seaweed soup with beef.8  -   I hope you enjoy it. Now, eat up.9  -   Honey, you should eat the most.10  -   Happy birthday, Mr. Song.11  -   Always be healthy, sir.12  -   Happy birthday, Dad.13  -   Happy birthday.14  -   Happy birthday, Grandpa.15  -   Happy birthday, Mr. Song.16  -   Having a bigger familysure has its perks.17  -   Thanks, everyone. Now, eat up.18  -   - Sure. Thank you.- Le s eat.19  -   Will the party take placeat the merchant association lounge?20  -   Yes, and the foodshould be delivered in advance.21  -   Ga Hee, lend me a hand.22  -   Joon Seon, go to the fish marketand buy some raw fish.23  -   Halibut

In [4]:

# Function to retrieve the top N most similar subtitles
def retrieve_similar_subtitles(query, top_n=10):
    # Connect to SQLite database
    conn = sqlite3.connect('chromadb.db')
    c = conn.cursor()
    
    # Initialize the model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Create embeddings for the query
    query_embedding = model.encode([query], convert_to_tensor=True).numpy()
    
    # Ensure the query_embedding is 2D
    query_embedding = np.array([query_embedding.flatten()])
    
    # Retrieve all subtitles and their embeddings from the database
    c.execute('SELECT name, content, embedding FROM subtitles')
    results = c.fetchall()
    
    # Calculate cosine similarity between query and each subtitle embedding
    similarities = []
    for name, content, stored_embedding in results:
        # Unpickle the stored embedding and ensure it is 2D
        stored_embedding = np.array([pickle.loads(stored_embedding).flatten()])
        
        # Calculate the cosine similarity
        similarity = cosine_similarity(query_embedding, stored_embedding)
        similarities.append((name, content, similarity[0][0]))
    
    # Sort by similarity score in descending order
    similarities.sort(key=lambda x: x[2], reverse=True)
    
    # Get the top N results
    top_results = similarities[:top_n]
    
    # Close the database connection
    conn.close()
    
    # Deduplicate the subtitles
    subtitles_dict = {}
    seen_contents = set()
    for name, content, similarity_score in top_results:
        # Check if the subtitle content has already been seen
        if content not in seen_contents:
            # Check if the series name is already in the dictionary
            if name not in subtitles_dict:
                subtitles_dict[name] = []
            # Append the subtitle content to the list for this series name
            subtitles_dict[name].append(content)
            # Mark this content as seen
            seen_contents.add(content)
    
    return subtitles_dict



In [7]:
# Example usage
query_text = "Joey loves food"
top_subtitles = retrieve_similar_subtitles(query_text)
for name, contents in top_subtitles.items():
    print(f"Movie/Series Name: {name}")
    for content in contents:
        print(f"Subtitle: {content}")

Movie/Series Name: once.again.s01.e22.episode.1.22.(2020).eng.1cd
Subtitle: 1  -   (Episode 2 2  -   Here comes the soup!3  -   Is this how i ll be from now on?4  -   Will we have to gather aroundto eat breakfast?5  -   Do t be ridiculous.The re part of the family now,6  -   so we should at leasthave breakfast together.7  -   I s Mr. Son s birthday,so I cooked seaweed soup with beef.8  -   I hope you enjoy it. Now, eat up.9  -   Honey, you should eat the most.10  -   Happy birthday, Mr. Song.11  -   Always be healthy, sir.12  -   Happy birthday, Dad.13  -   Happy birthday.14  -   Happy birthday, Grandpa.15  -   Happy birthday, Mr. Song.16  -   Having a bigger familysure has its perks.17  -   Thanks, everyone. Now, eat up.18  -   - Sure. Thank you.- Le s eat.19  -   Will the party take placeat the merchant association lounge?20  -   Yes, and the foodshould be delivered in advance.21  -   Ga Hee, lend me a hand.22  -   Joon Seon, go to the fish marketand buy some raw fish.23  -   Halibut