In [7]:
import pandas as pd
import numpy as np
import sqlite3
import warnings
from warnings import filterwarnings

In [8]:
conn = sqlite3.connect('eng_subtitles_database.db')
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())

[('zipfiles',)]


In [9]:
cursor.execute("PRAGMA table_info('zipfiles')")
cols = cursor.fetchall()
for col in cols:
    print(col[1])

num
name
content


In [10]:
df = pd.read_sql_query("""SELECT * FROM zipfiles""", conn)
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


In [11]:
data= df.sample(n=25000, random_state=42)

In [12]:
import zipfile
import io

count = 0

def decode_method(binary_data):
    global count
    # Decompress the binary data using the zipfile module
    # print(count, end=" ")
    count += 1
    with io.BytesIO(binary_data) as f:
        with zipfile.ZipFile(f, 'r') as zip_file:
            # Assuming there's only one file in the ZIP archive
            subtitle_content = zip_file.read(zip_file.namelist()[0])

    # Now 'subtitle_content' should contain the extracted subtitle content
    return subtitle_content.decode('latin-1')

In [13]:
data['file_content'] = data['content'].apply(decode_method)

data.head()

Unnamed: 0,num,name,content,file_content
17262,9251120,maybe.this.time.(2014).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x89\x9a\x...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."
7294,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,b'PK\x03\x04\x14\x00\x00\x00\x08\x007\x8f\x99V...,"1\r\n00:00:09,275 --> 00:00:11,876\r\n¶ Oh, I ..."
47707,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x8f\x19\x...,"1\r\n00:00:07,140 --> 00:00:14,220\r\n<i>Timin..."
29914,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00[\xaa\x99V...,"1\r\n00:00:06,133 --> 00:00:08,900\r\n[etherea..."
54266,9408707,battlebots.(2015).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xf4<\x9aV...,"ï»¿1\r\n00:00:01,480 --> 00:00:03,570\r\n[Chri..."


In [14]:
import re


In [15]:

def clean(doc): 
  doc = doc.replace("\r\n","")
  doc = re.sub(r"\d{2}:\d{2}:\d{2},\d{3}|.[^a-zA-Z0-9\s.,!?-]", " ", doc)  
  return doc

data['cleaned_text'] = data['file_content'].apply(clean)

print(data['cleaned_text'])


17262     ¿1  -   Watch any video online with Open-SUBT...
7294     1  -   ¶ Oh, I know that i s getting late 2  -...
47707    1  -   < Timing and Subtitles by The Uncontrol...
29914    1  -   [ethereal musi 2  -   api.OpenSubtitles...
54266     ¿1  -   [Chri  Oh, no,not the Minibots!2  -  ...
                               ...                        
13664     ¿1  -   We have ran into another problem2  - ...
20329     ¿1  -   If anyone is to end her life,2  -   i...
21234    1  -   Advertise your product or brand herecon...
64710     ¿1  -   Use the free code JOINNOW at  www.pla...
10597    1  -   [intriguing hip-hop music playin 2  -  ...
Name: cleaned_text, Length: 25000, dtype: object


In [16]:
dataset = data[['name','cleaned_text']]

In [17]:
from sentence_transformers import SentenceTransformer


In [4]:
import chromadb

In [18]:
chunk_size = 1000  
number_of_chunks = len(dataset) // chunk_size + (1 if len(dataset) % chunk_size else 0)


for i in range(number_of_chunks):
    # Calculate the start and end indices of the current chunk
    start_index = i * chunk_size
    end_index = min(start_index + chunk_size, len(dataset))
    
    # Process each chunk
    chunk = dataset.iloc[start_index:end_index]

In [19]:
# Initialize the "all-MiniLM-L6-v2" model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize chromadb client and collection
client = chromadb.PersistentClient(path='SearchEngine_db')
collection = client.get_or_create_collection(name='MyCollection')

# Assuming 'chunk' is a DataFrame with 'cleaned_text' and other relevant information
for index, row in chunk.iterrows():
    # Convert the numpy array (embedding) to a flat list of floats
    embedding_list = model.encode([row['cleaned_text']], convert_to_tensor=True).numpy().flatten().tolist()
    
    # Prepare metadata, which includes the subtitle content
    metadata = {'name': row['name'], 'content': row['cleaned_text']}
    
    # Add the entry to the chromadb collection using the serial number as the unique ID
    collection.add(embeddings=[embedding_list], ids=[str(index)], metadatas=[metadata])




In [20]:
results = collection.query(
    query_texts=["Joey loves food"],
    n_results=10
)

In [22]:
results

{'ids': [['37426',
   '57596',
   '20751',
   '72890',
   '25540',
   '25760',
   '53288',
   '70930',
   '59366',
   '2772']],
 'distances': [[1.3330777883529663,
   1.3406963348388672,
   1.4644542932510376,
   1.4683555364608765,
   1.486175537109375,
   1.4954206943511963,
   1.5309836864471436,
   1.5659197568893433,
   1.5796825885772705,
   1.5836718082427979]],
 'metadatas': [[{'content': ' ¿1  -   Use the free code JOINNOW at  www.playships.eu2  -   YOU LOOK CUTE IN BUBBLES.3  -   AH, YO RE JUST ALL LIQUORED UP.4  -      knock on doo / )5  -   Joe  HEY, I S ME.6  -    M COMING IN.7  -    VE HAD A VERY LONG, HARD DAY.8  -   YO RE HOME EARLY.9  -   YEAH, MY DATE THREW UP.10  -    M GOING TO GO GET SOMECHICKEN. YOU WANT SOME?11  -   NO, THANKS. NO CHICKEN.12  -   BYE-BYE THEN.13  -   OKAY.14  -   YOU SURE? SOME EXTRA CRISPY?15  -   DIRTY RICE? BEANS?16  -   FOR THE LAST TIME, NO.17  -   GET OUT. GET OUT, JOEY.18  -   ALL RIGHT.19  -      gaspin / )20  -   ARE YOU OKAY?21  -    M 

In [21]:
results['metadatas']

[[{'content': ' ¿1  -   Use the free code JOINNOW at  www.playships.eu2  -   YOU LOOK CUTE IN BUBBLES.3  -   AH, YO RE JUST ALL LIQUORED UP.4  -      knock on doo / )5  -   Joe  HEY, I S ME.6  -    M COMING IN.7  -    VE HAD A VERY LONG, HARD DAY.8  -   YO RE HOME EARLY.9  -   YEAH, MY DATE THREW UP.10  -    M GOING TO GO GET SOMECHICKEN. YOU WANT SOME?11  -   NO, THANKS. NO CHICKEN.12  -   BYE-BYE THEN.13  -   OKAY.14  -   YOU SURE? SOME EXTRA CRISPY?15  -   DIRTY RICE? BEANS?16  -   FOR THE LAST TIME, NO.17  -   GET OUT. GET OUT, JOEY.18  -   ALL RIGHT.19  -      gaspin / )20  -   ARE YOU OKAY?21  -    M SO SORRY. HE WOULD T LEAVE.22  -   HE KEPT ASKING ME23  -   IF I WANTED CHICKEN.24  -   CHICKEN?25  -   I COULD EAT SOME CHICKEN.26  -   HEY, JOE!27  -   YEAH, COULD I GET ATHREE-PIECE, SOME COLE SLAW28  -   AND SOME BEANS AND A COKE...29  -      screamin / )30  -   DIET COKE.31  -   <font colo  ffff0 >Captioning sponsoredb /fon  font colo  ffff0 >WARNER BROS /fon 32  -    ª SO NO ON