# **Project - Enhancing Search Engine Relevance for Video Subtitles**

In [1]:
import pandas as pd
import numpy as np

# **Part 1: Ingesting Documents**

In [1]:
import sqlite3

# Reading the Database file
conn = sqlite3.connect(r"C:\Users\Admin\Downloads\eng_subtitles_database.db")
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())

[('zipfiles',)]


In [3]:
# Reading the columns of Table
cursor.execute("PRAGMA table_info('zipfiles')")
cols = cursor.fetchall()
for col in cols:
    print(col[1])

num
name
content


In [4]:
# Loading the Database Table inside a Pandas DataFrame
df = pd.read_sql_query("""SELECT * FROM zipfiles""", conn)
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82498 entries, 0 to 82497
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      82498 non-null  int64 
 1   name     82498 non-null  object
 2   content  82498 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


In [6]:
# Applying the above Function on the Entire Data
import zipfile
import io

count = 0

def decode_method(binary_data):
    global count
    # Decompress the binary data using the zipfile module
    # print(count, end=" ")
    count += 1
    with io.BytesIO(binary_data) as f:
        with zipfile.ZipFile(f, 'r') as zip_file:
            # Assuming there's only one file in the ZIP archive
            subtitle_content = zip_file.read(zip_file.namelist()[0])

    # Now 'subtitle_content' should contain the extracted subtitle content
    return subtitle_content.decode('latin-1')  # Assuming the content is UTF-8 encoded text

In [7]:
df['file_content'] = df['content'].apply(decode_method)

df.head()

Unnamed: 0,num,name,content,file_content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...,"1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther..."
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...,"1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'..."
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."


In [10]:
import random

random_df = df.sample(frac=0.1, random_state=42)

print(random_df.shape)

random_df.head()

(8250, 4)


Unnamed: 0,num,name,content,file_content
17262,9251120,maybe.this.time.(2014).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x89\x9a\x...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."
7294,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,b'PK\x03\x04\x14\x00\x00\x00\x08\x007\x8f\x99V...,"1\r\n00:00:09,275 --> 00:00:11,876\r\n¶ Oh, I ..."
47707,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x8f\x19\x...,"1\r\n00:00:07,140 --> 00:00:14,220\r\n<i>Timin..."
29914,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00[\xaa\x99V...,"1\r\n00:00:06,133 --> 00:00:08,900\r\n[etherea..."
54266,9408707,battlebots.(2015).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xf4<\x9aV...,"ï»¿1\r\n00:00:01,480 --> 00:00:03,570\r\n[Chri..."


In [12]:
random_df.to_csv(r"C:\Users\Admin\OneDrive\Desktop\intership2024\SearchEngine\random_sample.csv", index=False, escapechar='\\')

# Preprocessing the data

In [13]:
df = pd.read_csv(r"C:\Users\Admin\OneDrive\Desktop\intership2024\SearchEngine\random_sample.csv")

print(df.shape)

df.head()

(8250, 4)


Unnamed: 0,num,name,content,file_content
0,9251120,maybe.this.time.(2014).eng.1cd,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch..."
1,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x007\...,"1\r\n00:00:09,275 --> 00:00:11,876\r\n¶ Oh, I ..."
2,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"1\r\n00:00:07,140 --> 00:00:14,220\r\n<i>Timin..."
3,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00[\...,"1\r\n00:00:06,133 --> 00:00:08,900\r\n[etherea..."
4,9408707,battlebots.(2015).eng.1cd,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"ï»¿1\r\n00:00:01,480 --> 00:00:03,570\r\n[Chri..."


In [14]:
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
def clean_subtitle(subtitle):
    # Remove timestamps
    clean_content = re.sub(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n?', '', subtitle)

    # Remove other non-textual patterns
    clean_content = re.sub(r'<[^>]+>', '', clean_content)

    clean_content = re.sub(r"[^\w\s]", '', clean_content)

    clean_content = re.sub(r"[^\x00-\x7F]+", '', clean_content)

    clean_content = re.sub(r"\b\d+\s", '', clean_content)

    # Convert to lowercase
    clean_content = clean_content.lower()

    # Tokenize the subtitle content
    tokens = word_tokenize(clean_content)

    # Remove stopwords and lemmatize tokens
    stop_words = set(stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.lower() not in stop_words]

    # Join the filtered tokens back into a string
    clean_content = ' '.join(clean_tokens)

    return clean_content.strip()

In [16]:
df['cleaned_content'] = df['file_content'].apply(clean_subtitle)

In [17]:
df.head()

Unnamed: 0,num,name,content,file_content,cleaned_content
0,9251120,maybe.this.time.(2014).eng.1cd,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...",watch video online opensubtitles free browser ...
1,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x007\...,"1\r\n00:00:09,275 --> 00:00:11,876\r\n¶ Oh, I ...",oh know getting late dont wan na go home im hu...
2,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"1\r\n00:00:07,140 --> 00:00:14,220\r\n<i>Timin...",timing subtitle uncontrollable lovebird team v...
3,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00[\...,"1\r\n00:00:06,133 --> 00:00:08,900\r\n[etherea...",ethereal music apiopensubtitlesorg deprecated ...
4,9408707,battlebots.(2015).eng.1cd,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"ï»¿1\r\n00:00:01,480 --> 00:00:03,570\r\n[Chri...",chris oh minibots yelling oh leave little bot ...


In [18]:
df['cleaned_content'][2]

'timing subtitle uncontrollable lovebird team viki episode watch video online opensubtitles free browser extension osdblinkext came way take island ill go take along feel like want hide away island let go anyone else dont ever appear front eye see kidnap sleeping right worrying person much sleep shooting film wouldnt know youre actor kind must crazy kind crazy thing done spent much expensive taxi ride get seoul couldnt even sleep cold shivering waiting boat got hand bitten seagull ah money could fed jik much meat money one think dream chance dream eul came way see two wow really think youre dreaming dont im telling im really eul three get pinched order wake go said go get sight must caused big accident exactly made thing worse ever drink im better dog dog seriously rest one soul one last night remembering gave let go didnt eat breakfast right go sorry tell become dog drink path crossed whats even amazing dont even remember second ive done drunk four dont know drunk old well look order 

In [19]:
df.to_csv(r"C:\Users\Admin\OneDrive\Desktop\intership2024\SearchEngine/cleaned_data.csv", index=False)

In [20]:
df = pd.read_csv(r"C:\Users\Admin\OneDrive\Desktop\intership2024\SearchEngine/cleaned_data.csv")

print(df.shape)

df.head()

(8250, 5)


Unnamed: 0,num,name,content,file_content,cleaned_content
0,9251120,maybe.this.time.(2014).eng.1cd,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...",watch video online opensubtitles free browser ...
1,9211589,down.the.shore.s01.e10.and.justice.for.all.(19...,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x007\...,"1\r\n00:00:09,275 --> 00:00:11,876\r\n¶ Oh, I ...",oh know getting late dont wan na go home im hu...
2,9380845,uncontrollably.fond.s01.e07.heartache.(2016).e...,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"1\r\n00:00:07,140 --> 00:00:14,220\r\n<i>Timin...",timing subtitle uncontrollable lovebird team v...
3,9301436,screen.two.s13.e04.the.precious.blood.(1996).e...,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00[\...,"1\r\n00:00:06,133 --> 00:00:08,900\r\n[etherea...",ethereal music apiopensubtitlesorg deprecated ...
4,9408707,battlebots.(2015).eng.1cd,b'PK\\x03\\x04\\x14\\x00\\x00\\x00\\x08\\x00\\...,"ï»¿1\r\n00:00:01,480 --> 00:00:03,570\r\n[Chri...",chris oh minibots yelling oh leave little bot ...


# Document Chunker

In [22]:
import nltk
from nltk.tokenize import word_tokenize

# Function to count tokens
def count_tokens(text):
    tokens = word_tokenize(text)
    return len(tokens)

# Iterate through data and count tokens for each file
for index, entry in df.iterrows():
    total_tokens = count_tokens(entry["cleaned_content"])
    print(f"File: {entry['name']}, Number of Tokens: {total_tokens}")

File: maybe.this.time.(2014).eng.1cd, Number of Tokens: 5335
File: down.the.shore.s01.e10.and.justice.for.all.(1992).eng.1cd, Number of Tokens: 1356
File: uncontrollably.fond.s01.e07.heartache.(2016).eng.1cd, Number of Tokens: 2425
File: screen.two.s13.e04.the.precious.blood.(1996).eng.1cd, Number of Tokens: 3588
File: battlebots.(2015).eng.1cd, Number of Tokens: 7743
File: csi.crime.scene.investigation.s08.e16.two.and.a.half.deaths.(2008).eng.1cd, Number of Tokens: 2998
File: royal.ashes.().eng.1cd, Number of Tokens: 5537
File: return.to.seoul.(2022).eng.1cd, Number of Tokens: 2503
File: idris.elba.king.of.speed.s01.e02.episode.1.2.(2013).eng.1cd, Number of Tokens: 3433
File: tooth.pari.when.love.bites.s01.e08.episode.1.8.(2023).eng.1cd, Number of Tokens: 1863
File: studio.one.s08.e30.the.arena.(1956).eng.1cd, Number of Tokens: 3623
File: love.life.s02.e10.epilogue.(2021).eng.1cd, Number of Tokens: 2011
File: scrubs.s03.e14.my.screwup.(2004).eng.1cd, Number of Tokens: 1416
File: devot

In [23]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
# Define the document chunker function
def document_chunker(document, chunk_size=1500, overlap_size=200):
    tokens = word_tokenize(document)  # Tokenize the document
    num_tokens = len(tokens)
    chunks = []

    # Iterate over the tokens and create chunks with specified overlap
    for start in range(0, num_tokens, chunk_size - overlap_size):
        end = min(start + chunk_size, num_tokens)
        chunk = tokens[start:end]
        chunks.append({'name': row['name'], 'chunk_index': len(chunks) + 1, 'chunk_text': ' '.join(chunk)})

    return chunks

In [25]:
# Apply the document chunker to each cleaned document in the DataFrame
all_chunks = []
for index, row in df.iterrows():
    document = row['cleaned_content']
    document_chunks = document_chunker(document)
    all_chunks.extend(document_chunks)

# Create a new DataFrame from the list of chunks
chunks_df = pd.DataFrame(all_chunks)

In [26]:
print(chunks_df.shape)

chunks_df.head()

(20883, 3)


Unnamed: 0,name,chunk_index,chunk_text
0,maybe.this.time.(2014).eng.1cd,1,watch video online opensubtitles free browser ...
1,maybe.this.time.(2014).eng.1cd,2,also answer question invest something trendy r...
2,maybe.this.time.(2014).eng.1cd,3,kitchen thats learned cook hmm thats miss moni...
3,maybe.this.time.(2014).eng.1cd,4,take ah wow like newlywed maybe time itll lovi...
4,maybe.this.time.(2014).eng.1cd,5,time maybe time love wont end two old friend m...


In [27]:
chunks_df.to_csv(r"C:\Users\Admin\OneDrive\Desktop\intership2024\SearchEngine\chunks_data1.csv", index=False)

# Vectorization Techniques

In [28]:
import pandas as pd
df1 = pd.read_csv(r"C:\Users\Admin\OneDrive\Desktop\intership2024\SearchEngine\chunks_data1.csv")

print(df1.shape)

df1.head()

(20883, 3)


Unnamed: 0,name,chunk_index,chunk_text
0,maybe.this.time.(2014).eng.1cd,1,watch video online opensubtitles free browser ...
1,maybe.this.time.(2014).eng.1cd,2,also answer question invest something trendy r...
2,maybe.this.time.(2014).eng.1cd,3,kitchen thats learned cook hmm thats miss moni...
3,maybe.this.time.(2014).eng.1cd,4,take ah wow like newlywed maybe time itll lovi...
4,maybe.this.time.(2014).eng.1cd,5,time maybe time love wont end two old friend m...


In [29]:
df1['chunk_text'][111]

'advertise product brand contact wwwopensubtitlesorg today congratulation rabbi oh well rabbi wife asks tell im lasri send father warm gratitude chandelier beautiful im asked right educational method yeshiva headmaster turned complained yeshiva student went ski vacation ski vacation five day complained ignore keep theyre good men study excel wanted day break expel yeshiva unacceptable every time student finish mishna chapter theyll go trip school would stay empty gmara book would left unread corner torah headmaster asked correct educational method teach boy way impose mountain basin answer ill tell right method right method daniel paran dori medium darset present created written directed eliran malka creator main editor daniel paran directed maor zaguri new black gedaliah stop bugging sorry rabbi come need come well yesterday awesome im gedaliah hi im talus usually break ice begin date story first date rabbi ovadia wife told information im going study torah wont clean wont cook wont an

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
# Calculate TF-IDF for the documents
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df1['chunk_text'])

In [32]:
query="thats key hard tell jeromelike delivery hold back growth literally nightmare scenario college debt country would eighth wealthiest country world kevin connell falling almost student debt wrote two book exposing corruption student debt system time one word would describe student debt system predatory conditioning student face entire life saying must go college order successful people willing pay whatever take go college prominent economist agree economist prominent economist shoot good overlapping chatter get prominent economist big problem would say oh immoral problem impose upon young people narrative tell go school better college key mobility young person year old making authentic choice well feel like ive done everything right growing know im told go school work hard whatever want whatever want kinda like okay um know im happened talked people different community across country see student debt affected adult would tell go college like college degree would mean getting goodpaying job buy house live life want know seems silly imagine served eight year united state air force kind like supposed contributed country yet student loan put debt thought would avoided joining military supposed thought achieving american dream american dream mean thing american dream zero dollar lot people mean many people negative net worth cant reach american dream zero dollar dont worry medium back almost college student using student loan pay spring break trip important learn lesson early personal responsibility pull bootstrap adult talking need realize take debt liable repay debt someone might say listen took loan pay back simple like dont irresponsible say think would said totally make sense eight year ago let forget kid told sign dotted line parental consent necessary 17yearold self unprepared detail medium say thats excuse student knew exactly getting knew getting someone say dont get people dont know theyre actually signing there whole series gimmick played industry attempt cloud actual picture look like lender become creative inflate loan give teaser rate start artificially low thats right college lender using classic first one free move used everything streaming service cocaine im told thats trick sleeve lot fine print overwhelming maybe make underpayment one month pyramid fee dont notice end defaulting wage garnished come say nope taking wage without sort due process thing social security youre teenager signing paper dont listen adult room guide remember guidance counselor reeked coffee pretended know name truly dont think guidance counselor parent family dont know anyone understood true detriment okay blake come bring banana stick banana right foot like yes position way oh like right make sure there nothing else background see foot banana hows real reason came see marcella daughter student debt control running active business creating erotic video help pay every day wake purpose day make sure child better life oldest daughter prepare first four year college started kindergarten young know going go master wasnt prepared know gon na want go law school told encouraged yes okay great need marcellas daughter needed take loan im trying work hard finish gon na wait minute marcella started loan salary going paying student loan right even four year residency im gon na basically debt ive talked people say ive paying student loan year ive making payment ive time yet owe beginning phenomenon compounding interest okay let stop second compounding interest pretty tricky understand here need know loan make pay small amount interest compounding interest charge interest interest making balance grow exponentially quickly get control person student debt may say know gosh really messed get like million people youre like okay maybe systemic maybe isnt couple people made bad decision maybe actually orchestrated scam im convinced system rigged create crisis today thats learned piece puzzle got little attention would change everything could point one thing student debt system one thing would change would bankruptcy mean intent impossible discharge student loan bankruptcy wait thought debt could discharged bankruptcy except student loan essentially took bankruptcy protection permanently away time federal student loan could true isnt bankruptcy bad thing think first time heard word bankrupt watching wheel fortune audience go aw lose money assumed thats something dont want well ive many client postbankruptcy go buy house theyve never able ive two client actually become multimillionaires hard would discharge student loan debt bankruptcy impossible guess bankruptcy reserved truly neediest case right fair deserve fresh start mostly student borrower deserve option doesnt mean everyones going run file bankruptcy even there something much bigger play absence threat bankruptcy open door widespread abuse lender without bankruptcy option college lender frivolously lend much money anybody want apply repayment guaranteed student pay back matter bankruptcy option lender know cant run cant hide essentially give license steal lender incentive restrict theyre lending much loan guess happens tuition cost college raised price essentially youre paying college lender know obscene tuition ridiculous gimmick college play richer college lender become threat bankruptcy force lender behave rationally modicum good faith without bankruptcy protection left selfperpetuating system greed know outset legislatively manufactured crisis reauthorization higher education act senate vote adoption conference report accompanying hr6 mr biden mr bingaman mr durbin mr feingold mr feinstein mr ford like uproar happened like big headline time headline fact dont think congressman even knew happened probably week month maybe even year afterwards bill sign moment enhance economic strength america massive change economy would affect people one noticed sneaky two line slipped conference committee dark night two line legal mumbo jumbo would alter economy multiple generation almost entirely technically bankruptcy code say represents undue hardship get discharge undue hardship chuckle shouldnt hard prove well court interpreted saying mean there hope repaying mean there supposed hope condition youd never want get student loan discharged mean kind condition well might serious cancer physical disability let understand way get rid student debt right cancer thats probably best way well unfortunately would help okay responsible craziness there name find attached nobody seems willing step take credit wait wait wait nobody put name dontwe dont actually know like whisper wind dont know something bear repeating according alan everything ballooning student debt spiraling college cost traced back one twoline provision law"

In [33]:

cleaned_query = clean_subtitle(query)

# Calculate TF-IDF for the query
query_tfidf = tfidf_vectorizer.transform([cleaned_query])

# Calculate cosine similarity between the query and documents
cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix)

# Get the indices of documents sorted by similarity score
sorted_indices = cosine_similarities.argsort()[0][::-1]

In [34]:

# Return top 5 most similar chunk_text
top_k = 5
top_documents = df1.loc[sorted_indices[:top_k], 'chunk_text'].tolist()
print("Top 5 most similar chunk_text:")
for i, text in enumerate(top_documents, 1):
    print(f"Document {i}: {text}")

Top 5 most similar chunk_text:
Document 2: right even four year residency im gon na basically debt ive talked people say ive paying student loan year ive making payment ive time yet owe beginning phenomenon compounding interest okay let stop second compounding interest pretty tricky understand here need know loan make pay small amount interest compounding interest charge interest interest making balance grow exponentially quickly get control person student debt may say know gosh really messed get like million people youre like okay maybe systemic maybe isnt couple people made bad decision maybe actually orchestrated scam im convinced system rigged create crisis today thats learned piece puzzle got little attention would change everything could point one thing student debt system one thing would change would bankruptcy mean intent impossible discharge student loan bankruptcy wait thought debt could discharged bankruptcy except student loan essentially took bankruptcy protection permanen

In [35]:
!pip install sentence-transformers

Defaulting to user installation because normal site-packages is not writeable


In [36]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [37]:
df1 = pd.read_csv(r"C:\Users\Admin\OneDrive\Desktop\intership2024\SearchEngine\chunks_data1.csv")

print(df1.shape)

df1.head()

(20883, 3)


Unnamed: 0,name,chunk_index,chunk_text
0,maybe.this.time.(2014).eng.1cd,1,watch video online opensubtitles free browser ...
1,maybe.this.time.(2014).eng.1cd,2,also answer question invest something trendy r...
2,maybe.this.time.(2014).eng.1cd,3,kitchen thats learned cook hmm thats miss moni...
3,maybe.this.time.(2014).eng.1cd,4,take ah wow like newlywed maybe time itll lovi...
4,maybe.this.time.(2014).eng.1cd,5,time maybe time love wont end two old friend m...


In [38]:
# Encode the chunk text to get semantic embeddings
chunk_embeddings = model.encode(df1['chunk_text'].tolist())

# Add the embeddings to the DataFrame
df1['embeddings'] = chunk_embeddings.tolist()

In [39]:
df1.head()

Unnamed: 0,name,chunk_index,chunk_text,embeddings
0,maybe.this.time.(2014).eng.1cd,1,watch video online opensubtitles free browser ...,"[-0.06415261328220367, -0.09905608743429184, 0..."
1,maybe.this.time.(2014).eng.1cd,2,also answer question invest something trendy r...,"[0.04669870808720589, -0.06714576482772827, 0...."
2,maybe.this.time.(2014).eng.1cd,3,kitchen thats learned cook hmm thats miss moni...,"[0.03490824997425079, -0.08582150191068649, 0...."
3,maybe.this.time.(2014).eng.1cd,4,take ah wow like newlywed maybe time itll lovi...,"[-0.12172219902276993, -0.1007562130689621, 0...."
4,maybe.this.time.(2014).eng.1cd,5,time maybe time love wont end two old friend m...,"[-0.1321333646774292, -0.0960313007235527, 0.0..."


In [40]:
df1.tail()

Unnamed: 0,name,chunk_index,chunk_text,embeddings
20878,never.love.a.stranger.(1958).eng.1cd,2,mistake mean thought id better telling tonight...,"[-0.03054605983197689, -0.0015236519975587726,..."
20879,never.love.a.stranger.(1958).eng.1cd,3,see frank came see want make deal deal frank k...,"[-0.09295785427093506, -0.04963499680161476, -..."
20880,never.love.a.stranger.(1958).eng.1cd,4,gon na get love frankie always cant stay youre...,"[-0.06650276482105255, -0.06917943805456161, -..."
20881,the.power.s01.e01.a.better.future.is.in.your.h...,1,advertise product brand contact wwwopensubtitl...,"[-0.07300365716218948, -0.12829557061195374, 0..."
20882,the.power.s01.e01.a.better.future.is.in.your.h...,2,feel like biggest static shock ever nervously ...,"[-0.10172726958990097, -0.11791297793388367, 0..."


In [41]:
print(df1.shape)

df1.head()

(20883, 4)


Unnamed: 0,name,chunk_index,chunk_text,embeddings
0,maybe.this.time.(2014).eng.1cd,1,watch video online opensubtitles free browser ...,"[-0.06415261328220367, -0.09905608743429184, 0..."
1,maybe.this.time.(2014).eng.1cd,2,also answer question invest something trendy r...,"[0.04669870808720589, -0.06714576482772827, 0...."
2,maybe.this.time.(2014).eng.1cd,3,kitchen thats learned cook hmm thats miss moni...,"[0.03490824997425079, -0.08582150191068649, 0...."
3,maybe.this.time.(2014).eng.1cd,4,take ah wow like newlywed maybe time itll lovi...,"[-0.12172219902276993, -0.1007562130689621, 0...."
4,maybe.this.time.(2014).eng.1cd,5,time maybe time love wont end two old friend m...,"[-0.1321333646774292, -0.0960313007235527, 0.0..."


In [42]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20883 entries, 0 to 20882
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         20883 non-null  object
 1   chunk_index  20883 non-null  int64 
 2   chunk_text   20883 non-null  object
 3   embeddings   20883 non-null  object
dtypes: int64(1), object(3)
memory usage: 652.7+ KB


# 5. Store embeddings in a ChromaDB database

In [43]:
!pip install chromadb

Defaulting to user installation because normal site-packages is not writeable


In [44]:
import chromadb

# Create a client instance
client = chromadb.Client()



In [45]:
collection = client.create_collection(
        name="searchengine",
        metadata={"hnsw:space": "cosine"} # l2 is the default
    )

In [46]:
for i, embedding in enumerate(chunk_embeddings):
    collection.add(ids=f"chunk_{i}",  # Replace with a more descriptive ID scheme if needed
                   embeddings=embedding.tolist(),  # Convert to a list for Chroma
                   documents=df1.loc[i, "chunk_text"],  # Use chunk_text as documents
                   metadatas={"name": df1.loc[i, "name"]}  # Use 'name' column data as metadata
    )


# **Part 2: Retrieving Documents**

In [47]:
from sentence_transformers import SentenceTransformer, util

# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [48]:
subtitle_text = "help kiss excuse im gon na kiss need facial facial ouch thats gon na hurt tell straight english dont want wear wont force try okay go ahead dont like doesnt unlike doesnt like unli unlike fine give like doesnt dont right stay still sir tep really hurt love miss monica remember know say love hurt wow looking good hmm think look good mmm modern cool whats last one posh posh posh mama mae yeah ill posh mmhmm let posh mama mae ow ow im sorry hurt done bit ow ooh ow hand hurt sorry sorry sir please hold still look yuck dinner fork good salad fork good mmm okay "
cleaned_subtitle = clean_subtitle(subtitle_text)

In [49]:

# Encode the cleaned subtitle text to get semantic embeddings
query_embedding = model.encode([cleaned_subtitle])

In [50]:
# Perform a query on the ChromaDB collection using the embeddings
results = collection.query(
    query_embeddings=query_embedding,
    n_results=5,
    include=['documents']
)


In [51]:
documents = results['documents']

# Iterate over the documents and print each document
for i, query_documents in enumerate(documents):
    for j, document in enumerate(query_documents):
        print(f"Document {i+1}, Item {j+1}: {document}")



Document 1, Item 1: teeth last time wed done twice would really embarrassed idve thought oh god one u obviously bad kisser harry oh god im shocked mum youre one million card call boy say special talent pulling milfs feel like annamay deep wanted kiss kai cant lie teddy bear chain saying sergeant snake boy told girl joining army stop seeing rude olivias one bit long like roundup kiss hold one like ouch shaq ever find lied well shes gon na see see weve got hair removal cream boy blistered entire body using hairremoval cream lad holiday right kiss frog find prince fake foot boy claim nan initial tattooed foot really girl slept lad holiday uh oh trouble somethings come along burst bubble ive got menu say porn cocktail chicken tikka massageher dessert ice makeherscream oh god thats vibe wow boy caught sex restaurant waiter oh feel like ive kissed boy honestly think kissed challenge girl cleaned well actually theyve cleaned girl back villa shaqs realising come tanya he got major feeling coin