# Reading the cleaned data

In [1]:
import pandas as pd
import numpy as np

In [2]:

data=pd.read_csv("Data.csv")

In [3]:
data.shape

(82498, 3)

In [4]:

data.head()

Unnamed: 0,num,name,clean_file_content
0,9180533,the.message.(1976),watch video online opensubtitles free browser ...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,ah there princess dawn terry blooney looney so...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022),iyumis cell iepisode extremely polite yumii iy...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022),watch video online opensubtitles free browser ...
4,9180600,broker.(2022),watch video online opensubtitles free browser ...


In [5]:
data['clean_file_content'][0]

'watch video online opensubtitles free browser extension osdblinkext name god gracious merciful muhammad messenger god heraclius emperor byzantium greeting follower righteous guidance bid hear divine call messenger god people accept islam salvation speaks new prophet arabia like john baptist came king herod desert cry salvation muqawqis patriarch alexandria kisra emperor persia muhammad call call god accept islam salvation embrace islam come desert smelling camel goat tell persia kneel muhammad messenger god gave authority god sent muhammad mercy mankind scholar historian islam university alazhar cairo high islamic congress shiat lebanon maker film honour islamic tradition hold impersonation prophet offends spirituality message therefore person mohammad shown year christ diedi iwhen europe sunk dark agesi iand everywhere old civilization fallingi imuhammad born mecca arabiai imecca rich trading city ruled merchantsi iwhose wealth multiplied unique privilegei ithey housed godsi ievery y

In [6]:
data['clean_file_content'].isna().sum()

1

In [7]:

data.dropna(subset=['clean_file_content'], inplace=True)

In [8]:
# Vectorization 

In [9]:
# Retrieving Documents based on User's Search Query

In [10]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:

# Initialize CountVectorizer and fit-transform the data

In [12]:
count_vectorizer = CountVectorizer()
tf_matrix = count_vectorizer.fit_transform(data['clean_file_content'])

In [13]:

count_vectorizer

In [14]:

# TF-IDF transformer and transform the TF matrix

In [15]:
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(tf_matrix)

In [16]:
tfidf_transformer

In [17]:
tfidf_matrix

<82497x1395141 sparse matrix of type '<class 'numpy.float64'>'
	with 77929374 stored elements in Compressed Sparse Row format>

In [18]:

# Calculating similarity using cosine similarity

In [19]:
query = input()
query_vector = count_vectorizer.transform([query])
query_tfidf = tfidf_transformer.transform(query_vector)

In [20]:
# Compute cosine similarity between the query and documents

In [21]:

similarity_scores = cosine_similarity(query_tfidf, tfidf_matrix)

In [22]:

# Retrieve top similar documents

In [23]:
top_indices = similarity_scores.argsort()[0][::-1]
top_n = 5
retrieved_documents = [data['clean_file_content'][idx] for idx in top_indices[:top_n]]

# retrieved documents
print("Top", top_n, "documents similar to query:", query)
for i, doc in enumerate(retrieved_documents, 1):
    print("Document", i, ":", doc)

Top 5 documents similar to query: 
Document 1 : advertise product brand contact wwwopensubtitlesorg today elizabeth deeply love planned married shortly one tragic day lake brother william come victor im coming william william william he drowning resurrection life said lord believeth though dead yet shall live meet today commend soul young william brother victor frankenstein devoted guardian scripture offer eternal message lord gave lord hath taken away blessed name lord peace join together prayer father victor death peace god u listening pious face god fool sword gun give death cant give life one day god blesses u man wife pair animal life life thats miracle cant raise life death brother corpse thats satan tempted lord satan could teach make william alive id gladly become pupil oh forgive oh please sir youre still hard see elizabeth tell want go back hospital studying objection sir oh dear boy none wondering youre fully fledged doctor thought might like start practice id happy help wel

In [24]:
# Summarizing

In [25]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
def generate_summarized_documents(query, retrieved_documents):
    summarized_documents = {}
    for i, doc in enumerate(retrieved_documents, 1):
        summary = "Summary: " + doc[:] + "" if len(doc) > 150 else "Summary: " + doc
        summarized_documents["Document " + str(i)] = summary
    return summarized_documents

In [27]:
query = input("Enter your query: ")
query_vector = count_vectorizer.transform([query])
query_tfidf = tfidf_transformer.transform(query_vector)

# cosine similarity between the query and documents
similarity_scores = cosine_similarity(query_tfidf, tfidf_matrix)

In [28]:

# top similar documents
top_indices = similarity_scores.argsort()[0][::-1]
top_n = 5
retrieved_documents = [data['clean_file_content'][idx] for idx in top_indices[:top_n]]

In [29]:
# summarized documents
summarized_docs = generate_summarized_documents(query, retrieved_documents)

# Print the summarized documents
for doc, summary in summarized_docs.items():
    print(doc + ":")
    print(summary)
    print()

Document 1:
Summary: advertise product brand contact wwwopensubtitlesorg today elizabeth deeply love planned married shortly one tragic day lake brother william come victor im coming william william william he drowning resurrection life said lord believeth though dead yet shall live meet today commend soul young william brother victor frankenstein devoted guardian scripture offer eternal message lord gave lord hath taken away blessed name lord peace join together prayer father victor death peace god u listening pious face god fool sword gun give death cant give life one day god blesses u man wife pair animal life life thats miracle cant raise life death brother corpse thats satan tempted lord satan could teach make william alive id gladly become pupil oh forgive oh please sir youre still hard see elizabeth tell want go back hospital studying objection sir oh dear boy none wondering youre fully fledged doctor thought might like start practice id happy help well thats kind sir there some

In [30]:
import joblib

# CountVectorizer
joblib.dump(count_vectorizer, 'count_vectorizer.joblib')

['count_vectorizer.joblib']

In [31]:
# TfidfTransformer
joblib.dump(tfidf_transformer, 'tfidf_transformer.joblib')

['tfidf_transformer.joblib']

In [32]:

joblib.dump(tfidf_matrix, 'tfidf_matrix.joblib')

['tfidf_matrix.joblib']

In [33]:
# cosine similarity model
joblib.dump(similarity_scores, 'cosine_similarity_scores.joblib')

['cosine_similarity_scores.joblib']

# END