In [1]:
import PyPDF2
import fitz  # PyMuPDF
import re

In [4]:
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def extract_text_from_pdf(pdf_path, flags=1 + 2 + 8):
    """Extract the text blocks of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
        flags: Integer bit mask forwarded to ``page.get_text("blocks", ...)``.
            Defaults to ``1 + 2 + 8``, the value this notebook has always used.

    Returns:
        dict mapping the zero-based page number to a list of block texts,
        with every newline inside a block replaced by a single space.

    NOTE(review): per PyMuPDF's TEXT_* flag table, bit 8 looks like
    TEXT_INHIBIT_SPACES, which may explain fused words such as
    'IntheShadowoftheWind' in the extracted text. Confirm against the PyMuPDF
    docs and try ``flags=1 + 2`` if that is the case.
    """
    text = {}
    pdf_document = fitz.open(pdf_path)
    try:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            blocks = page.get_text("blocks", flags=flags)
            # Element 4 of each block tuple is its text payload.
            text[page_num] = [block[4].replace('\n', ' ') for block in blocks]
    finally:
        # Release the file handle even if a page fails to load or parse.
        pdf_document.close()
    return text

In [3]:
def merge_strings_until_limit(strings, min_length, max_length):
    """Greedily concatenate consecutive strings into chunks of bounded length.

    Strings are appended to the current chunk (with no separator) while the
    chunk stays within ``max_length`` characters; otherwise the current chunk
    is emitted and a new one is started with the offending string.

    Args:
        strings: Iterable of strings, in the order they should be merged.
        min_length: Accepted for interface compatibility but not enforced.
            NOTE(review): decide whether short chunks should be merged
            forward, or drop this parameter.
        max_length: Maximum length of a merged chunk. A single input string
            longer than this is emitted unchanged as its own chunk.

    Returns:
        list of non-empty merged chunks, in input order.
    """
    merged_strings = []
    current_string = ""

    for s in strings:
        if len(current_string) + len(s) <= max_length:
            current_string += s
            continue
        # Only flush a non-empty buffer: an oversized string arriving on an
        # empty buffer used to push a spurious "" chunk into the result.
        if current_string:
            merged_strings.append(current_string)
        current_string = s

    if current_string:
        merged_strings.append(current_string)

    return merged_strings

In [5]:
# Lazily built lookup set; populated on the first is_english_word() call.
_ENGLISH_VOCAB = None


def is_english_word(word):
    """Return True if ``word`` appears verbatim in the NLTK ``words`` corpus.

    The comparison is exact (case-sensitive), as before. The word list is read
    once and cached as a frozenset, so repeated calls no longer re-read the
    corpus and scan it linearly for every token.
    """
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        _ENGLISH_VOCAB = frozenset(words.words())
    return word in _ENGLISH_VOCAB

In [6]:
def preprocess(text):
    """Normalise documents into space-joined strings of filtered lemmas.

    Each document is tokenised with ``gensim.utils.simple_preprocess``
    (``min_len=3``, ``deacc=True``), every token is lemmatised with the
    module-level ``lemmatizer``, and lemmas that are in ``stop_words`` or
    rejected by ``is_english_word`` are dropped.

    Args:
        text: Iterable of document strings. A single ``str`` is treated as one
            document rather than being iterated character by character.

    Returns:
        list of str, one processed string per input document (possibly empty).

    Relies on the module-level names ``lemmatizer`` and ``stop_words``, which
    must be defined before this function is called.
    """
    if isinstance(text, str):
        # Iterating a bare string would yield single characters, all of which
        # are discarded by min_len=3, silently returning only empty strings.
        text = [text]

    processed_text = []
    for document in text:
        tokens = gensim.utils.simple_preprocess(document, min_len=3, deacc=True)
        lemmas = (lemmatizer.lemmatize(token) for token in tokens)
        kept = [
            lemma
            for lemma in lemmas
            if lemma not in stop_words and is_english_word(lemma)
        ]
        processed_text.append(" ".join(kept))
    return processed_text

In [7]:
# Module-level helpers read by preprocess(); this cell must run before
# preprocess() is first called.
# lemmatizer: used by preprocess() to lemmatize each token.
lemmatizer = WordNetLemmatizer()
# stop_words: NLTK's "english" stop-word list, held as a set for membership tests.
stop_words = set(stopwords.words("english"))

In [159]:
# NOTE(review): absolute, machine-specific path. Move it to a configurable
# data directory before sharing or re-running this notebook elsewhere.
pdf_file_path = r'C:\Users\SherryAi\Desktop\books\contemporary\c\The shadow of the wind c.pdf'

# Chunking limits (in characters) handed to merge_strings_until_limit.
MIN_CHUNK_CHARS = 20
MAX_CHUNK_CHARS = 500

output_text = extract_text_from_pdf(pdf_file_path)

# Merge paragraphs page by page, so a chunk never spans a page boundary.
merged_strings = [
    chunk
    for page_paragraphs in output_text.values()
    for chunk in merge_strings_until_limit(page_paragraphs, MIN_CHUNK_CHARS, MAX_CHUNK_CHARS)
]

# Spot-check the first chunk.
merged_strings[0]

'AHRReviewRoundtable IntheShadowoftheWind SASHATURNER '

In [160]:
# Run every merged chunk through preprocess(); the result is one
# space-joined string of kept lemmas per chunk.
pretext = preprocess(merged_strings)
# Spot-check the second processed chunk.
pretext[1]

'rightly year old sang reform sailor runaway idea colonial'

In [161]:
# Fit a TF-IDF model on the processed chunks with a document-frequency floor of 2.
# NOTE(review): the similarity cell further down rebinds both `vectorizer` and
# `tfidf_matrix` using min_df=1, so the objects built here are not the ones the
# query is compared against. Confirm which setting is intended.
vectorizer = TfidfVectorizer(min_df=2)
tfidf_matrix = vectorizer.fit_transform(pretext)
# Display the matrix summary (shape and number of stored elements).
tfidf_matrix

<28x25 sparse matrix of type '<class 'numpy.float64'>'
	with 83 stored elements in Compressed Sparse Row format>

In [162]:
# Only the last expression of a cell is displayed, so the bare call that used
# to sit on the first line was evaluated and thrown away. Fetch the vocabulary
# once and report its size.
feature_names = vectorizer.get_feature_names_out()
len(feature_names)

25

In [163]:
# Free-text description of the concept to search for in the document.
query = 'KNIGHT symbolizes valor, chivalry, and duty. It is a figure embodying courage, honor, and a commitment to protect and serve, often guided by a moral and ethical code. Knights represent the blend of strength, loyalty, and a sense of justice in service to a higher cause or realm.'
# Apply the same preprocessing as the document chunks so the vocabularies are comparable.
prequery = preprocess([query])  # preprocess() iterates its argument, so wrap the single query in a list
prequery


['knight valor chivalry duty figure courage honor commitment protect serve often moral ethical code knight represent blend strength loyalty sense justice service higher cause realm']

In [164]:
# Refit TF-IDF on the processed chunks (`pretext`) with min_df=1 so that terms
# occurring in only one chunk stay in the vocabulary. This rebinds the
# `vectorizer` / `tfidf_matrix` created by the earlier min_df=2 cell.
vectorizer = TfidfVectorizer(min_df=1)

# Learn the vocabulary and vectorise the chunks in one pass.
tfidf_matrix = vectorizer.fit_transform(pretext)

# Project the query into the same vocabulary; query terms that never occur in
# the chunks are dropped here.
comprequery = vectorizer.transform(prequery)

# One row per query, one column per chunk. A row of all zeros means the query
# shares no vocabulary term with any chunk.
similarities = cosine_similarity(comprequery, tfidf_matrix)
similarities

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [158]:
# `similarities` holds one row per query; only one query was submitted, so
# select that row explicitly instead of relying on a flattened argmax.
query_similarities = similarities[0]
nearest_neighbor_index = query_similarities.argmax()
nearest_neighbor = tfidf_matrix[nearest_neighbor_index]
similarity_score = query_similarities[nearest_neighbor_index]

# argmax over an all-zero row returns index 0, which is not a real match.
if similarity_score == 0:
    print("No chunk shares any vocabulary with the query; the index below is arbitrary.")

nearest_neighbor_index, similarity_score

(25, 0.08354589894987596)

In [155]:
# Show the original (unprocessed) text of the best-matching chunk. Use the
# computed index; a hard-coded number goes stale when the notebook is re-run.
merged_strings[nearest_neighbor_index]

'This paradigm lays bare the teleology of an ill reputation, which causes dishonor  and death. Conversely, a good standing leads to honorability and life. The notion of  “fama” (repute) as the springhead of life is unequivocally affirmed in Jorge Man- rique’s celebrated “Coplas a la muerte de su padre” (1477). This elemental ideal is  no mere literary conceit. It was an essential part of human relations and a core tenet  of social institutions. Rojas and Lorca not only understood these imperatives. They recognized the  seed for tragedy hidden beneath. They knew that their societies valued people not in  proportion with personal virtue or moral integrity, but with appearances and public  opinion. Guided by this sociological awareness, they sought to make their characters  reflect upon and live through these idiosyncrasies that rewarded repute with honor  and life, and punished dishonor with exclusion and death. Melibea consistently equates her loss of honra to the loss of her life. In a