Importing and downloading packages

In [3]:
import PyPDF2
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Fetch the NLTK data packages this notebook relies on. The captured log below
# shows each call reporting "already up-to-date" when the package is present.
# The per-package notes are the usual NLTK pairings — confirm against the
# installed NLTK version.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab') — verify if a LookupError appears on a fresh install.
nltk.download('punkt')  # presumably the sentence/word tokenizer models
nltk.download('stopwords')  # presumably the list read by stopwords.words('english')
nltk.download('wordnet')  # presumably the dictionary behind WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # presumably the model behind nltk.pos_tag
nltk.download('maxent_ne_chunker')  # presumably the model behind nltk.ne_chunk
nltk.download('words')  # presumably a word list the NE chunker depends on

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

Extracting text from a PDF

In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the PDF file to read.

    Returns
    -------
    str
        Page texts in document order, joined with a newline so that the last
        word of one page is not fused with the first word of the next. A page
        that yields no text contributes an empty string.

    Raises
    ------
    OSError
        If the file cannot be opened for reading.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps the join safe if a page yields None/empty text (for
        # example a page without a text layer). Extraction has to finish
        # before the file handle is closed, hence the list is built here.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

Lowercasing, removing punctuation, tokenizing, removing stopwords, and lemmatizing tokens

In [6]:
# Anything that is neither a word character nor whitespace is treated as punctuation.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Filled on first use so the stop-word list is loaded once instead of once per
# call (preprocess_text is invoked for every sentence of the document).
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise raw text into a list of lemmatised content tokens.

    Steps, in order: lowercase, replace punctuation with a space, tokenise,
    drop English stop words, lemmatise what is left.

    Parameters
    ----------
    text : str
        Raw text, e.g. one sentence of the document or the user query.

    Returns
    -------
    list of str
        Lemmatised tokens in their original order; empty if nothing survives.
    """
    # Punctuation becomes a space rather than being deleted: deleting it turns
    # "don't" into "dont" (which slips past the stop-word filter) and glues
    # "well-known" into "wellknown". With a space, the pieces are separate tokens.
    text = _PUNCTUATION_RE.sub(' ', text.lower())

    stop_words, lemmatizer = _preprocess_resources()
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

In [7]:
def ner_and_pos_tagging(tokens):
    """Annotate tokens with part-of-speech tags and named-entity chunks.

    Parameters
    ----------
    tokens : list of str
        Tokens to annotate, in their original order.

    Returns
    -------
    tuple
        ``(named_entities, pos_tags)``: the result of ``nltk.ne_chunk`` called
        with ``binary=True`` on the tagged tokens, followed by the
        ``nltk.pos_tag`` output it was built from.
    """
    tagged = nltk.pos_tag(tokens)
    return nltk.ne_chunk(tagged, binary=True), tagged

In [8]:
def generate_embeddings(text_chunks):
    """Fit a TF-IDF vectorizer on the chunks and vectorise them in one pass.

    Parameters
    ----------
    text_chunks : iterable of str
        Documents to fit on; in this notebook, one preprocessed sentence each.

    Returns
    -------
    tuple
        ``(tfidf_matrix, tfidf_vectorizer)``: the output of ``fit_transform``
        and the fitted vectorizer, which is needed later to project a query
        into the same feature space.
    """
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(text_chunks), vectorizer

In [9]:
def semantic_search(query_vector, tfidf_matrix, text_chunks, top_n=3):
    """Return the chunks most similar to the query by cosine similarity.

    Parameters
    ----------
    query_vector : array-like or sparse matrix, shape (1, n_features)
        The query projected with the same vectorizer as ``tfidf_matrix``.
    tfidf_matrix : array-like or sparse matrix, shape (n_chunks, n_features)
        One row per entry of ``text_chunks``.
    text_chunks : sequence of str
        Texts to return; indexed by the row positions of ``tfidf_matrix``.
    top_n : int, default 3
        Maximum number of chunks to return. Values <= 0 yield an empty list.

    Returns
    -------
    list of str
        At most ``top_n`` chunks, best match first. Chunks whose similarity is
        zero are left out, so fewer than ``top_n`` (possibly none) may come
        back when the query shares little or no vocabulary with the corpus.
    """
    if top_n <= 0:
        # A negative value would otherwise slice from the wrong end.
        return []

    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Negating and sorting stably gives descending scores with ties kept in
    # document order, which makes the ranking deterministic.
    ranked_indices = (-similarities).argsort(kind='stable')[:top_n]

    # A zero score means no shared terms; returning such chunks as "matches"
    # would just surface arbitrary sentences.
    return [text_chunks[i] for i in ranked_indices if similarities[i] > 0]

In [12]:

# End-to-end run: load the PDF, index its sentences with TF-IDF, then retrieve
# the sentences closest to the user's question.

# Relative path ("./..."), so it is resolved against the kernel's working directory.
pdf_path = "./Desktop/the_hitchhiker_s_guide_to_the_galaxy.pdf"
user_query = "Why are humans unhappy?"


pdf_text = extract_text_from_pdf(pdf_path)

# One retrieval unit per sentence; these raw sentences are what gets returned.
text_chunks = sent_tokenize(pdf_text)

# The vectorizer takes strings, so the token lists are re-joined with spaces.
# Positions line up one-to-one with text_chunks.
preprocessed_chunks = [' '.join(preprocess_text(chunk)) for chunk in text_chunks]


tfidf_matrix, tfidf_vectorizer = generate_embeddings(preprocessed_chunks)


# The query goes through the same preprocessing and the already-fitted
# vectorizer (transform, not fit_transform) so it lands in the same feature space.
preprocessed_query = ' '.join(preprocess_text(user_query))
query_vector = tfidf_vectorizer.transform([preprocessed_query])

# Passing text_chunks (not preprocessed_chunks) returns the original sentences.
top_similar_chunks = semantic_search(query_vector, tfidf_matrix, text_chunks)


In [13]:
# Print each retrieved sentence, then POS-tag / NE-chunk its raw tokens.
for chunk in top_similar_chunks:
    print(chunk)
    
    # The raw sentence is tokenised here (no lowercasing or stop-word removal).
    # NOTE(review): named_entities and pos_tags are assigned but not displayed
    # or used within this cell; after the loop they hold the last chunk's
    # values only — confirm whether a later cell reads them.
    tokens = word_tokenize(chunk)
    named_entities, pos_tags = ner_and_pos_tagging(tokens)
   


"What is your name, human?"
This planet has - or rather had - a problem, which was this: most 
of the people on it were unhappy for pretty much of the time.
I've just had an unhappy 
love affair, so I don't see why anybody else should have a good 
time.
