In [1]:
import fitz  # PyMuPDF and nltk gensim used here
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vivam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vivam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


In [4]:
#to improve accuracy of test extraction 
def preprocess_text(text):
    sentences = nltk.sent_tokenize(text)  # Split text into sentences
    stop_words = set(nltk.corpus.stopwords.words('english'))  # Get stop words
    processed_sentences = []
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)  # Tokenize sentence into words
        words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]
        processed_sentences.append(' '.join(words))  # Join words back into sentence
    return processed_sentences

In [5]:
def train_word2vec(processed_sentences):
    model = Word2Vec(sentences=processed_sentences, vector_size=100, window=5, min_count=1, workers=4)
    return model

In [6]:
def answer_question(question, text, processed_sentences):
    vectorizer = TfidfVectorizer().fit(processed_sentences)
    question_vec = vectorizer.transform([question])
    sentence_vectors = vectorizer.transform(processed_sentences)
    similarities = cosine_similarity(question_vec, sentence_vectors).flatten()
    best_match_index = similarities.argmax()
    return processed_sentences[best_match_index]

In [7]:
#for loading text and processing it
pdf_path = r"C:\Users\vivam\Downloads\mproj\mahcinelearning.pdf"
text = extract_text_from_pdf(pdf_path)
processed_sentences = preprocess_text(text)



In [8]:
# code for processing text and finding best suitable answer from the pdf itslef
def interactive_qa():
    print("PDF text loaded. You can now ask questions about its content.")
    while True:
        question = input("Enter your question (or type 'exit' to quit): ")
        if question.lower() == 'exit':
            break
        answer = answer_question(question, text, processed_sentences)
        print("Answer:", answer)

In [None]:
interactive_qa()

PDF text loaded. You can now ask questions about its content.
Enter your question (or type 'exit' to quit): supervised learning
Answer: supervised learning definition supervised learning algorithm learns labeled data desired output known
Enter your question (or type 'exit' to quit): reinforcement learning
Answer: reinforcement learning definition reinforcement learning teaches agent make sequential decisions trial error aiming maximize rewards
Enter your question (or type 'exit' to quit): unsupervised learning
Answer: unsupervised learning definition unsupervised learning involves training algorithms unlabeled data discover patterns structures
Enter your question (or type 'exit' to quit): deep learning
Answer: deep learning definition deep learning subset machine learning uses neural networks many layers analyze data
Enter your question (or type 'exit' to quit): conclusions
Answer: machine learning branch artificial intelligence ai enables computers learn data improve performance time 