In [2]:
import os
import sys
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

def load_book(file_path):
    """Read a book from disk and return its contents as a single string.

    Files whose name ends in ``.pdf`` (any letter case) are opened with
    ``PdfReader`` and the text of every page is collected. Any other file is
    read as text, trying UTF-8 first and falling back to ISO-8859-1.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text. PDF pages are joined with a newline so the last
        word of one page is not fused with the first word of the next page.
    """
    if str(file_path).lower().endswith('.pdf'):
        reader = PdfReader(file_path)
        # Defensive: a page that yields a falsy result (no text layer) is
        # treated as empty text instead of breaking the concatenation.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except UnicodeDecodeError:
        # Retry with a single-byte encoding when the file is not valid UTF-8.
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            return file.read()

def split_text(text, chunk_size=512, overlap=0):
    """Split text into chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks. Must be
            non-negative and smaller than ``chunk_size``. The default of 0
            produces disjoint chunks.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by
        single spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the end, so overlapping windows do not
        # emit trailing chunks that are fully contained in the previous one.
        if start + chunk_size >= len(words):
            break
    return chunks

def embed_text(text_chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Encode text chunks into float32 embedding vectors.

    Args:
        text_chunks: Sequence of strings to encode.
        model_name: Name passed to ``SentenceTransformer`` to load the model.

    Returns:
        A float32 NumPy array built from ``model.encode(text_chunks)``. For
        an empty sequence the result has shape ``(0, dim)``, where ``dim`` is
        the model's reported embedding dimension, so callers can still read
        ``shape[1]``.
    """
    model = SentenceTransformer(model_name)
    if not isinstance(text_chunks, str) and len(text_chunks) == 0:
        # Encoding nothing may give an array without a second axis; return an
        # explicitly shaped empty matrix instead.
        return np.empty((0, model.get_sentence_embedding_dimension()), dtype='float32')
    return np.asarray(model.encode(text_chunks), dtype='float32')

def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index and add the given vectors to it.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` holding the vectors.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # Normalise to C-contiguous float32 before handing the data to FAISS.
    vectors = np.ascontiguousarray(embeddings, dtype='float32')
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim), "
            f"got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def retrieve_context(query, text_chunks, index, model_name="sentence-transformers/all-MiniLM-L6-v2", top_k=3):
    """Return the text of the chunks nearest to the query, joined by spaces.

    Args:
        query: The question to search for.
        text_chunks: List of chunk strings; positions are expected to match
            the order in which vectors were added to ``index``.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``.
        model_name: Name passed to ``SentenceTransformer`` to embed the query.
        top_k: Number of neighbours requested from the index.

    Returns:
        The selected chunks joined with single spaces, in the order the index
        returned them. An empty string if no valid neighbour was returned.
    """
    model = SentenceTransformer(model_name)
    query_embedding = model.encode([query]).astype('float32')

    _distances, indices = index.search(query_embedding, top_k)

    # FAISS fills missing neighbours with -1 when the index holds fewer than
    # top_k vectors. Without the lower bound, -1 would pass the check and
    # silently select the last chunk.
    valid_indices = [int(i) for i in indices[0] if 0 <= i < len(text_chunks)]

    return " ".join(text_chunks[i] for i in valid_indices)

def load_qa_model():
    """Build the extractive question-answering pipeline.

    Loads the tokenizer and then the model for the
    ``deepset/roberta-base-squad2`` checkpoint and wraps both in a
    ``"question-answering"`` pipeline.

    Returns:
        The object returned by ``pipeline(...)``.
    """
    checkpoint = "deepset/roberta-base-squad2"
    # Dict entries are evaluated in order: tokenizer first, then model.
    components = {
        "tokenizer": AutoTokenizer.from_pretrained(checkpoint),
        "model": AutoModelForQuestionAnswering.from_pretrained(checkpoint),
    }
    return pipeline("question-answering", **components)

def answer_question(question, context, qa_pipeline):
    """Ask the QA pipeline one question and return only the answer text.

    Args:
        question: The question string.
        context: The passage the answer is extracted from.
        qa_pipeline: Callable accepting ``question=`` and ``context=`` keyword
            arguments and returning a mapping with an ``'answer'`` entry.

    Returns:
        The value stored under ``'answer'`` in the pipeline result.
    """
    return qa_pipeline(question=question, context=context)['answer']

def main(book_path, question):
    """Answer one question about a book and print the result.

    Steps: load the book, split it into word chunks, embed the chunks, build
    a FAISS index, retrieve the chunks closest to the question, load the QA
    model, and print the question together with the extracted answer.

    Args:
        book_path: Path of the book file to load.
        question: The question to answer.

    Raises:
        ValueError: If the book yields no text chunks (for example a PDF
            without an extractable text layer), since nothing can be indexed.
    """
    print("Loading book...")
    text = load_book(book_path)
    text_chunks = split_text(text)
    if not text_chunks:
        # Fail early with a clear message instead of an opaque error later
        # in the embedding or indexing steps.
        raise ValueError(
            f"No text could be extracted from {book_path!r}; nothing to index."
        )

    print("Generating embeddings...")
    embeddings = embed_text(text_chunks)

    print("Building FAISS index...")
    index = build_faiss_index(embeddings)

    print("Retrieving relevant context...")
    context = retrieve_context(question, text_chunks, index)

    print("Loading QA model...")
    qa_pipeline = load_qa_model()

    print("Answering question...")
    answer = answer_question(question, context, qa_pipeline)
    print(f"Q: {question}\nA: {answer}")

if __name__ == "__main__":
    # Book to index and the question to ask about it.
    book_path, question = "pride_and_prejudice.pdf", "Who is the main character?"
    main(book_path=book_path, question=question)


Loading book...
Generating embeddings...
Building FAISS index...
Retrieving relevant context...
Loading QA model...


Device set to use cpu


Answering question...
Q: Who is the main character?
A: Lady Catherine


In [1]:
import os
import sys
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

def load_book(file_path):
    """Read a book from disk and return its contents as a single string.

    Files whose name ends in ``.pdf`` (any letter case) are opened with
    ``PdfReader`` and the text of every page is collected. Any other file is
    read as text, trying UTF-8 first and falling back to ISO-8859-1.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text. PDF pages are joined with a newline so the last
        word of one page is not fused with the first word of the next page.
    """
    if str(file_path).lower().endswith('.pdf'):
        reader = PdfReader(file_path)
        # Defensive: a page that yields a falsy result (no text layer) is
        # treated as empty text instead of breaking the concatenation.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except UnicodeDecodeError:
        # Retry with a single-byte encoding when the file is not valid UTF-8.
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            return file.read()

def split_text(text, chunk_size=512, overlap=0):
    """Split text into chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks. Must be
            non-negative and smaller than ``chunk_size``. The default of 0
            produces disjoint chunks.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by
        single spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the end, so overlapping windows do not
        # emit trailing chunks that are fully contained in the previous one.
        if start + chunk_size >= len(words):
            break
    return chunks

def embed_text(text_chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Encode text chunks into float32 embedding vectors.

    Args:
        text_chunks: Sequence of strings to encode.
        model_name: Name passed to ``SentenceTransformer`` to load the model.

    Returns:
        A float32 NumPy array built from ``model.encode(text_chunks)``. For
        an empty sequence the result has shape ``(0, dim)``, where ``dim`` is
        the model's reported embedding dimension, so callers can still read
        ``shape[1]``.
    """
    model = SentenceTransformer(model_name)
    if not isinstance(text_chunks, str) and len(text_chunks) == 0:
        # Encoding nothing may give an array without a second axis; return an
        # explicitly shaped empty matrix instead.
        return np.empty((0, model.get_sentence_embedding_dimension()), dtype='float32')
    return np.asarray(model.encode(text_chunks), dtype='float32')

def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index and add the given vectors to it.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` holding the vectors.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # Normalise to C-contiguous float32 before handing the data to FAISS.
    vectors = np.ascontiguousarray(embeddings, dtype='float32')
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim), "
            f"got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def retrieve_context(query, text_chunks, index, model_name="sentence-transformers/all-MiniLM-L6-v2", top_k=3):
    """Return the text of the chunks nearest to the query, joined by spaces.

    Args:
        query: The question to search for.
        text_chunks: List of chunk strings; positions are expected to match
            the order in which vectors were added to ``index``.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``.
        model_name: Name passed to ``SentenceTransformer`` to embed the query.
        top_k: Number of neighbours requested from the index.

    Returns:
        The selected chunks joined with single spaces, in the order the index
        returned them. An empty string if no valid neighbour was returned.
    """
    model = SentenceTransformer(model_name)
    query_embedding = model.encode([query]).astype('float32')

    _distances, indices = index.search(query_embedding, top_k)

    # FAISS fills missing neighbours with -1 when the index holds fewer than
    # top_k vectors. Without the lower bound, -1 would pass the check and
    # silently select the last chunk.
    valid_indices = [int(i) for i in indices[0] if 0 <= i < len(text_chunks)]

    return " ".join(text_chunks[i] for i in valid_indices)

def load_qa_model():
    """Build the extractive question-answering pipeline.

    Loads the tokenizer and then the model for the
    ``deepset/roberta-base-squad2`` checkpoint and wraps both in a
    ``"question-answering"`` pipeline.

    Returns:
        The object returned by ``pipeline(...)``.
    """
    checkpoint = "deepset/roberta-base-squad2"
    # Dict entries are evaluated in order: tokenizer first, then model.
    components = {
        "tokenizer": AutoTokenizer.from_pretrained(checkpoint),
        "model": AutoModelForQuestionAnswering.from_pretrained(checkpoint),
    }
    return pipeline("question-answering", **components)

def answer_question(question, context, qa_pipeline):
    """Ask the QA pipeline one question and return only the answer text.

    Args:
        question: The question string.
        context: The passage the answer is extracted from.
        qa_pipeline: Callable accepting ``question=`` and ``context=`` keyword
            arguments and returning a mapping with an ``'answer'`` entry.

    Returns:
        The value stored under ``'answer'`` in the pipeline result.
    """
    return qa_pipeline(question=question, context=context)['answer']

def main(book_path, question):
    """Answer one question about a book and print the result.

    Steps: load the book, split it into word chunks, embed the chunks, build
    a FAISS index, retrieve the chunks closest to the question, load the QA
    model, and print the question together with the extracted answer.

    Args:
        book_path: Path of the book file to load.
        question: The question to answer.

    Raises:
        ValueError: If the book yields no text chunks (for example a PDF
            without an extractable text layer), since nothing can be indexed.
    """
    print("Loading book...")
    text = load_book(book_path)
    text_chunks = split_text(text)
    if not text_chunks:
        # Fail early with a clear message instead of an opaque error later
        # in the embedding or indexing steps.
        raise ValueError(
            f"No text could be extracted from {book_path!r}; nothing to index."
        )

    print("Generating embeddings...")
    embeddings = embed_text(text_chunks)

    print("Building FAISS index...")
    index = build_faiss_index(embeddings)

    print("Retrieving relevant context...")
    context = retrieve_context(question, text_chunks, index)

    print("Loading QA model...")
    qa_pipeline = load_qa_model()

    print("Answering question...")
    answer = answer_question(question, context, qa_pipeline)
    print(f"Q: {question}\nA: {answer}")

if __name__ == "__main__":
    # Book to index and the question to ask about it.
    book_path, question = "pride_and_prejudice.pdf", "Why is marriage such an important topic in the novel?"
    main(book_path=book_path, question=question)





Loading book...
Generating embeddings...
Building FAISS index...
Retrieving relevant context...
Loading QA model...


Device set to use cpu


Answering question...
Q: Why is marriage such an important topic in the novel?
A: forging the right unions


In [2]:
import os
import sys
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

def load_book(file_path):
    """Read a book from disk and return its contents as a single string.

    Files whose name ends in ``.pdf`` (any letter case) are opened with
    ``PdfReader`` and the text of every page is collected. Any other file is
    read as text, trying UTF-8 first and falling back to ISO-8859-1.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text. PDF pages are joined with a newline so the last
        word of one page is not fused with the first word of the next page.
    """
    if str(file_path).lower().endswith('.pdf'):
        reader = PdfReader(file_path)
        # Defensive: a page that yields a falsy result (no text layer) is
        # treated as empty text instead of breaking the concatenation.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except UnicodeDecodeError:
        # Retry with a single-byte encoding when the file is not valid UTF-8.
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            return file.read()

def split_text(text, chunk_size=512, overlap=0):
    """Split text into chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks. Must be
            non-negative and smaller than ``chunk_size``. The default of 0
            produces disjoint chunks.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by
        single spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the end, so overlapping windows do not
        # emit trailing chunks that are fully contained in the previous one.
        if start + chunk_size >= len(words):
            break
    return chunks

def embed_text(text_chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Encode text chunks into float32 embedding vectors.

    Args:
        text_chunks: Sequence of strings to encode.
        model_name: Name passed to ``SentenceTransformer`` to load the model.

    Returns:
        A float32 NumPy array built from ``model.encode(text_chunks)``. For
        an empty sequence the result has shape ``(0, dim)``, where ``dim`` is
        the model's reported embedding dimension, so callers can still read
        ``shape[1]``.
    """
    model = SentenceTransformer(model_name)
    if not isinstance(text_chunks, str) and len(text_chunks) == 0:
        # Encoding nothing may give an array without a second axis; return an
        # explicitly shaped empty matrix instead.
        return np.empty((0, model.get_sentence_embedding_dimension()), dtype='float32')
    return np.asarray(model.encode(text_chunks), dtype='float32')

def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index and add the given vectors to it.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` of dimension ``dim`` holding the vectors.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # Normalise to C-contiguous float32 before handing the data to FAISS.
    vectors = np.ascontiguousarray(embeddings, dtype='float32')
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dim), "
            f"got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def retrieve_context(query, text_chunks, index, model_name="sentence-transformers/all-MiniLM-L6-v2", top_k=3):
    """Return the text of the chunks nearest to the query, joined by spaces.

    Args:
        query: The question to search for.
        text_chunks: List of chunk strings; positions are expected to match
            the order in which vectors were added to ``index``.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``.
        model_name: Name passed to ``SentenceTransformer`` to embed the query.
        top_k: Number of neighbours requested from the index.

    Returns:
        The selected chunks joined with single spaces, in the order the index
        returned them. An empty string if no valid neighbour was returned.
    """
    model = SentenceTransformer(model_name)
    query_embedding = model.encode([query]).astype('float32')

    _distances, indices = index.search(query_embedding, top_k)

    # FAISS fills missing neighbours with -1 when the index holds fewer than
    # top_k vectors. Without the lower bound, -1 would pass the check and
    # silently select the last chunk.
    valid_indices = [int(i) for i in indices[0] if 0 <= i < len(text_chunks)]

    return " ".join(text_chunks[i] for i in valid_indices)

def load_qa_model():
    """Build the extractive question-answering pipeline.

    Loads the tokenizer and then the model for the
    ``deepset/roberta-base-squad2`` checkpoint and wraps both in a
    ``"question-answering"`` pipeline.

    Returns:
        The object returned by ``pipeline(...)``.
    """
    checkpoint = "deepset/roberta-base-squad2"
    # Dict entries are evaluated in order: tokenizer first, then model.
    components = {
        "tokenizer": AutoTokenizer.from_pretrained(checkpoint),
        "model": AutoModelForQuestionAnswering.from_pretrained(checkpoint),
    }
    return pipeline("question-answering", **components)

def answer_question(question, context, qa_pipeline):
    """Ask the QA pipeline one question and return only the answer text.

    Args:
        question: The question string.
        context: The passage the answer is extracted from.
        qa_pipeline: Callable accepting ``question=`` and ``context=`` keyword
            arguments and returning a mapping with an ``'answer'`` entry.

    Returns:
        The value stored under ``'answer'`` in the pipeline result.
    """
    return qa_pipeline(question=question, context=context)['answer']

def main(book_path, question):
    """Answer one question about a book and print the result.

    Steps: load the book, split it into word chunks, embed the chunks, build
    a FAISS index, retrieve the chunks closest to the question, load the QA
    model, and print the question together with the extracted answer.

    Args:
        book_path: Path of the book file to load.
        question: The question to answer.

    Raises:
        ValueError: If the book yields no text chunks (for example a PDF
            without an extractable text layer), since nothing can be indexed.
    """
    print("Loading book...")
    text = load_book(book_path)
    text_chunks = split_text(text)
    if not text_chunks:
        # Fail early with a clear message instead of an opaque error later
        # in the embedding or indexing steps.
        raise ValueError(
            f"No text could be extracted from {book_path!r}; nothing to index."
        )

    print("Generating embeddings...")
    embeddings = embed_text(text_chunks)

    print("Building FAISS index...")
    index = build_faiss_index(embeddings)

    print("Retrieving relevant context...")
    context = retrieve_context(question, text_chunks, index)

    print("Loading QA model...")
    qa_pipeline = load_qa_model()

    print("Answering question...")
    answer = answer_question(question, context, qa_pipeline)
    print(f"Q: {question}\nA: {answer}")

if __name__ == "__main__":
    # Book to index and the question to ask about it.
    book_path, question = "pride_and_prejudice.pdf", "Where does the story of Pride and Prejudice take place?"
    main(book_path=book_path, question=question)


Loading book...
Generating embeddings...
Building FAISS index...
Retrieving relevant context...
Loading QA model...


Device set to use cpu


Answering question...
Q: Where does the story of Pride and Prejudice take place?
A: New Y ork, NY
