In [None]:
# Step 1:
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup

def extract_text_from_epub(epub_path):
    """Return the concatenated plain text of every document item in an EPUB.

    Args:
        epub_path: Path of the EPUB file, passed straight to ``epub.read_epub``.

    Returns:
        A single string: the ``get_text()`` output of each document item, in
        the order ``get_items()`` yields them, each followed by a blank line
        (``"\\n\\n"``). Items of any other type are skipped.
    """
    parsed_book = epub.read_epub(epub_path)
    sections = []

    for entry in parsed_book.get_items():
        # Only document items are parsed as markup; everything else is ignored.
        if entry.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        markup = BeautifulSoup(entry.content, 'html.parser')
        sections.append(markup.get_text() + "\n\n")

    return "".join(sections)

# Example usage: extract the whole book once; later cells read `book_text`.
# NOTE(review): the path below is absolute and machine-specific; point it at your own EPUB before running.
epub_path = "/Users/sparshpatel/Documents/Codes/codes/pythonml/slm/Percy Jackson and the Olympians 5 - The Last Olympian by Rick Riordan (z-lib.org).epub"  # Replace with your actual file
book_text = extract_text_from_epub(epub_path)
print(book_text[:1000])  # Preview the first 1000 characters of the extracted text


In [None]:
# Step 2:

import nltk
from nltk.tokenize import sent_tokenize

# Download the Punkt sentence-tokenizer data that sent_tokenize relies on.
# NLTK 3.9 and later load the "punkt_tab" resource instead of the pickled
# "punkt" models, so fetch both to keep this cell working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

def split_into_chunks(text, max_chunk_size=1024):
    """Group the sentences of ``text`` into chunks of bounded character length.

    Sentences (as produced by ``sent_tokenize``) are packed greedily, joined
    by single spaces, so that each chunk is at most ``max_chunk_size``
    characters long. A sentence is never split: one that is longer than
    ``max_chunk_size`` on its own becomes a chunk by itself and therefore
    exceeds the limit.

    Args:
        text: The text to split.
        max_chunk_size: Maximum chunk length in characters (default 1024).

    Returns:
        A list of non-empty chunk strings in document order; an empty list
        when ``text`` contains no sentences.
    """
    chunks = []
    current = []      # sentences collected for the chunk being built
    current_len = 0   # length of " ".join(current)

    for sentence in sent_tokenize(text):
        # Appending to a non-empty chunk costs one extra separator character.
        projected_len = current_len + len(sentence) + (1 if current else 0)

        # Flush only when there is something to flush. The previous version
        # flushed unconditionally, so an over-long first sentence produced a
        # spurious empty chunk at the start of the result.
        if current and projected_len > max_chunk_size:
            chunks.append(" ".join(current).strip())
            current, current_len = [sentence], len(sentence)
        else:
            current.append(sentence)
            current_len = projected_len

    if current:
        chunks.append(" ".join(current).strip())

    # Guard against whitespace-only sentences collapsing to an empty chunk.
    return [chunk for chunk in chunks if chunk]

# Apply chunking to the extracted book text
chunks = split_into_chunks(book_text)
print(f"Total chunks: {len(chunks)}")
print(chunks[10:15])  # Print chunks 10-14 (a five-chunk sample) for verification


In [None]:
# Alternative chunking approach (disabled): cleans the text before splitting it into chunks, which may lead to different answers
# import re
# import nltk
# from nltk.tokenize import sent_tokenize

# # Download necessary NLTK data
# nltk.download('punkt')

# def clean_text(text):
#     """Removes excessive newlines while preserving structure and removes standalone author mentions."""
#     text = re.sub(r'\n+', '\n', text)  # Replace multiple newlines with a single one
#     text = re.sub(r'\bRick Riordan\b(?!\w)', '', text, flags=re.IGNORECASE)  # Remove only standalone "Rick Riordan"
#     text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
#     return text

# def split_into_chunks(text, max_chunk_size=1024):
#     text = clean_text(text)  # Clean text before chunking
#     sentences = sent_tokenize(text)  # Split into sentences
#     chunks = []
#     chunk = ""

#     for sentence in sentences:
#         if len(chunk) + len(sentence) <= max_chunk_size:
#             chunk += sentence + " "
#         else:
#             chunks.append(chunk.strip())
#             chunk = sentence + " "

#     if chunk:
#         chunks.append(chunk.strip())

#     return chunks

# # Apply cleaning and chunking
# cleaned_text = clean_text(book_text)
# chunks = split_into_chunks(cleaned_text)

# print(f"Total chunks: {len(chunks)}")
# print(chunks[:5])  # Print first 5 chunks for verification


In [3]:
# Step 3:
# Keep only the chunks that contain some non-whitespace text.
chunks = [piece for piece in chunks if piece.strip()]


In [None]:
# Step 4:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; `find_best_chunk` encodes questions with this object.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")  # Higher accuracy


In [None]:
# Step 5:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the same model we used before
# NOTE(review): Step 4 already binds this exact checkpoint to `model`, so this reload looks redundant.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Convert text chunks into embeddings (requested as a tensor via convert_to_tensor=True).
# NOTE(review): Step 6 re-encodes the chunks and rebinds `chunk_embeddings` to a NumPy array.
chunk_embeddings = model.encode(chunks, convert_to_tensor=True)

print("Chunks successfully embedded!")


In [6]:
# Step 6:
from sentence_transformers import SentenceTransformer

# Load the embedding model that `retrieve_best_chunk` uses to encode questions
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Encode the chunks straight to a NumPy array: FAISS consumes NumPy input, so
# there is no need to build a torch tensor first and copy it back to the CPU.
chunk_embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
# FAISS works on float32 vectors; copy=False makes this a no-op when the dtype already matches.
chunk_embeddings = chunk_embeddings.astype("float32", copy=False)

# Build a flat L2 index whose dimensionality matches the embedding width
import faiss
index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
index.add(chunk_embeddings)


In [7]:
# Step 7:
import faiss
# NOTE(review): these two statements repeat the index construction at the end of Step 6;
# running both simply rebinds `index` to a freshly built index over the same `chunk_embeddings`.
index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
index.add(chunk_embeddings)


In [8]:
# Step 8:
from torch.nn.functional import cosine_similarity
import torch

def find_best_chunk(question):
    """Return the chunk whose embedding has the highest cosine similarity to ``question``.

    Reads the notebook globals ``model`` (to encode the question),
    ``chunk_embeddings`` (one row per chunk) and ``chunks`` (the chunk texts,
    in the same order as the rows of ``chunk_embeddings``).

    Args:
        question: The question text to encode.

    Returns:
        The element of ``chunks`` at the index of the best-scoring embedding row.
    """
    # Encode the question
    question_embedding = model.encode(question, convert_to_tensor=True)

    # `chunk_embeddings` is a torch tensor after Step 5 but a NumPy array after
    # Step 6, and torch's cosine_similarity rejects NumPy input. Normalise it to
    # a tensor on the question's device/dtype so either form works.
    corpus_embeddings = torch.as_tensor(
        chunk_embeddings,
        dtype=question_embedding.dtype,
        device=question_embedding.device,
    )

    # Compute similarity with all chunks: make the question an explicit
    # (1, dim) row so it broadcasts against the (n_chunks, dim) matrix.
    similarities = cosine_similarity(
        question_embedding.reshape(1, -1), corpus_embeddings, dim=1
    )

    # Get the index of the most relevant chunk
    best_chunk_idx = torch.argmax(similarities).item()

    return chunks[best_chunk_idx]  # Return the most relevant chunk


In [None]:
# Step 9:
from transformers import pipeline

# Load the Question Answering pipeline with the deepset/roberta-base-squad2 checkpoint.
# `answer_question` calls it with a question and a retrieved chunk as context.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

In [None]:
# Step 10:
def retrieve_best_chunk(question):
    """Return the chunk that the FAISS index ranks nearest to ``question``.

    Reads the notebook globals ``embedding_model`` (to encode the question),
    ``index`` (the FAISS index built over the chunk embeddings) and ``chunks``.

    Args:
        question: The question text to look up.

    Returns:
        The element of ``chunks`` at the position of the top-1 search hit.

    Raises:
        ValueError: If the index returns no hit (FAISS reports a missing
            neighbour with the label -1, e.g. when the index is empty).
    """
    question_embedding = embedding_model.encode([question], convert_to_numpy=True)
    _, indices = index.search(question_embedding, 1)

    best_idx = int(indices[0][0])
    # Without this check a -1 label would index chunks[-1] and silently return
    # the last chunk as if it were the best match.
    if best_idx < 0:
        raise ValueError("FAISS index returned no match; is the index empty?")

    return chunks[best_idx]

def answer_question(question):
    """Answer ``question`` from the most relevant chunk.

    Retrieves one chunk with ``retrieve_best_chunk`` and passes it to
    ``qa_pipeline`` as the context.

    Args:
        question: The question text.

    Returns:
        The ``"answer"`` entry of the pipeline's result.
    """
    context = retrieve_best_chunk(question)
    prediction = qa_pipeline(question=question, context=context)
    return prediction["answer"]

# Run the retrieve-then-answer pipeline on a sample question and print the extracted answer.
question = "Who is the god of underworld??"
print(answer_question(question))
