<h2>PARAGRAPH SPLITTER WITHOUT RECURSIVE CHARACTER TEXT SPLITTER</h2>

In [None]:
import re
from langchain.document_loaders import PyPDFLoader

def split_by_paragraph(text):
    """Split text into paragraphs separated by blank lines.

    A paragraph break is a newline, optional whitespace, then one or more
    further newlines. Every piece is stripped and empty pieces are dropped.
    """
    non_empty = []
    for piece in re.split(r'\n\s*\n+', text.strip()):
        trimmed = piece.strip()
        if trimmed:
            non_empty.append(trimmed)
    return non_empty

def clean_text(text):
    """Replace each run of newline, tab, or carriage-return characters with one space."""
    control_run = re.compile(r'[\n\t\r]+')
    return control_run.sub(' ', text)

# Load textbook (PDF)
loader = PyPDFLoader("The-Odyssey.pdf")
docs = loader.load()

# Split each document into paragraphs FIRST, then clean every paragraph.
# Cleaning before splitting replaces all newlines with spaces, which removes
# the blank lines split_by_paragraph looks for and leaves one chunk per document.
# NOTE(review): if the extracted PDF text contains no blank lines between
# paragraphs, each document will still come out as a single chunk - confirm
# against the loader output.
paragraph_chunks = []
for doc in docs:
    for para in split_by_paragraph(doc.page_content):
        cleaned_para = clean_text(para).strip()
        if cleaned_para:
            # Copy the metadata so chunks from the same document do not share one dict
            paragraph_chunks.append(
                {"page_content": cleaned_para, "metadata": dict(doc.metadata)}
            )

# Convert to LangChain Document objects
from langchain.schema import Document
chunks = [Document(page_content=para["page_content"], metadata=para["metadata"]) for para in paragraph_chunks]

<h2>SAVING THE CHUNKS AS A JSON FILE</h2>

In [None]:
import json

def save_chunks(chunks, filename="preprocessed_chunks.json"):
    """Write the chunks to `filename` as a JSON list.

    Each chunk becomes an object with its `page_content` and `metadata`.
    """
    serializable = []
    for chunk in chunks:
        serializable.append({"page_content": chunk.page_content, "metadata": chunk.metadata})
    with open(filename, 'w') as out_file:
        out_file.write(json.dumps(serializable))

# Run this once after preprocessing; with no filename given, the chunks are
# written to the default "preprocessed_chunks.json"
save_chunks(chunks)

<h2>TESTING WITH VECTORIZING AFTER PARAGRAPH SPLITTER INSTEAD OF RECURSIVE CHARACTER TEXT SPLITTER</h2>

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
embedding_function = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")
import tqdm

# Number of chunks sent to the vector store per add_texts call
BATCH_SIZE = 10

# Initialize Chroma vector store
vectorstore = Chroma(persist_directory="./chroma_db_pedition", embedding_function=embedding_function)

# Extract text and metadata from the document chunks (parallel lists)
texts = [doc.page_content for doc in chunks]
metadatas = [doc.metadata for doc in chunks]

# Show progress while adding texts to ChromaDB.
# `tqdm` is imported as a module above, so the progress-bar class is `tqdm.tqdm`;
# calling the module itself raises "TypeError: 'module' object is not callable".
# NOTE(review): re-running this cell against an existing persist directory
# presumably adds the same texts again - confirm before re-executing.
for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE), desc="Storing documents in ChromaDB", unit="batch"):
    vectorstore.add_texts(texts=texts[i:i+BATCH_SIZE], metadatas=metadatas[i:i+BATCH_SIZE])  # Adding in batches

# Persist the vector store to disk
vectorstore.persist()

# Observed runtime: 9 minutes 38 seconds

<h2>LOADING THE SAVED CHUNKS AND THE CHROMA VECTOR DATABASE</h2>

In [28]:
def load_chunks(filename="preprocessed_chunks.json"):
    """Read the JSON chunk file and rebuild one Document per stored entry."""
    with open(filename, 'r') as in_file:
        records = json.load(in_file)
    documents = []
    for record in records:
        documents.append(Document(page_content=record["page_content"], metadata=record["metadata"]))
    return documents

# Load chunks from the saved JSON file instead of reprocessing the PDF
chunks = load_chunks()

# Load the stored vector database from its persist directory.
# NOTE(review): this cell has no imports of its own; `Chroma`, `embedding_function`,
# and the `json`/`Document` names used by load_chunks come from earlier cells,
# so those cells must have run first on a fresh kernel.
vectorstore = Chroma(persist_directory="./chroma_db_pedition", embedding_function=embedding_function)

<h2>USING BOTH VECTOR SIMILARITY AND KEYWORD SIMILARITY</h2>

In [29]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Both retrievers return the same number of candidates
candidates_per_retriever = 5

# Vector retriever backed by the Chroma store
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": candidates_per_retriever})

# Keyword retriever (BM25) built over the same chunks
bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k = candidates_per_retriever

# Hybrid ensemble retriever; weights are listed in the same order as the retrievers
hybrid_members = [vector_retriever, bm25_retriever]
hybrid_weights = [0.7, 0.3]  # tune if needed
ensemble_retriever = EnsembleRetriever(retrievers=hybrid_members, weights=hybrid_weights)

<h2>HYBRID RETRIEVAL FOLLOWED BY RE-RANKING</h2>

In [30]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np




# Load the cross-encoder re-ranking model and its tokenizer; both are used as
# module-level globals by rerank_chunks.
# Alternative checkpoint, to use when a more precise result is needed:
# rerank_model_name = "BAAI/bge-reranker-large"
rerank_model_name = "BAAI/bge-reranker-base"  # use this when a faster result is needed
tokenizer = AutoTokenizer.from_pretrained(rerank_model_name)
model = AutoModelForSequenceClassification.from_pretrained(rerank_model_name)

def rerank_chunks(query, chunks, top_k=3):
    """Re-rank candidate chunks against the query with the cross-encoder.

    Args:
        query: The user question.
        chunks: Candidate documents; each must expose `page_content`.
        top_k: Maximum number of chunks to return.

    Returns:
        A tuple `(top_chunks, top_scores)`: the best-scoring chunks in
        descending score order, and a NumPy array holding their raw model
        logits (no sigmoid applied here).
    """
    # Nothing to score: avoid handing an empty batch to the tokenizer and model.
    if not chunks:
        return [], np.empty(0, dtype=np.float32)

    pairs = [[query, chunk.page_content] for chunk in chunks]
    # Each (query, chunk) pair is padded to a common length and cut off at 512 tokens.
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors="pt", max_length=512)

    with torch.no_grad():
        # Flatten the logits to one score per pair.
        scores = model(**inputs).logits.view(-1).float()

    top_indices = scores.argsort(descending=True)[:top_k]
    # Convert to plain ints before indexing the Python list.
    top_chunks = [chunks[i] for i in top_indices.tolist()]
    top_scores = scores[top_indices].numpy()  # Scores for the selected chunks, same order

    return top_chunks, top_scores





# RERANK WITH SENTENCE SPLITTER FOR A REDUCED TOKEN COUNT, BUT IT REQUIRES INTENSE COMPUTATION
# from nltk.tokenize import sent_tokenize
# from langchain.schema import Document
# import nltk
# nltk.download('punkt')

# def rerank_chunks(query, chunks, top_k=3):
#     # Split paragraphs into sentences while maintaining metadata
#     sentences = []
#     for chunk in chunks:
#         try:
#             chunk_sentences = sent_tokenize(chunk.page_content)
#         except:
#             # Fallback for simple sentence splitting if NLTK fails
#             chunk_sentences = chunk.page_content.split('. ')
        
#         for sent in chunk_sentences:
#             sentences.append(Document(
#                 page_content=sent.strip(),
#                 metadata=chunk.metadata  # Preserve original metadata
#             ))

#     # Create query-sentence pairs for scoring
#     pairs = [[query, doc.page_content] for doc in sentences]
#     inputs = tokenizer(pairs, padding=True, truncation=True, 
#                       return_tensors="pt", max_length=512)

#     with torch.no_grad():
#         scores = model(**inputs).logits.view(-1).float()

#     # Sort sentences by their relevance scores
#     sorted_indices = scores.argsort(descending=True)
#     top_sentences = [sentences[i] for i in sorted_indices[:top_k]]
#     top_scores = scores[sorted_indices[:top_k]].numpy()

#     return top_sentences, top_scores





def calculate_confidence(scores):
    """Turn re-ranker scores into a single confidence value.

    Applies a sigmoid to the scores produced by `rerank_chunks` and returns
    the highest resulting probability.

    Args:
        scores: Array-like of raw relevance scores (logits).

    Returns:
        The highest sigmoid probability as a float, or 0.0 when `scores`
        is empty (no chunk was scored, so nothing supports the answer).
    """
    score_tensor = torch.as_tensor(scores)
    # Taking the max of an empty array raises ValueError; report zero confidence instead.
    if score_tensor.numel() == 0:
        return 0.0
    probabilities = torch.sigmoid(score_tensor).numpy()
    return float(np.max(probabilities))  # Return the highest confidence score

<h2>LOADING HUGGINGFACE MODELS</h2>

In [31]:
from langchain_community.llms import HuggingFaceHub
from AI_GATEWAYS import huggingface_api_key

# LLM accessed through HuggingFaceHub; the API token is imported from the
# local AI_GATEWAYS module rather than written into the notebook.
llm = HuggingFaceHub(
    repo_id="deepseek-ai/DeepSeek-R1",
    # Alternative model that can be swapped in:
    # repo_id="meta-llama/Llama-3.3-70B-Instruct",
    model_kwargs={"temperature": 0.2, "max_length": 1024},
    huggingfacehub_api_token=huggingface_api_key
)

<h2>LLM IMPLEMENTATION USING MEMORY AND THRESHOLD</h2>

In [32]:
from langchain.memory import ConversationBufferWindowMemory
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Create windowed conversation memory with k=10 (an earlier version of this
# comment said 3 exchanges; the code uses 10).
# memory_key matches the {chat_history} prompt variable, and output_key matches
# the "answer" key that ask_question passes to memory.save_context.
memory = ConversationBufferWindowMemory(
    k=10,
    memory_key="chat_history",
    return_messages=True,
    output_key="answer"
)


# Prompt template with three placeholders: {chat_history}, {context}, {question}.
# ask_question splits the model output on the "### Answer:" marker, so that
# marker text must stay in sync with the template below.
qa_prompt = ChatPromptTemplate.from_template(
    """You are a helpful AI assistant. Answer the user's question based on the conversation history and the provided Odyssey context. 

    ### Chat History:
    {chat_history}

    ### Context:
    {context}

    ### Question: 
    {question}

    ### Answer:
    """
)


# Question-answering chain: map the input dict onto the prompt variables,
# fill the prompt, call the LLM, and parse the result to a plain string.
# Note that the input key "chunks" feeds the {context} placeholder.
qa_chain = (
    {"context": lambda x: x["chunks"], 
     "question": lambda x: x["question"],
     "chat_history": lambda x: x["chat_history"]}
    | qa_prompt
    | llm
    | StrOutputParser()
)

In [33]:
def format_chunks(chunks):
    """Render chunks as "Page <n>: <text>" entries separated by blank lines."""
    rendered = []
    for chunk in chunks:
        rendered.append(f"Page {chunk.metadata['page']}: {chunk.page_content}")
    return "\n\n".join(rendered)


# Modified ask_question function with confidence scoring
def ask_question(question):
    """Answer a question from the indexed text, using the conversation memory.

    Retrieves candidates with the ensemble retriever, keeps the three best
    after re-ranking, asks the LLM, and stores the exchange in `memory`.

    Args:
        question: The user's question as a string.

    Returns:
        A string containing the answer, a confidence percentage, and the
        source page numbers of the chunks given to the LLM.
    """
    # Retrieve context
    initial_chunks = ensemble_retriever.get_relevant_documents(question)
    final_chunks, relevance_scores = rerank_chunks(question, initial_chunks, top_k=3)

    # Calculate confidence
    confidence = calculate_confidence(relevance_scores)

    # Generate answer
    raw_answer = qa_chain.invoke({
        "question": question,
        "chunks": format_chunks(final_chunks),
        "chat_history": memory.load_memory_variables({})["chat_history"]
    })

    # Keep only the text after the last "### Answer:" marker, in case the
    # model output repeats the prompt
    answer = raw_answer.split("### Answer:")[-1].strip()

    # Store interaction in memory
    memory.save_context({"question": question}, {"answer": answer})

    # Add confidence and sources. dict.fromkeys drops duplicate pages while
    # keeping the re-ranked (most relevant first) order; a set would list the
    # pages in arbitrary order.
    sources = list(dict.fromkeys(c.metadata["page"] for c in final_chunks))
    response = f"{answer}\n\nConfidence: {confidence:.0%}\nSources: Pages {', '.join(map(str, sources))}"

    return response

<h2>DEEPSEEK RESPONSE</h2>

In [None]:
print(ask_question("Who is Telemachus?"))

# Took only 10.1 seconds, whereas the variant with the sentence re-ranker took 37 seconds
# Second run: 8.9 seconds
# Third run: 7 seconds



Telemachus is the son of Odysseus and Penelope. He is the main character in Homer's Odyssey, and the story follows his journey to find out what happened to his father, who has been missing for many years.

Confidence: 1%
Sources: Pages 184, 41, 186


In [None]:
# Follow-up question: "his" refers back to the previous exchange
print(ask_question("And what about his relationship with Odysseus?"))

# Took only 17.7 seconds, whereas the variant with the sentence re-ranker took 42 seconds
# Second run: 12.4 seconds
# Third run: 6.3 seconds



Telemachus is the son of Odysseus and Penelope. He is the main character in Homer's Odyssey, and the story follows his journey to find out what happened to his father, who has been missing for many years.

Confidence: 26%
Sources: Pages 97, 292, 230


In [None]:
# Question about something that does not appear in the source text
print(ask_question("Did Odysseus use a lightsaber?"))

# Took only 10.4 seconds, whereas the variant with the sentence re-ranker took 57 seconds
# Second run: 16.6 seconds
# Third run: 6.5 seconds



No, Odysseus did not use a lightsaber. Lightsabers are a fictional weapon from the Star Wars universe, while Odysseus is a character from Homer's Odyssey, set in ancient Greece. Odysseus used a bow, arrows, and a sword in the story.

Confidence: 42%
Sources: Pages 224, 268, 79


In [None]:
# Follow-up question: "both of their" refers to characters from earlier exchanges
print(ask_question("what is both of their relationship with penelope?"))



Telemachus is the son of Odysseus and Penelope. Odysseus is the husband of Penelope.

Confidence: 3%
Sources: Pages 279, 263, 255
