In [36]:
# Vector store, embedding wrapper, document type and JSON support used by the cells below.
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
# Embedding model that is handed to Chroma when the persisted store is opened below.
# NOTE(review): presumably this has to be the same model the store was built with,
# and e5 models are usually fed "query: " / "passage: " prefixed text — confirm
# against the notebook that created the index.
embedding_function = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")
from langchain.schema import Document
import json

<h2>LOADING THE SAVED CHUNKS AND THE CHROMA VECTOR DATABASE</h2>

In [37]:
def load_chunks(filename="preprocessed_chunks.json"):
    """Rebuild the pre-processed document chunks from a JSON file.

    Args:
        filename: Path to a JSON file holding a list of objects, each with a
            ``"page_content"`` and a ``"metadata"`` entry.

    Returns:
        list: One ``Document`` per entry, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
        KeyError: If an entry lacks ``"page_content"`` or ``"metadata"``.
    """
    # Decode explicitly as UTF-8 (the JSON interchange encoding). Without it,
    # open() falls back to the platform locale (e.g. cp1252 on Windows), which
    # corrupts or rejects non-ASCII text such as typographic quotes.
    with open(filename, "r", encoding="utf-8") as f:
        chunk_dicts = json.load(f)
    return [
        Document(page_content=c["page_content"], metadata=c["metadata"])
        for c in chunk_dicts
    ]

# Load the cached chunks (load_chunks' default file, preprocessed_chunks.json)
# instead of reprocessing the PDF; `chunks` later feeds the BM25 keyword retriever.
chunks = load_chunks()

# Open the Chroma database persisted under ./chroma_db_pedition, using
# `embedding_function` as the store's embedding function.
vectorstore = Chroma(persist_directory="./chroma_db_pedition", embedding_function=embedding_function)

<h2>USING BOTH VECTOR SIMILARITY AND KEYWORD SIMILARITY</h2>

In [38]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Vector (embedding similarity) retriever over the Chroma store; returns 5 documents per query.
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Keyword retriever (BM25), built in memory from the chunks loaded from JSON;
# also limited to 5 documents per query.
bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k = 5

# Hybrid ensemble retriever combining both result lists.
# The weights are listed in the same order as `retrievers`:
# 0.7 for the vector retriever, 0.3 for BM25.
ensemble_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=[0.7, 0.3]  # relative influence of each retriever; tune if needed
)

<h2>HYBRID RETRIEVAL FOLLOWED BY CROSS-ENCODER RE-RANKING</h2>

In [39]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np


# Load the cross-encoder used by rerank_chunks to score (query, sentence) pairs.
# Alternative checkpoint kept for reference (author's note: use it when a more
# precise result is needed):
# rerank_model_name = "BAAI/bge-reranker-large"
rerank_model_name = "BAAI/bge-reranker-base"  # author's note: use this when a faster result is needed
# `tokenizer` and `model` are module-level globals read by rerank_chunks.
tokenizer = AutoTokenizer.from_pretrained(rerank_model_name)
model = AutoModelForSequenceClassification.from_pretrained(rerank_model_name)


from nltk.tokenize import sent_tokenize
from langchain.schema import Document
import nltk
# Fetch the Punkt data used by sent_tokenize; the recorded cell output shows the
# package was already up to date, so this was a no-op on the author's machine.
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt' —
# confirm against the installed NLTK version.
nltk.download('punkt')

def rerank_chunks(query, chunks, top_k=3):
    """Score every sentence of the retrieved chunks against the query and keep the best.

    Each chunk is split into sentences; every non-blank sentence becomes its own
    ``Document`` carrying the parent chunk's metadata, is paired with the query
    and is scored by the module-level cross-encoder (``tokenizer`` / ``model``).

    Args:
        query: The user question.
        chunks: Iterable of documents exposing ``page_content`` and ``metadata``.
        top_k: Maximum number of sentences to return.

    Returns:
        tuple: ``(top_sentences, top_scores)`` — a list of sentence ``Document``
        objects ordered from highest to lowest score, and a NumPy array with
        their raw logits in the same order. Both are empty when there is
        nothing to score (no chunks, or only blank sentences).
    """
    # Split paragraphs into sentences while maintaining metadata
    sentences = []
    for chunk in chunks:
        try:
            chunk_sentences = sent_tokenize(chunk.page_content)
        except Exception:
            # Best-effort fallback to naive splitting if NLTK fails (e.g. the
            # Punkt data is missing). `Exception` rather than a bare `except`
            # so KeyboardInterrupt / SystemExit still propagate.
            chunk_sentences = chunk.page_content.split('. ')

        for sent in chunk_sentences:
            text = sent.strip()
            if not text:
                # A blank "sentence" carries no evidence; do not let it
                # compete for a top-k slot.
                continue
            sentences.append(Document(
                page_content=text,
                metadata=chunk.metadata  # Preserve original metadata
            ))

    if not sentences:
        # The tokenizer cannot encode an empty batch; report "no evidence"
        # with the same return types as the normal path.
        return [], np.array([], dtype=np.float32)

    # Create query-sentence pairs for scoring
    pairs = [[query, doc.page_content] for doc in sentences]
    inputs = tokenizer(pairs, padding=True, truncation=True,
                       return_tensors="pt", max_length=512)

    # Inference only: no gradients needed.
    with torch.no_grad():
        scores = model(**inputs).logits.view(-1).float()

    # Sort sentences by their relevance scores (highest first) and cut to top_k.
    top_indices = scores.argsort(descending=True)[:top_k]
    # .tolist() yields plain ints for list indexing instead of 0-d tensors.
    top_sentences = [sentences[i] for i in top_indices.tolist()]
    top_scores = scores[top_indices].numpy()

    return top_sentences, top_scores


def calculate_confidence(scores):
    """Turn the raw scores from `rerank_chunks` into a single confidence value.

    A sigmoid is applied to every score and the largest resulting probability
    is returned.

    Args:
        scores: Sequence or array of raw relevance scores (logits).

    Returns:
        float: The highest sigmoid probability, or ``0.0`` when ``scores`` is
        empty (no evidence means no confidence).
    """
    if len(scores) == 0:
        # np.max raises ValueError on an empty array.
        return 0.0
    probabilities = torch.sigmoid(torch.tensor(scores)).numpy()
    return float(np.max(probabilities))  # Return the highest confidence score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anandhu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<h2>LOADING HUGGINGFACE MODELS</h2>

In [29]:
# from langchain_community.llms import HuggingFaceHub
# from AI_GATEWAYS import huggingface_api_key

# llm = HuggingFaceHub(
#     repo_id="meta-llama/Llama-3.3-70B-Instruct",
#     model_kwargs={"temperature": 0.2, "max_length": 1024},
#     huggingfacehub_api_token=huggingface_api_key
# )

In [40]:
from langchain_community.llms import HuggingFaceHub
# The API token is imported from the local AI_GATEWAYS module rather than being
# written into the notebook.
from AI_GATEWAYS import huggingface_api_key

# Hosted LLM used as the generation step of qa_chain.
# NOTE(review): the recorded outputs and the "### Answer:" split in ask_question
# suggest the returned text can include the prompt itself — confirm whether the
# endpoint can be asked to return only the completion.
llm = HuggingFaceHub(
    repo_id="deepseek-ai/DeepSeek-R1",
    model_kwargs={"temperature": 0.2, "max_length": 1024},
    huggingfacehub_api_token=huggingface_api_key
)

<h2>LLM IMPLEMENTATION USING MEMORY AND THRESHOLD</h2>

In [41]:
from langchain.memory import ConversationBufferWindowMemory
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Conversation memory: a sliding window over the last k=10 exchanges, read
# under the "chat_history" key and written through the "answer" output key.
#
# return_messages=False makes the memory return the history as plain
# "Human: ... / AI: ..." text. The history is interpolated into a *string*
# prompt template below; with return_messages=True the model was shown the
# Python repr of a message list
# ("[HumanMessage(content='...', additional_kwargs={}, response_metadata={}), ...]"),
# which wastes tokens and adds noise to the prompt.
memory = ConversationBufferWindowMemory(
    k=10,
    memory_key="chat_history",
    return_messages=False,
    output_key="answer"
)


# Prompt with three slots: {chat_history}, {context} and {question}.
# ask_question relies on the literal "### Answer:" marker to strip an echoed
# prompt from the model output, so keep the marker in sync with that function.
qa_prompt = ChatPromptTemplate.from_template(
    """You are a helpful AI assistant. Answer the user's question based on the conversation history and the provided Odyssey context. 

    ### Chat History:
    {chat_history}

    ### Context:
    {context}

    ### Question: 
    {question}

    ### Answer:
    """
)



# Chain: map the caller's dict ("chunks", "question", "chat_history") onto the
# prompt variables, render the prompt, call the LLM and return plain text.
qa_chain = (
    {"context": lambda x: x["chunks"], 
     "question": lambda x: x["question"],
     "chat_history": lambda x: x["chat_history"]}
    | qa_prompt
    | llm
    | StrOutputParser()
)

In [42]:
def format_chunks(chunks):
    """Render chunks as "Page <page>: <text>" entries separated by blank lines.

    Each chunk must expose ``metadata['page']`` and ``page_content``.
    """
    rendered = []
    for chunk in chunks:
        page = chunk.metadata['page']
        rendered.append(f"Page {page}: {chunk.page_content}")
    return "\n\n".join(rendered)



def get_threshold_response():
    """Return the fixed notice used when an answer falls below the confidence threshold."""
    low_confidence_notice = (
        "I'm not entirely confident about this answer. "
        "Would you like to rephrase or ask about another topic?"
    )
    return low_confidence_notice



# ask_question with confidence scoring and source reporting
def ask_question(question, confidence_threshold=0.65):
    """Answer a question from retrieved context and report a confidence score.

    Pipeline: hybrid retrieval -> sentence-level re-ranking -> LLM answer ->
    conversation-memory update -> formatted response.

    Args:
        question: The user's question.
        confidence_threshold: Minimum confidence (as returned by
            ``calculate_confidence``) for the response to be returned without
            the low-confidence notice.

    Returns:
        str: The answer followed by "Confidence: ..%" and "Sources: Pages ..."
        lines, prefixed with the low-confidence notice when the confidence is
        below ``confidence_threshold``. When retrieval or re-ranking yields
        nothing, only the notice is returned and neither the LLM nor the
        conversation memory is touched.
    """
    # Retrieve context (`invoke` is the Runnable entry point that supersedes
    # the deprecated `get_relevant_documents`).
    initial_chunks = ensemble_retriever.invoke(question)
    if not initial_chunks:
        # Nothing to ground an answer on; re-ranking an empty batch would fail.
        return get_threshold_response()

    final_chunks, relevance_scores = rerank_chunks(question, initial_chunks, top_k=3)
    if not final_chunks:
        return get_threshold_response()

    # Calculate confidence
    confidence = calculate_confidence(relevance_scores)

    # Generate answer
    raw_answer = qa_chain.invoke({
        "question": question,
        "chunks": format_chunks(final_chunks),
        "chat_history": memory.load_memory_variables({})["chat_history"]
    })

    # Keep only what follows the last "### Answer:" marker, in case the model
    # echoed the prompt back.
    answer = raw_answer.split("### Answer:")[-1].strip()

    # Store interaction in memory
    memory.save_context({"question": question}, {"answer": answer})

    # Unique source pages in relevance order. dict.fromkeys de-duplicates while
    # keeping first-seen order; a set would print the pages in arbitrary order.
    sources = list(dict.fromkeys(c.metadata["page"] for c in final_chunks))
    response = f"{answer}\n\nConfidence: {confidence:.0%}\nSources: Pages {', '.join(map(str, sources))}"

    return response if confidence >= confidence_threshold else f"{get_threshold_response()}\n\n{response}"

<h2>LLAMA RESPONSE</h2>

In [None]:
# print(ask_question("Who is Telemachus?"))



I'm not entirely confident about this answer. Would you like to rephrase or ask about another topic?

Telemachus is the son of Odysseus and Penelope in Homer's Odyssey. He is the main character in the first four books of the Odyssey, where he sets out on a journey to find news of his father and to establish his own identity as a hero.

Confidence: 1%
Sources: Pages 96, 282, 195


In [None]:
# Follow-up question: "his" is only resolvable through the chat history kept in `memory`.
# This cell has no execution count (In [None]); the text below it is output kept from an earlier run.
print(ask_question("And what about his relationship with Odysseus?"))



# BEFORE USING THE SPLIT FUNCTION TO RETURN THE ANSWER
# Human: You are a helpful AI assistant. Answer the user's question based on the conversation history and the provided Odyssey context. 

#     ### Chat History:
#     [HumanMessage(content='Who is Telemachus?', additional_kwargs={}, response_metadata={}), AIMessage(content="Human: You are a helpful AI assistant. Answer the user's question based on the conversation history and the provided Odyssey context. \n\n    ### Chat History:\n    []\n\n    ### Context:\n    Page 282: Who has set my bed otherwhere?\n\nPage 96: Who gave thee this raiment?\n\nPage 195: Howbeit, Olympian Zeus, that dwells in the clear sky, knows hereof, whether or no he will fulfill for them the evil day before their marriage.” Now even as he spake, a bird flew out on the right, a hawk, the swift messenger of Apollo.\n\n    ### Question: \n    Who is Telemachus?\n\n    ### Answer:\n    Respond in complete sentences and cite text evidence. If unsure, say so.\n     Telemachus is the son of Odysseus and Penelope.", additional_kwargs={}, response_metadata={})]

#     ### Context:
#     Page 292: Then he communed with his heart and soul, whether he should fall on his father’s neck and kiss him, and tell him all, how he had returned and come to his own country, or whether he should first question him and prove him in every word.

# Page 260: Would ye stand on the side of the wooers or of Odysseus?

# Page 281: Meanwhile, the house-dame Eurynome had bathed the great-hearted Odysseus within his house, and anointed him with olive-oil, and cast about him a goodly mantle and a doublet.

#     ### Question: 
#     And what about his relationship with Odysseus?

#     ### Answer:
#      Telemachus is the son of Odysseus and Penelope.

# Confidence: 98%
# Sources: Pages 281, 292, 260



Telemachus' relationship with Odysseus is central to the Odyssey. When the epic begins, Telemachus is a young man, around 20 years old, who has never known his father, as Odysseus has been away for 20 years fighting in the Trojan War. Telemachus is initially portrayed as timid and inexperienced, but he grows in courage and confidence throughout the story. He sets out on a journey to find news of his father and to establish his own identity as a hero. When Odysseus finally returns to Ithaca, he is initially disguised as a beggar, and Telemachus does not recognize him. However, they eventually reunite, and Odysseus reveals his true identity to Telemachus. Their relationship is one of love, respect, and mutual understanding, as they work together to reclaim their kingdom from the suitors who have been pursuing Penelope.

Confidence: 98%
Sources: Pages 281, 292, 260


In [None]:
# Out-of-scope question used to exercise the low-confidence notice.
# This cell has no execution count (In [None]); the text below it is output kept from an earlier run.
print(ask_question("Did Odysseus use a lightsaber?"))


# I'm not entirely confident about this answer. Would you like to rephrase or ask about another topic?

# Human: You are a helpful AI assistant. Answer the user's question based on the conversation history and the provided Odyssey context. 

#     ### Chat History:
#     [HumanMessage(content='Who is Telemachus?', additional_kwargs={}, response_metadata={}), AIMessage(content="Human: You are a helpful AI assistant. Answer the user's question based on the conversation history and the provided Odyssey context. \n\n    ### Chat History:\n    []\n\n    ### Context:\n    Page 282: Who has set my bed otherwhere?\n\nPage 96: Who gave thee this raiment?\n\nPage 195: Howbeit, Olympian Zeus, that dwells in the clear sky, knows hereof, whether or no he will fulfill for them the evil day before their marriage.” Now even as he spake, a bird flew out on the right, a hawk, the swift messenger of Apollo.\n\n    ### Question: \n    Who is Telemachus?\n\n    ### Answer:\n    Respond in complete sentences and cite text evidence. If unsure, say so.\n     Telemachus is the son of Odysseus and Penelope.", additional_kwargs={}, response_metadata={}), HumanMessage(content='And what about his relationship with Odysseus?', additional_kwargs={}, response_metadata={}), AIMessage(content='Human: You are a helpful AI assistant. Answer the user\'s question based on the conversation history and the provided Odyssey context. \n\n    ### Chat History:\n    [HumanMessage(content=\'Who is Telemachus?\', additional_kwargs={}, response_metadata={}), AIMessage(content="Human: You are a helpful AI assistant. Answer the user\'s question based on the conversation history and the provided Odyssey context. \\n\\n    ### Chat History:\\n    []\\n\\n    ### Context:\\n    Page 282: Who has set my bed otherwhere?\\n\\nPage 96: Who gave thee this raiment?\\n\\nPage 195: Howbeit, Olympian Zeus, that dwells in the clear sky, knows hereof, whether or no he will fulfill for them the evil day before their marriage.” Now even as he spake, a bird flew out on the right, a hawk, the swift messenger of Apollo.\\n\\n    ### Question: \\n    Who is Telemachus?\\n\\n    ### Answer:\\n    Respond in complete sentences and cite text evidence. 
If unsure, say so.\\n     Telemachus is the son of Odysseus and Penelope.", additional_kwargs={}, response_metadata={})]\n\n    ### Context:\n    Page 292: Then he communed with his heart and soul, whether he should fall on his father’s neck and kiss him, and tell him all, how he had returned and come to his own country, or whether he should first question him and prove him in every word.\n\nPage 260: Would ye stand on the side of the wooers or of Odysseus?\n\nPage 281: Meanwhile, the house-dame Eurynome had bathed the great-hearted Odysseus within his house, and anointed him with olive-oil, and cast about him a goodly mantle and a doublet.\n\n    ### Question: \n    And what about his relationship with Odysseus?\n\n    ### Answer:\n    Respond in complete sentences and cite text evidence. If unsure, say so.\n     Telemachus is the son of Odysseus and Penelope.', additional_kwargs={}, response_metadata={})]

#     ### Context:
#     Page 268: Thence he took out four shields and eight spears, and four helmets of bronze, with thick plumes of horse hair, and he started to bring them and came quickly to his father.

# Page 12: Odysseus was the King of Ithaca, a small and rugged island on the western coast of Greece.

# Page 268: As for him he girt his fourfold shield about his shoulders and bound on his mighty head a well wrought helmet, with horse hair crest, and terribly the plume waved aloft.

#     ### Question: 
#     Did Odysseus use a lightsaber?

#     ### Answer:
#      No, Odysseus did not use a lightsaber. The Odyssey describes Odysseus as using a spear, shield, and helmet, but there is no mention of a lightsaber (Page 268: "four shields and eight spears, and four helmets of bronze").

# Confidence: 37%
# Sources: Pages 268, 12



I'm not entirely confident about this answer. Would you like to rephrase or ask about another topic?

No, Odysseus did not use a lightsaber. Lightsabers are a fictional weapon from the Star Wars universe, while Odysseus is a character from Homer's Odyssey, set in ancient Greece. The context provided includes descriptions of Odysseus' shield, helmet, and spear, but no mention of a lightsaber.

Confidence: 37%
Sources: Pages 268, 12


<h2>DEEPSEEK RESPONSE</h2>

In [43]:
# First call after `memory` was created, so the chat history is empty for this turn.
print(ask_question("Who is Telemachus?"))



I'm not entirely confident about this answer. Would you like to rephrase or ask about another topic?

Telemachus is the son of Odysseus and Penelope in Homer's Odyssey. He is the main character in the first four books of the Odyssey, where he sets out on a journey to find news of his father and to establish his own identity as a hero.

Confidence: 1%
Sources: Pages 96, 282, 195


In [44]:
# Follow-up turn: "his" is only resolvable through the chat history kept in `memory`.
print(ask_question("And what about his relationship with Odysseus?"))



Telemachus' relationship with Odysseus is central to the Odyssey. When the epic begins, Telemachus is a young man, around 20 years old, who has never known his father, as Odysseus has been away for 20 years fighting in the Trojan War. Telemachus is initially portrayed as timid and inexperienced, but he grows in courage and confidence throughout the story. He sets out on a journey to find news of his father and to establish his own identity as a hero. When Odysseus finally returns to Ithaca, he is initially disguised as a beggar, and Telemachus does not recognize him. However, they eventually reunite, and Odysseus reveals his true identity to Telemachus. Their relationship is one of love, respect, and mutual understanding, as they work together to reclaim their kingdom from the suitors who have been pursuing Penelope.

Confidence: 98%
Sources: Pages 281, 292, 260


In [45]:
# Out-of-scope question: the recorded confidence (37%) is below the default
# 0.65 threshold, so the response starts with the low-confidence notice.
print(ask_question("Did Odysseus use a lightsaber?"))



I'm not entirely confident about this answer. Would you like to rephrase or ask about another topic?

No, Odysseus did not use a lightsaber. Lightsabers are a fictional weapon from the Star Wars universe, while Odysseus is a character from Homer's Odyssey, set in ancient Greece. The context provided includes descriptions of Odysseus' shield, helmet, and spear, but no mention of a lightsaber.

Confidence: 37%
Sources: Pages 268, 12
