In [130]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
embedding_function = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")

In [131]:
# Load the stored vector database
vectorstore = Chroma(persist_directory="./chroma_db_pedition", embedding_function=embedding_function)

# Retrieve the top 3 most similar chunks for a user query
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
relevant_chunks = retriever.get_relevant_documents("What challenge did Penelope use to test the suitors, and how did Odysseus prove his identity?")


# THE BELOW IS THE CORRECT REPLY FOR THE ABOVE ASKED ONE
''' 
Penelope devised a challenge where the suitors had to string Odysseus' great bow and shoot an arrow through twelve axe heads. 
None of the suitors could accomplish the feat. However, when Odysseus, disguised as a beggar, attempted the challenge, 
he easily strung the bow and shot the arrow through all twelve axe heads, proving his true identity. 
'''

" \nPenelope devised a challenge where the suitors had to string Odysseus' great bow and shoot an arrow through twelve axe heads. \nNone of the suitors could accomplish the feat. However, when Odysseus, disguised as a beggar, attempted the challenge, \nhe easily strung the bow and shot the arrow through all twelve axe heads, proving his true identity. \n"

<h2>RESULTS AFTER USING PARAGRAPH SPLITTER</h2>

In [132]:
for chunk in relevant_chunks:
    print(chunk.page_content)  # Shows the most relevant textbook sections

but he laid his hand on my mouth, and in the fulness of his wisdom suffered me not to speak. But come with me and I will stake my life on it; and if I play thee false, do thou slay me by a death most pitiful.” Then wise Penelope made answer to her: “Dear nurse, it is hard for thee, how wise soever, to observe the purposes of the everlasting gods. None the less let us go to my child, that I may see the wooers dead, and him that slew them.” With that word she went down from the upper chamber, and much her heart debated, whether she should stand apart, and question her dear lord or draw nigh, and clasp and kiss his head and hands. But when she had come within and had crossed the threshold of stone, she sat down over against Odysseus, in the light of the fire, by the further wall. Now he was sitting by the tall pillar, looking down and waiting to know if perchance his noble wife would speak to him, when her eyes beheld him. But she sat long in silence, and amazement came upon her soul, and

In [133]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
embedding_function = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")
from langchain.schema import Document
import json

<h2>LOADING THE SAVED CHUNKS AND THE CHROMA VECTOR DATABASE</h2>

In [134]:
def load_chunks(filename="preprocessed_chunks.json"):
    with open(filename, 'r') as f:
        chunk_dicts = json.load(f)
    return [Document(page_content=c["page_content"], metadata=c["metadata"]) for c in chunk_dicts]

# Load chunks instead of reprocessing PDF
chunks = load_chunks()

# Load the stored vector database
vectorstore = Chroma(persist_directory="./chroma_db_pedition", embedding_function=embedding_function)

# Retrieve the top 3 most similar chunks for a user query
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

<h2>USING BOTH VECTOR SIMILARITY AND KEYWORD SIMILARITY</h2>

In [135]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Existing vector retriever
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Keyword retriever (BM25)
bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k = 5

# Hybrid ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=[0.7, 0.3]  # Tune based on data
)

relevant_chunks = ensemble_retriever.get_relevant_documents("who wrote odyssey?")

relevant_chunks

[Document(metadata={'page': 21, 'page_label': '22', 'source': 'The-Odyssey.pdf'}, page_content='The Odyssey'),
 Document(metadata={'page': 1, 'page_label': '2', 'source': 'The-Odyssey.pdf'}, page_content="The Project Gutenberg EBook of The Odyssey, by Homer This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you'll have to check the laws of the country where you are located before using this ebook. Title: The Odyssey Author: Homer Translator: Butcher & Lang Release Date: April, 1999 [EBook #1728] Last updated: April 16, 2020 Language: English *** START OF THIS PROJECT GUTENBERG EBOOK THE ODYSSEY *** Produced by Jim Tinsley"),
 Document(metadata={'page': 3, 'page_label': '4', 'source': 'The-Odys

In [136]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np



# Load cross-encoder model
rerank_model_name = "BAAI/bge-reranker-large"
tokenizer = AutoTokenizer.from_pretrained(rerank_model_name)
model = AutoModelForSequenceClassification.from_pretrained(rerank_model_name)



def rerank_chunks(query, chunks, top_k=3, max_context_tokens=600):
    """Token-aware reranking"""
    pairs = [[query, chunk.page_content] for chunk in chunks]
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors="pt", max_length=512)
    
    with torch.no_grad():
        scores = model(**inputs).logits.view(-1).float()
    
    sorted_indices = scores.argsort(descending=True)
    selected_chunks = []
    token_count = 0
    
    for idx in sorted_indices:
        chunk = chunks[idx]
        chunk_content = f"Page {chunk.metadata['page']}: {chunk.page_content}"
        chunk_tokens = tokenizer.encode(chunk_content, add_special_tokens=False)
        
        if token_count + len(chunk_tokens) > max_context_tokens:
            continue  # Skip if over limit
            
        selected_chunks.append(chunk)
        token_count += len(chunk_tokens)
        
        if len(selected_chunks) >= top_k:
            break
            
    return selected_chunks, scores[sorted_indices[:len(selected_chunks)]]



def calculate_confidence(scores):
    if len(scores) == 0:
        return 0.0  # default confidence score if no relevant chunks were found
    
    probabilities = torch.sigmoid(torch.tensor(scores)).numpy()
    return float(np.max(probabilities))  # Return the highest confidence score


In [137]:
initial_chunks = ensemble_retriever.get_relevant_documents("who wrote odyssey?")
final_chunks, relevance_scores = rerank_chunks("who wrote odyssey?", initial_chunks, top_k=3)

confidence = calculate_confidence(relevance_scores)
final_chunks

  probabilities = torch.sigmoid(torch.tensor(scores)).numpy()


[Document(metadata={'page': 1, 'page_label': '2', 'source': 'The-Odyssey.pdf'}, page_content="The Project Gutenberg EBook of The Odyssey, by Homer This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you'll have to check the laws of the country where you are located before using this ebook. Title: The Odyssey Author: Homer Translator: Butcher & Lang Release Date: April, 1999 [EBook #1728] Last updated: April 16, 2020 Language: English *** START OF THIS PROJECT GUTENBERG EBOOK THE ODYSSEY *** Produced by Jim Tinsley"),
 Document(metadata={'page': 21, 'page_label': '22', 'source': 'The-Odyssey.pdf'}, page_content='The Odyssey'),
 Document(metadata={'page': 3, 'page_label': '4', 'source': 'The-Odys

In [138]:
initial_chunks = ensemble_retriever.get_relevant_documents("Explain the themes of hospitality in The Odyssey.")
final_chunks, relevance_scores = rerank_chunks("Explain the themes of hospitality in The Odyssey.", initial_chunks, top_k=3)
final_chunks

Token indices sequence length is longer than the specified maximum sequence length for this model (754 > 512). Running this sequence through the model will result in indexing errors


[Document(metadata={'page': 21, 'page_label': '22', 'source': 'The-Odyssey.pdf'}, page_content='The Odyssey'),
 Document(metadata={'source': 'The-Odyssey.pdf', 'page': 5, 'page_label': '6'}, page_content='As one that for a weary space has lain Lulled by the song of Circe and her wine In gardens near the pale of Proserpine, Where that Ææan isle forgets the main, And only the low lutes of love complain, And only shadows of wan lovers pine, As such an one were glad to know the brine Salt on his lips, and the large air again, So gladly, from the songs of modern speech Men turn, and see the stars, and feel the free Shrill wind beyond the close of heavy flowers, And through the music of the languid hours They hear like Ocean on a western beach The surge and thunder of the Odyssey. A. L.'),
 Document(metadata={'source': 'The-Odyssey.pdf', 'page': 9, 'page_label': '10'}, page_content='from the received view, and followed Mr. Raper, who, however, has not been able to read through the proof-shee

<h2>LOADING LLAMA3</h2>

In [139]:
from langchain_groq import ChatGroq
from AI_GATEWAYS import groq_api_key

llm = ChatGroq(
    # model_name = "deepseek-r1-distill-llama-70b",
    model_name = "llama3-70b-8192",
    # model_name = "mixtral-8x7b-32768",
    temperature=0,
    groq_api_key = groq_api_key
)

<h2>LLM IMPLEMENTATION USING MEMORY AND THRESHHOLD</h2>

In [140]:
from langchain.memory import ConversationBufferWindowMemory
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Create memory that retains last 3 exchanges
memory = ConversationBufferWindowMemory(
    k=8,
    memory_key="chat_history",
    return_messages=True,
    output_key="answer"
)


# Modified QA Chain with Memory
qa_prompt = ChatPromptTemplate.from_template(
    """Answer based on our conversation history or the Odyssey context. Not from external knowledge:
    
    Chat History:
    {chat_history}
    
    Context:
    {context}
    
    Question: {question}
    
    Answer in complete sentences, citing text evidence. 
    If unsure, say so:"""
)


qa_chain = (
    {"context": lambda x: x["chunks"], 
     "question": lambda x: x["question"],
     "chat_history": lambda x: x["chat_history"]}
    | qa_prompt
    | llm
    | StrOutputParser()
)

In [141]:
def format_chunks(chunks):
    return "\n\n".join([f"Page {c.metadata['page']}: {c.page_content}" for c in chunks])


def get_threshold_response():
    """Response when confidence is below threshold"""
    return "I'm not entirely confident about this answer. Would you like to rephrase or ask about another topic?"

In [142]:
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
import torch

# Load lightweight embedding model for fast sentence filtering
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

nltk.download('punkt')


def extract_relevant_sentences(query, chunks, top_k=3):
    """Splits paragraphs into sentences and retrieves the most relevant ones using embeddings & reranking."""
    question_embedding = embedding_model.encode(query, convert_to_tensor=True)
    sentence_scores = []

    for chunk in chunks:
        sentences = sent_tokenize(chunk.page_content)  # Split paragraph into sentences
        
        for sentence in sentences:
            sentence_embedding = embedding_model.encode(sentence, convert_to_tensor=True)
            score = util.pytorch_cos_sim(question_embedding, sentence_embedding).item()  # Compute similarity
            sentence_scores.append((sentence, score))

    # Sort sentences by similarity score
    sorted_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
    
    # Keep only top-N sentences before reranking
    filtered_sentences = [sent for sent, score in sorted_sentences[:10]]  # Keep more before reranking

    # Check if there are any sentences to rerank
    if not filtered_sentences:
        return "No relevant sentences found."  # Early return if no sentences

    # Create rerank pairs
    rerank_pairs = [[query, sentence] for sentence in filtered_sentences]

    # Debugging: Print rerank_pairs to see if it contains invalid entries
    print(f"Rerank Pairs: {rerank_pairs}")  # Debugging: Ensure correct format

    try:
        inputs = tokenizer(rerank_pairs, padding=True, truncation=True, return_tensors="pt", max_length=512)
    except Exception as e:
        print(f"Error during tokenization: {e}")
        return "Error during tokenization."

    with torch.no_grad():
        scores = model(**inputs).logits.view(-1).float()

    sorted_indices = scores.argsort(descending=True)
    top_sentences = [filtered_sentences[i] for i in sorted_indices[:top_k]]

    return "\n\n".join(top_sentences) if top_sentences else "No relevant sentences found."



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anandhu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [143]:
def ask_question(question, confidence_threshold=0.65):
    # Retrieve context (Top-K Chunks)
    initial_chunks = ensemble_retriever.get_relevant_documents(question)
    final_chunks, relevance_scores = rerank_chunks(question, initial_chunks, top_k=3)

    # Calculate confidence
    confidence = calculate_confidence(relevance_scores)

    # Extract only the most relevant sentences
    optimized_context = extract_relevant_sentences(question, final_chunks, top_k=3)

    # Generate answer
    answer = qa_chain.invoke({
        "question": question,
        "chunks": optimized_context,  # Sending only relevant sentences
        "chat_history": memory.load_memory_variables({})["chat_history"]
    })

    # Store interaction in memory
    memory.save_context({"question": question}, {"answer": answer})

    # Add confidence and sources
    sources = list(set(c.metadata["page"] for c in final_chunks))
    response = f"{answer}\n\nConfidence: {confidence:.0%}\nSources: Pages {', '.join(map(str, sources))}"

    return response if confidence >= confidence_threshold else f"{get_threshold_response()}\n\n{response}"

In [144]:
print(ask_question("Who is Telemachus?"))

  probabilities = torch.sigmoid(torch.tensor(scores)).numpy()


Rerank Pairs: [['Who is Telemachus?', 'Telemachus landed, goes first to Eumaeus.'], ['Who is Telemachus?', 'And she found Telemachus, and the glorious son of Nestor, couched at the vestibule of the house of famous Menelaus.'], ['Who is Telemachus?', 'Pallas sends home Telemachus from Lacedaemon with the presents given him by Menelaus.'], ['Who is Telemachus?', 'The son of Nestor truly was overcome with soft sleep, but sweet sleep gat not hold of Telemachus, but, through the night divine, careful thoughts for his father kept him wakeful.'], ['Who is Telemachus?', 'And grey-eyed Athene stood nigh him and spake to him, saying: “Telemachus, it is no longer meet that thou shouldest wander far from thy home, leaving thy substance behind thee, and men in thy house so wanton, lest they divide and utterly devour all thy wealth, and thou shalt have gone on a vain journey.'], ['Who is Telemachus?', 'Now Pallas Athene went to the wide land of Lacedaemon, to put the noble son of the great-hearted O

In [145]:
print(ask_question("And what about his relationship with Odysseus?"))

I'm not entirely confident about this answer. Would you like to rephrase or ask about another topic?

According to the provided chat history, Telemachus is the noble son of the great-hearted Odysseus.

Confidence: 0%
Sources: Pages 


In [146]:
print(ask_question("Did Odysseus use a lightsaber?"))

I'm not entirely confident about this answer. Would you like to rephrase or ask about another topic?

There is no mention of Odysseus using a lightsaber in our conversation history or the provided context. In fact, the concept of a lightsaber is not even mentioned at all.

Confidence: 0%
Sources: Pages 


In [147]:
print(ask_question("What challenge did Penelope use to test the suitors?"))

  probabilities = torch.sigmoid(torch.tensor(scores)).numpy()


Rerank Pairs: [['What challenge did Penelope use to test the suitors?', 'And as when a man hath hidden away a brand in the black embers at an upland farm, one that hath no neighbours nigh, and so saveth the seed of fire, that he may not have to seek a light otherwhere, even so did Odysseus cover him with the leaves.'], ['What challenge did Penelope use to test the suitors?', 'Through these the force of the wet winds blew never, neither did the bright sun light on it with his rays, nor could the rain pierce through, so close were they twined either to other; and thereunder crept Odysseus and anon he heaped together with his hands a broad couch; for of fallen leaves there was great plenty, enough to cover two or three men in winter time, however hard the weather.'], ['What challenge did Penelope use to test the suitors?', 'And the steadfast goodly Odysseus beheld it and rejoiced, and he laid him in the midst thereof and flung over him the fallen leaves.'], ['What challenge did Penelope u

In [148]:
print(ask_question("How did Odysseus prove his identity to Penelope?"))

  probabilities = torch.sigmoid(torch.tensor(scores)).numpy()


Rerank Pairs: [['How did Odysseus prove his identity to Penelope?', 'Odysseus  maketh  himself  known  to  Penelope,  tells  his adventures briefly, and in the morning goes to Laertes and makes himself known to him.'], ['How did Odysseus prove his identity to Penelope?', 'Odysseus hath come, and hath got him to his own house, though late hath he come, and hath slain the proud wooers that troubled his house, and devoured his substance, and oppressed his child.” Then wise Penelope answered her: “Dear nurse, the gods have made thee distraught, the gods that can make foolish even the wisdom of the wise, and that stablish the simple in understanding.'], ['How did Odysseus prove his identity to Penelope?', 'Never yet have I slept so sound since the day that Odysseus went forth to see that evil Ilios, never to be named.'], ['How did Odysseus prove his identity to Penelope?', 'Then the ancient woman went up into the upper chamber laughing aloud, to tell her mistress how her dear lord was withi

In [149]:
print(ask_question("What role does Athena play in Odysseus' journey?"))

  probabilities = torch.sigmoid(torch.tensor(scores)).numpy()


Rerank Pairs: [["What role does Athena play in Odysseus' journey?", 'But when he was now about to enter the pleasant city, then the goddess, grey-eyed Athene, met him, in the fashion of a young maiden carrying a pitcher, and she stood over against him, and goodly Odysseus inquired of her: “My child, couldst thou not lead me to the palace of the lord Alcinous, who bears sway among this people?'], ["What role does Athena play in Odysseus' journey?", 'At that same hour Odysseus roused him to go to the city, and Athene shed a deep mist about Odysseus for the favour that she bare him, lest any of the Phaeacians, high of heart, should meet him and mock him in sharp speech, and ask him who he was.'], ["What role does Athena play in Odysseus' journey?", 'Odysseus being received at the house of the king Alcinous, the queen after supper, taking notice of his garments, gives him occasion to relate his passage thither on the raft.'], ["What role does Athena play in Odysseus' journey?", 'So he pray

In [150]:
print(ask_question("What happens when Odysseus returns to Ithaca?"))

  probabilities = torch.sigmoid(torch.tensor(scores)).numpy()


Rerank Pairs: [['What happens when Odysseus returns to Ithaca?', 'Athene again urges the release of Odysseus; and Hermes is sent to bid Calypso let the hero go.'], ['What happens when Odysseus returns to Ithaca?', 'They reach Pylos, and are kindly received by the aged Nestor, who has no news about Odysseus.'], ['What happens when Odysseus returns to Ithaca?', 'Zeus prophecies that after twenty days sailing, Odysseus will reach Scheria, and the hospitable'], ['What happens when Odysseus returns to Ithaca?', 'On this same day (the sixth) the wooers in Ithaca learned that Telemachus had really set out to “cruise after his father.” They sent some of their number to lie in ambush for him, in a certain strait which he was likely to pass on his return to Ithaca.'], ['What happens when Odysseus returns to Ithaca?', 'He had heard from Proteus, the Old Man of the Sea, that Odysseus was alive, and a captive on an island of the deep.'], ['What happens when Odysseus returns to Ithaca?', 'Menelaus t

In [151]:
print(ask_question("What is the significance of the bow in The Odyssey?"))

KeyboardInterrupt: 

In [None]:
print(ask_question("Who are the suitors in The Odyssey?"))

In [None]:
print(ask_question("What is the Cyclops' name in The Odyssey?"))

In [None]:
print(ask_question("How does Odysseus escape from the Cyclops' cave?"))

In [None]:
print(ask_question("What happens to the men who eat the Lotus flowers in The Odyssey?"))