![Maximal Marginal Relevance (MMR) overview](./images/MMR.png)
![Maximal Marginal Relevance (MMR) algorithm](./images/MMR_Algo.png)

In [1]:
from langchain.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings

# Embedding function backed by the Ollama model "llama3.2:3b".
# NOTE(review): presumably this must be the same model that was used to build
# the persisted collection loaded below — confirm.
embedding = OllamaEmbeddings(model="llama3.2:3b")

# Directory holding the persisted Chroma data.
persist_directory = "docs/chroma/"

In [2]:
# Open the Chroma store under `persist_directory`, with `embedding` as its embedding function.
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory)

  vectordb = Chroma(


In [3]:
# Print how many entries the store's underlying collection holds.
# `_collection` is a private attribute of the vector store object.
print(vectordb._collection.count())

8


In [4]:
# Three short example passages about Amanita phalloides: the first two both
# describe its large fruiting body, the third covers its toxicity.
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

In [5]:
# Build a separate small vector store from the example texts, embedded with `embedding`.
smalldb = Chroma.from_texts(texts, embedding=embedding)

In [6]:
# Query used against the small example store in the next two cells.
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [7]:
# Similarity search on the small store, limited to k=2 results.
smalldb.similarity_search(question, k=2)

[Document(metadata={}, page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).'),
 Document(metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.')]

In [8]:
# Max-marginal-relevance search on the small store: k=2 results requested, fetch_k=3.
smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)

[Document(metadata={}, page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).'),
 Document(metadata={}, page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.')]

In [9]:
# Question to run against the persisted document store.
question = "what is Yazat's college?"

In [10]:
# Similarity search on the persisted store, limited to k=3 results.
docs_ss = vectordb.similarity_search(question,k=3)

In [11]:
# Preview the first 100 characters of the first similarity-search result.
docs_ss[0].page_content[:100]

'Internship.  \n \n…………………………………………………… \n    Associate Dean (SW) and FIC, T&P \nNational Institute of Te'

In [12]:
# Preview the first 100 characters of the second similarity-search result.
docs_ss[1].page_content[:100]

'DearHiringManager,\nI amwritingtoexpressmykeeninterest inopportunitieswithinyouresteemedorganization.'

In [13]:
# Same question via max-marginal-relevance search.
# NOTE(review): no `k`/`fetch_k` are passed, so the method's defaults apply, whereas
# the similarity search this is compared with used k=3 — confirm that is intended.
docs_mmr = vectordb.max_marginal_relevance_search(question)

Number of requested results 20 is greater than number of elements in index 8, updating n_results = 8


In [14]:
# Preview the first 100 characters of the first MMR result.
docs_mmr[0].page_content[:100]

'Internship.  \n \n…………………………………………………… \n    Associate Dean (SW) and FIC, T&P \nNational Institute of Te'

In [15]:
# Preview the first 100 characters of the second MMR result.
docs_mmr[1].page_content[:100]

'DearHiringManager,\nI amwritingtoexpressmykeeninterest inopportunitieswithinyouresteemedorganization.'

In [16]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain_ollama.llms import OllamaLLM


In [17]:
# Metadata fields the self-query retriever is told about. The descriptions are
# natural-language hints for the LLM that builds the metadata filter, so they
# need to describe the stored fields accurately.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The PDF about a person named Yazat that the chunk is from, should be one of `docs/Yazat_Resume.pdf`, `docs/RANK Certificate_Yazat Mishra.pdf`, `docs/Cover-Letter.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        # Previously "The information from the lecture", which does not describe this dataset.
        description="The page number within the source PDF that the chunk is from",
        type="integer",
    ),
]

In [19]:
# Short description of what the stored documents contain, plus the LLM
# (temperature 0) handed to the self-query retriever.
document_content_description = "Yazat's information"
llm = OllamaLLM(model="llama3.2:3b", temperature=0)

# Self-query retriever over `vectordb`, built from the LLM, the content
# description and the metadata field definitions.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [29]:
# Question to pass to the self-query retriever.
question = "Where is Yazat studying?"


In [30]:
# Run the self-query retriever. `invoke` replaces `get_relevant_documents`,
# which is deprecated on LangChain retrievers.
docs = retriever.invoke(question)

In [31]:
# Print the metadata of each retrieved document, one per line.
for doc in docs:
    print(doc.metadata)

{'page': 0, 'source': 'docs/RANK Certificate_Yazat Mishra.pdf'}
{'page': 0, 'source': 'docs/RANK Certificate_Yazat Mishra.pdf'}


In [32]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [33]:
def pretty_print_docs(docs):
    """Print every document's text under a numbered header.

    Each entry is rendered as "Document N:" followed by a blank line and the
    document's ``page_content``; consecutive entries are separated by a line
    of 100 dashes.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(separator.join(sections))


In [34]:
# LLM (temperature 0) and the LLM-backed document compressor built from it.
llm = OllamaLLM(model="llama3.2:3b", temperature=0)
compressor = LLMChainExtractor.from_llm(llm=llm)

In [35]:
# Pair the vector store's retriever with `compressor` in a contextual compression retriever.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(),
    base_compressor=compressor,
)

In [37]:
question = "What is Yazat's CGPA?"
# `invoke` replaces `get_relevant_documents`, which is deprecated on LangChain retrievers.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)

Document 1:

9.50
----------------------------------------------------------------------------------------------------
Document 2:

9.50


In [38]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [41]:
# Load the resume PDF and merge the text of all its pages into one string.
loader = PyPDFLoader("docs/Yazat_Resume.pdf")
pages = loader.load()
all_page_text = [page.page_content for page in pages]
joined_page_text = " ".join(all_page_text)

# Split the merged text into chunks (chunk_size 1500, chunk_overlap 150).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
splits = text_splitter.split_text(joined_page_text)


In [42]:
# Build two retrievers over the same text chunks: an SVM retriever that is
# given the embedding model, and a TF-IDF retriever built from the texts alone.
svm_retriever = SVMRetriever.from_texts(splits,embedding)
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [43]:
question = "What are Yazat's major skills?"
# Query the SVM retriever via `invoke` (replaces the deprecated `get_relevant_documents`).
docs_svm = svm_retriever.invoke(question)
# Display the first result.
docs_svm[0]

Document(metadata={}, page_content='Yazat Mishra\n/githubGithub /codePortfolio /linkedinLinkedin /envel⌢peEmail ♂phone+918010582162\nEducation\nNational Institute of Technology Mizoram 2021 - Expected May 2025\nB.Tech in Electronics and Communication Current GPA: 9.50/10.0\nAmity International School, Noida 2021\nSenior Secondary School Percentage: 96%\nExperience\nSoftware Developer Intern June 2024 - September 2024\nNTRO, Government of India New Delhi\n• Developed models for anomaly detection in hyperspectral data using TensorFlow, PyTorch, and CNN and\nEncoder-Decoder architecture, with QGIS for geospatial analysis\n• Built an offline Q&A bot powered by LLaMA models (LlamaIndex, Ollama), integrating ChromaDB, Neo4j, Flask,\nand Streamlit\nProjects\nPDF AnswerBot\n | Flask, Python, Ollama, ChromaDB, Neo4j, Cypher, Streamlit, Llama-IndexJuly 2024 - Ongoing\n• Developed a PDF-based chatbot with intelligent querying and local processing to ensure privacy and security\nwithout requiring 

In [44]:
question = "Where is Yazat studying?"
# Query the TF-IDF retriever via `invoke` (replaces the deprecated `get_relevant_documents`).
docs_tfidf = tfidf_retriever.invoke(question)
# Display the first result.
docs_tfidf[0]

Document(metadata={}, page_content='Yazat Mishra\n/githubGithub /codePortfolio /linkedinLinkedin /envel⌢peEmail ♂phone+918010582162\nEducation\nNational Institute of Technology Mizoram 2021 - Expected May 2025\nB.Tech in Electronics and Communication Current GPA: 9.50/10.0\nAmity International School, Noida 2021\nSenior Secondary School Percentage: 96%\nExperience\nSoftware Developer Intern June 2024 - September 2024\nNTRO, Government of India New Delhi\n• Developed models for anomaly detection in hyperspectral data using TensorFlow, PyTorch, and CNN and\nEncoder-Decoder architecture, with QGIS for geospatial analysis\n• Built an offline Q&A bot powered by LLaMA models (LlamaIndex, Ollama), integrating ChromaDB, Neo4j, Flask,\nand Streamlit\nProjects\nPDF AnswerBot\n | Flask, Python, Ollama, ChromaDB, Neo4j, Cypher, Streamlit, Llama-IndexJuly 2024 - Ongoing\n• Developed a PDF-based chatbot with intelligent querying and local processing to ensure privacy and security\nwithout requiring 