In [92]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import sys
import os

In [106]:
!pip install chromadb -q
!pip install langchain-community -q
!pip install ollama langchain-community -q
!pip install fastembed -q
!pip install langchain -q

In [95]:
def getDoc():
    #Loading the doc
    loader=PyPDFLoader("Vector_Database.pdf")
    pages=loader.load_and_split() #We split the doc since it is log and continous DOC

    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(pages)
    print(f"Split{len(pages)} documents into {len(chunks)} chunks." )
    embedding = FastEmbedEmbeddings()
    #Cretaing Vetor Stores
    vector_store=Chroma.from_documents(documents=chunks, embedding=embedding, persist_directory= "./chroma_db")
    vector_store.persist()  # Save to disk
    print("Vector store created and persisted.")
    return vector_store

In [96]:
getDoc()

Split16 documents into 16 chunks.
Vector store created and persisted.


<langchain_community.vectorstores.chroma.Chroma at 0x2c5ac0b4190>

In [97]:
from huggingface_hub import login
access_token_read= "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"
access_token_write="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"
login(token=access_token_read)

In [108]:
!pip install -U langchain-huggingface -q
!pip install -U langchain-ollama -q
# !pip install HuggingFaceHub -q


In [99]:
def rag_chain():
    model = ChatOllama(model="qwen2:0.5b") # a light weight meodel (0.5b), pulled from Ollama
    #
    prompt = PromptTemplate.from_template(
        """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context. 
        If you don't know the answer, then reply, No Context availabel for this question {input}. [/Instructions] </s> 
        [Instructions] Question: {input} 
        Context: {context} 
        Answer: [/Instructions]
        """
    )
    #Load the vector store
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma( persist_directory="./chroma_db", embedding_function=embedding)

    #Create chain
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs=
        {   "k": 3,
            "score_threshold": 0.5,
        },
    )

    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)
    #
    return chain

In [100]:
print("Test")

Test


In [101]:
def ask(query: str):
    #
    chain = rag_chain()
    # invoke chain
    result = chain.invoke({"input": query})
    print(result["answer"])
    for doc in result["context"]:
        print("Source: ", doc.metadata["source"])

In [103]:
getDoc()
ask("What is Vector Database?")

Split16 documents into 16 chunks.
Vector store created and persisted.
Vector databases store a variety of structured data types that can be organized and indexed using a hierarchical structure. They are designed to enable fast searching and retrieval of information based on specific criteria such as keywords or user queries. They typically contain images, text files, audio recordings, video content, complex and unstructured digital objects, among other formats.
Source:  Vector_Database.pdf
Source:  Vector_Database.pdf
Source:  Vector_Database.pdf


In [104]:
getDoc()
ask("what are Vectors Embeddings? ")

Split16 documents into 16 chunks.
Vector store created and persisted.
Vectors Embeddings are learned embeddings that capture the characteristics of data based on an array of numbers. They can have hundreds or thousands of dimensions, and they use specialized models to learn the most relevant information from the data. The embeddings provide a multi-dimensional representation of data, allowing for easier analysis and visualization.
Source:  Vector_Database.pdf
Source:  Vector_Database.pdf
Source:  Vector_Database.pdf


In [105]:
getDoc()
ask("who is baba Azam") #out of context


Split16 documents into 16 chunks.
Vector store created and persisted.


No relevant docs were retrieved using the relevance score threshold 0.5


baba Azam is a well-known Indian politician and writer from the state of Jharkhand, known for his contributions to education in India. He is also associated with various political parties and has been active in politics since 1954. Baba Azam is also a Nobel Prize laureate in Physiology or Medicine.
