In [32]:
# https://medium.com/@mbrazel/open-source-self-hosted-rag-llm-server-with-chromadb-docker-ollama-7e6c6913da7a

import langchain_community
import langchain_text_splitters
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter

In [33]:
loader = PyPDFLoader("2404.pdf")
pages = loader.load_and_split()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(pages)
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [34]:
import uuid
import chromadb
from chromadb.config import Settings

client = chromadb.HttpClient(host='localhost', port=8000)
collection = client.get_or_create_collection(name="my_collection")
for doc in docs:
    collection.add(
        ids=[str(uuid.uuid1())], metadatas=doc.metadata, documents=doc.page_content
    )


In [35]:
import chromadb
from chromadb.config import Settings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

def Extract_context(query):
    chroma_client = chromadb.HttpClient(host='localhost', port=8000,settings=Settings(allow_reset=True))
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(
        client=chroma_client,
        collection_name="my_collection",
        embedding_function=embedding_function,
    )
    docs = db.similarity_search(query)
    fullcontent =''
    for doc in docs:
        fullcontent ='. '.join([fullcontent,doc.page_content])

    return fullcontent

In [36]:
def get_system_message_rag(content):
        return f"""You are an expert consultant helping executive advisors to get relevant information from internal documents.

        Generate your response by following the steps below:
        1. Recursively break down the question into smaller questions.
        2. For each question/directive:
            2a. Select the most relevant information from the context in light of the conversation history.
        3. Generate a draft response using selected information.
        4. Remove duplicate content from draft response.
        5. Generate your final response after adjusting it to increase accuracy and relevance.
        6. Do not try to summarise the answers, explain it properly.
        6. Only show your final response! 
        
        Constraints:
        1. DO NOT PROVIDE ANY EXPLANATION OR DETAILS OR MENTION THAT YOU WERE GIVEN CONTEXT.
        2. Don't mention that you are not able to find the answer in the provided context.
        3. Don't make up the answers by yourself.
        4. Try your best to provide answer from the given context.

        CONTENT:
        {content}
        """

In [37]:
def get_ques_response_prompt(question):
    return f"""
    ==============================================================
    Based on the above context, please provide the answer to the following question:
    {question}
    """

In [43]:
def generate_rag_response(content,question):
    client = Client(host='http://localhost:11434')
    stream = client.chat(model='mistral', messages=[
    {"role": "system", "content": get_system_message_rag(content)},            
    {"role": "user", "content": get_ques_response_prompt(question)}
    ],stream=True)
    # print(get_system_message_rag(content))
    # print(get_ques_response_prompt(question))
    print("####### THINKING OF ANSWER............ ")
    full_answer = ''
    for chunk in stream:
        print(chunk['message']['content'], end='', flush=True)
        full_answer =''.join([full_answer,chunk['message']['content']])

    return full_answer



In [54]:
query = 'Who is Vishal Babu'
context = Extract_context(query)
print(len(context))

8928


In [55]:
res = generate_rag_response(context ,query )
print(res)

####### THINKING OF ANSWER............ 
 The provided text does not contain information about "Vishal Babu". It primarily discusses the concepts of multi-head attention and compressive memory in the context of Transformer models. There is no mention or indication that Vishal Babu is associated with these topics or any other specific details about him. The provided text does not contain information about "Vishal Babu". It primarily discusses the concepts of multi-head attention and compressive memory in the context of Transformer models. There is no mention or indication that Vishal Babu is associated with these topics or any other specific details about him.


In [None]:
@app.route('/query', methods=['POST'])
def respond_to_query():
    if request.method == 'POST':
        data = request.get_json()
        # Assuming the query is sent as a JSON object with a key named 'query'
        query = data.get('query')
        # Here you can process the query and generate a response
        response = f'This is the response to your query:\n {get_reponse(query)}'
        return response
    
if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0')