In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq

from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.messages import AIMessage


In [5]:
import os
from dotenv import load_dotenv 
# Load variables from a local .env file into the process environment so that
# os.getenv("GROQ_API_KEY") further down can find the key without hardcoding it.
load_dotenv()

True

In [6]:
## Step-1 Load the PDF file
# The path is relative, so 'attention.pdf' must be resolvable from the
# kernel's working directory.
loader = PyPDFLoader('attention.pdf')
documents = loader.load()

## Step-2 Split the loaded documents into smaller chunks
# chunk_size / chunk_overlap are handed straight to the splitter.
# NOTE(review): presumably measured in characters for this splitter - confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50)
docs = text_splitter.split_documents(documents)

## Step-3 Embeddings
# Embedding model used to vectorise the chunks when the index is built below.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

## Step-4 Vector store
# Build a FAISS index from the chunks, then expose it as a retriever for the
# retrieval chain.
vector_store = FAISS.from_documents(docs,embeddings)
retriever_obj = vector_store.as_retriever()

## Step-5 LLM object
# The API key is taken from the GROQ_API_KEY environment variable; os.getenv
# returns None when the variable is not set.
llm_obj = ChatGroq(
    model="llama-3.1-8b-instant",
    api_key=os.getenv("GROQ_API_KEY")
)

## Step 6: Prompt with history
# RunnableWithMessageHistory injects the earlier turns under the "history" key
# as a list of message objects. A plain string template would interpolate that
# list as its Python repr inside a single human message, so the prompt is built
# from separate messages instead:
#   * system      - the instruction plus the retrieved {context}
#   * placeholder - expands {history} into the real prior Human/AI messages
#                   (empty on the first turn)
#   * human       - the current question from {input}
# The variable names (context / history / input) are unchanged, so the chains
# and the history wrapper built from this prompt need no other modification.
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "Use the following context and chat history to answer the question.\n\n"
        "Context:\n{context}",
    ),
    ("placeholder", "{history}"),
    ("human", "{input}"),
])

## Step 7: Build stuff + retrieval chain
# qa_chain combines the LLM with the prompt; the prompt's {context} variable is
# where the documents are expected to be inserted.
qa_chain = create_stuff_documents_chain(llm_obj,prompt)
# rag_chain puts the retriever in front of qa_chain. Its result is indexed with
# ["answer"] by convert_output_to_aimessage below.
rag_chain = create_retrieval_chain(retriever_obj,qa_chain)

def convert_output_to_aimessage(output):
    """Wrap the ``"answer"`` entry of a chain result in an ``AIMessage``.

    Args:
        output: Result object that supports ``output["answer"]`` lookup.

    Returns:
        An ``AIMessage`` whose ``content`` is ``output["answer"]``.

    Raises:
        KeyError: If ``output`` is a dict without an ``"answer"`` key.
    """
    answer_text = output["answer"]
    return AIMessage(content=answer_text)

# Pipe the RAG chain into the converter so the pipeline's final output is an
# AIMessage rather than the raw chain result.
final_rag_chain = rag_chain | convert_output_to_aimessage

## Step 8: Add RunnableWithMessageHistory
# Session id -> chat history object. Lives only in this kernel and is reset
# whenever this cell is re-run.
stores = {}


def get_session_history(session_id: str) ->BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying the conversation.

    Returns:
        The history object stored under ``session_id`` in ``stores``; a new
        empty ``ChatMessageHistory`` is registered and returned when the id
        has not been seen before.
    """
    try:
        return stores[session_id]
    except KeyError:
        # First request for this session: start it with an empty history.
        fresh_history = ChatMessageHistory()
        stores[session_id] = fresh_history
        return fresh_history
# Wrap the chain so that every call looks up the session's history through
# get_session_history and records the new question/answer pair in it.
rag_with_history = RunnableWithMessageHistory(
    final_rag_chain,
    get_session_history,
    input_messages_key="input",  # key of the user question in the invoke() payload and in the prompt
    history_messages_key="history"  # key under which prior messages reach the prompt's {history}
)
## Step 9: Create a session ID and invoke the RAG chain with chat history

session_id = "session-1"
# One config shared by both turns, so they use the same session history.
session_config = {"configurable": {"session_id": session_id}}

# Turn 1: initial question.
query1 = "What is attention in transformer?"
response1 = rag_with_history.invoke({"input": query1}, config=session_config)

print("response-1:", response1)
print('')

# Turn 2: follow-up sent under the same session id as turn 1.
query2 = "Explain again in shortly"
response2 = rag_with_history.invoke({"input": query2}, config=session_config)


  return forward_call(*args, **kwargs)


response-1: content="In the context of the Transformer model architecture, attention refers to the multi-head self-attention mechanism used in both the encoder and decoder stacks. This mechanism allows the model to weigh the importance of different input elements and focus on the most relevant information when generating output.\n\nIn the Transformer, attention is used in three different ways:\n\n1. Self-attention: The model computes the attention weights among all elements in the input sequence to weigh their importance.\n2. Encoder-decoder attention: The decoder attends to the output of the encoder to generate the final output.\n3. Encoder-encoder attention: The model attends to its own output at different positions.\n\nThe Transformer uses multi-head attention, which involves splitting the input into multiple attention heads and processing them in parallel. Each head computes a weighted sum of the input elements, and the final output is a weighted sum of the outputs from all heads.\

  return forward_call(*args, **kwargs)


In [7]:
# Show the answer to the follow-up question (response2 is computed in the previous cell).
print(response2)

content='In the context of the Transformer model architecture, attention refers to the multi-head self-attention mechanism. It weighs the importance of different input elements and focuses on the most relevant information when generating output. \n\nThis mechanism is used in three ways:\n\n1. Self-attention: The model computes attention weights among all elements in the input sequence.\n2. Encoder-decoder attention: The decoder attends to the output of the encoder.\n3. Encoder-encoder attention: The model attends to its own output at different positions.\n\nSelf-attention is a key component of attention in the Transformer, which allows the model to model complex dependencies between input elements.' additional_kwargs={} response_metadata={}


In [8]:
# Inspect the session store to confirm both turns were recorded under the session id.
print(stores)

{'session-1': InMemoryChatMessageHistory(messages=[HumanMessage(content='What is attention in transformer?', additional_kwargs={}, response_metadata={}), AIMessage(content="In the context of the Transformer model architecture, attention refers to the multi-head self-attention mechanism used in both the encoder and decoder stacks. This mechanism allows the model to weigh the importance of different input elements and focus on the most relevant information when generating output.\n\nIn the Transformer, attention is used in three different ways:\n\n1. Self-attention: The model computes the attention weights among all elements in the input sequence to weigh their importance.\n2. Encoder-decoder attention: The decoder attends to the output of the encoder to generate the final output.\n3. Encoder-encoder attention: The model attends to its own output at different positions.\n\nThe Transformer uses multi-head attention, which involves splitting the input into multiple attention heads and proc