## Notes

## 1. RAG application using the Gemini model API

In [24]:
# Dependencies for this notebook. Uncomment and run once in a fresh environment.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install -qU langchain-google-genai
# %pip install python-dotenv
# %pip install -qU langchain-core
# %pip install langchain-ollama
# Also imported by later cells of this notebook:
# %pip install -qU langchain langchain-community langchain-text-splitters beautifulsoup4

In [25]:
import langchain
from dotenv import load_dotenv
import os
# Read key=value pairs from a local .env file into the process environment.
# The API keys used by the next cell are expected to come from there; the
# value displayed under this cell is load_dotenv()'s return value.
load_dotenv()

True

In [26]:
# LangSmith tracing is optional, so only switch it on when an API key is
# actually available. os.environ values must be str: assigning the result of
# os.getenv(...) directly raises TypeError whenever the variable is missing.
# The lower-case name is checked first so existing .env files keep working,
# then the conventional upper-case name.
langsmith_api_key = os.getenv("langchain_api_key") or os.getenv("LANGCHAIN_API_KEY")
if langsmith_api_key:
    os.environ["LANGCHAIN_API_KEY"] = langsmith_api_key
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
else:
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    print("LangSmith API key not found (langchain_api_key / LANGCHAIN_API_KEY); tracing disabled.")

# GOOGLE_API_KEY is already stored under the name it is read back from, so
# there is nothing to copy. Just report clearly when it is absent instead of
# crashing on a None assignment.
if not os.getenv("GOOGLE_API_KEY"):
    print("GOOGLE_API_KEY is not set; add it to your .env file before creating the Gemini clients below.")

In [27]:
# Chat model used for generation; `llm` is passed to the chain constructors
# inside retrieve_and_generate.
# NOTE(review): presumably authenticates via GOOGLE_API_KEY from the environment - confirm.
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [28]:
# Embedding model for the vector similarity search; it is handed to
# InMemoryVectorStore when the vector store is created.
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

### 1.1 Selecting the vector store - We will use LangChain's in-memory vector store

In [29]:
from langchain_core.vectorstores import InMemoryVectorStore
# Vector store built on `embeddings`. The document chunks are added to it with
# add_documents, and retrieve_and_generate turns it into a retriever.
# NOTE(review): being in-memory, its contents presumably do not survive a
# kernel restart, so the indexing cells must be re-run - confirm.
vector_store_1 = InMemoryVectorStore(embeddings)

### 1.2 Document Indexing

In [30]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
# Restrict HTML parsing to elements whose CSS class is "post".
bs4_strainer = bs4.SoupStrainer(class_="post")

#### 1.2.1 Loading the document

In [31]:
# Download the blog post; bs4_strainer is forwarded to BeautifulSoup as
# `parse_only` so that only the matching part of the page is parsed.
loader = WebBaseLoader(
    web_path="https://jalammar.github.io/illustrated-transformer/",
    bs_kwargs=dict(parse_only=bs4_strainer),
)
docs = loader.load()

In [32]:
# Sanity check: a single URL was loaded, so exactly one document is expected.
assert len(docs) == 1
# Report the size of the extracted text and preview its first 500 characters.
print(f"Total characters: {len(docs[0].page_content)}")
print(docs[0].page_content[:500])

Total characters: 24876

The Illustrated Transformer

Discussions:
Hacker News (65 points, 4 comments), Reddit r/MachineLearning (29 points, 3 comments)


Translations: Arabic, Chinese (Simplified) 1, Chinese (Simplified) 2, French 1, French 2, Italian, Japanese, Korean, Persian, Russian, Spanish 1, Spanish 2, Vietnamese

Watch: MIT’s Deep Learning State of the Art lecture referencing this post

Featured in courses at Stanford, Harvard, MIT, Princeton, CMU and others





Update: This post has now become a book! Check 


#### 1.2.2 Split the document into chunks

In [33]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, both measured in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,  # track each chunk's index in the original document
)
split_chunks = text_splitter.split_documents(docs)

print(f"Split blog post into {len(split_chunks)} sub-documents.")

Split blog post into 37 sub-documents.


#### 1.2.3 Store and index the document chunks in the vector store

In [34]:
# Add every chunk to the vector store and keep the returned ids.
# NOTE(review): re-running this cell presumably stores the same chunks again
# under new ids, which would duplicate retrieval results - recreate
# vector_store_1 before re-indexing. Confirm against the vector store docs.
document_ids = vector_store_1.add_documents(documents=split_chunks)
# Show the first three ids as a quick check.
print(document_ids[:3])

['334a24e4-07f1-461d-b591-06b6a750cee7', 'b60fd671-c82a-4fa2-9840-6885fab59bf5', 'b39c62e1-521e-4342-baa9-236319045f40']


### 1.3 Preparing the RAG prompts

In [45]:
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder

In [46]:
# System instruction for the question-rewriting step: turn a follow-up question
# into a standalone one without answering it. The trailing backslashes join the
# source lines, so the prompt is a single line of text.
contextualize_q_system_prompt1 = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
# Prompt layout: system instruction, then the prior messages supplied as
# "chat_history", then the latest user question supplied as "input".
# It is passed to create_history_aware_retriever in retrieve_and_generate.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt1),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [51]:
# System instruction for the answering step. The backslash after "concise."
# joins that line with the following empty line, so exactly one newline
# separates the instructions from the {context} placeholder.
# NOTE(review): {context} is presumably filled with the retrieved documents by
# create_stuff_documents_chain - confirm against the LangChain docs.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""

# Despite its name, this is the answer-generation prompt (not a
# question-contextualizing one): system instruction with context, then the
# "chat_history" messages, then the user question supplied as "input".
# It is passed to create_stuff_documents_chain in retrieve_and_generate.
contextualize_qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

### 1.4 Preparing the RAG pipeline

In [58]:
from langchain.chains import create_history_aware_retriever,create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage,AIMessage


def retrieve_and_generate(question, chat_history):
    """Answer ``question`` with the history-aware RAG chain and record the turn.

    The chain is assembled from the notebook-level ``vector_store_1``, ``llm``,
    ``contextualize_q_prompt`` and ``contextualize_qa_prompt`` objects on every
    call.

    Args:
        question: The latest user question.
        chat_history: Mutable list of earlier messages. It is passed to the
            chain unchanged and afterwards extended in place with a
            HumanMessage for the question and an AIMessage for the answer.

    Returns:
        The value stored under the "answer" key of the chain's response.
    """
    # Retriever over the indexed chunks, wrapped so that retrieval can take
    # the chat history into account.
    base_retriever = vector_store_1.as_retriever()
    history_aware_retriever = create_history_aware_retriever(
        llm, base_retriever, contextualize_q_prompt
    )

    # Chain that hands the retrieved list of Documents to the LLM.
    answer_chain = create_stuff_documents_chain(llm, contextualize_qa_prompt)

    # Full pipeline: the retriever's output becomes the context of the answer chain.
    rag_chain = create_retrieval_chain(history_aware_retriever, answer_chain)

    # The pipeline takes two inputs, the user question and the chat history;
    # the context is passed on automatically through the retrieval chain.
    chain_inputs = {"input": question, "chat_history": chat_history}
    answer = rag_chain.invoke(chain_inputs)["answer"]

    # Record this turn so that follow-up questions can refer back to it.
    user_turn = HumanMessage(content=question)
    assistant_turn = AIMessage(content=answer)
    chat_history.extend([user_turn, assistant_turn])

    return answer

In [59]:
# Start a fresh conversation; retrieve_and_generate extends this list in place
# with the question and the answer of each turn.
chat_history = []
question = "Explain self attention?"
llm_response = retrieve_and_generate(question=question,chat_history = chat_history)
# Bare last expression so the notebook displays the answer string.
llm_response

"Self-attention allows a model to consider other parts of the input sequence when processing each word.  For each word, it creates query, key, and value vectors; the query vector's dot product with each key vector produces a score indicating relevance.  This weighting of other words helps create a richer encoding for the current word.\n"

In [60]:
# Follow-up question whose "it" refers to the previous turn; the same
# chat_history list is passed again so the chain receives that earlier exchange.
question = "what is the advantages of it?"
llm_response = retrieve_and_generate(question=question,chat_history = chat_history)
# Bare last expression so the notebook displays the answer string.
llm_response

'Self-attention allows the model to weigh the importance of different words in a sentence when processing each word, leading to better understanding of context and relationships between words.  This is particularly useful for long sequences where RNNs might struggle to maintain context across many time steps.  Finally, it enables parallel processing, unlike recurrent models, speeding up training and inference.\n'