In [65]:
import logging
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import SystemMessage
from langchain_text_splitters import CharacterTextSplitter

def load_pdf_and_split(path):
    """Load a PDF and return the text of the whole document as one string.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The text of every page joined by a blank line. An empty string is
        returned when the loader yields no pages.
    """
    # ``load()`` gives one Document per page. Every page is used here:
    # indexing only ``pages[0]`` would silently drop the rest of the PDF and
    # raise IndexError on a document with no extractable pages.
    pdf_pages = PyPDFLoader(path).load()
    # A blank line between pages lines up with CharacterTextSplitter's default
    # "\n\n" separator, so page boundaries become natural split points.
    return "\n\n".join(page.page_content for page in pdf_pages)

def split_text_into_chunks(text_chunks, chunk_size):
    """Split a block of text into documents using ``CharacterTextSplitter``.

    Args:
        text_chunks: The text to split; it is passed to the splitter as the
            only entry of a one-element list.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.

    Returns:
        Whatever ``CharacterTextSplitter.create_documents`` produces for the
        given text.
    """
    splitter = CharacterTextSplitter(chunk_size=chunk_size)
    texts = [text_chunks]
    return splitter.create_documents(texts)

def embed_documents(text_chunks, api_key, model):
    """Embed texts with a Google Generative AI embedding model.

    Args:
        text_chunks: Either a single string, or an iterable whose items are
            strings or objects exposing a ``page_content`` attribute (such as
            the documents produced by a text splitter).
        api_key: Google API key used to build the embedding client.
        model: Embedding model name, e.g. ``"models/embedding-001"``.

    Returns:
        The result of ``GoogleGenerativeAIEmbeddings.embed_documents`` for
        the normalized list of texts.
    """
    if isinstance(text_chunks, str):
        # A bare string is itself iterable; without this guard it would be
        # embedded one character at a time.
        text_chunks = [text_chunks]
    # Accept document objects as well as plain strings.
    texts = [getattr(chunk, "page_content", chunk) for chunk in text_chunks]
    embedding_model = GoogleGenerativeAIEmbeddings(google_api_key=api_key, model=model)
    return embedding_model.embed_documents(texts)

def create_vector_store(chunks, embedding_model, persist_directory=None):
    """Build a Chroma vector store from document chunks.

    Args:
        chunks: Documents to index.
        embedding_model: Embedding function used by Chroma to vectorize the
            documents.
        persist_directory: Optional directory for on-disk storage. When
            omitted the store is created without a persistence location.

    Returns:
        The populated ``Chroma`` instance.
    """
    db = Chroma.from_documents(
        chunks, embedding_model, persist_directory=persist_directory
    )
    # Persisting only makes sense when a directory was configured.
    # NOTE(review): the langchain_community Chroma wrapper is understood to
    # reject persist() without a persist_directory, and newer chromadb
    # releases persist automatically -- confirm against the installed version.
    if persist_directory is not None:
        db.persist()
    return db

def create_retriever(db_connection, k=5):
    """Create a retriever from a vector store.

    Args:
        db_connection: Object exposing ``as_retriever(search_kwargs=...)``,
            such as the store returned by ``create_vector_store``.
        k: Number of results requested per query; defaults to 5.

    Returns:
        The object returned by ``db_connection.as_retriever``.
    """
    return db_connection.as_retriever(search_kwargs={"k": k})

def build_rag_chain(retriever, chat_template, api_key, model):
    """Compose the retrieval-augmented generation chain.

    The chain maps its input to a ``context`` (the retrieved documents'
    ``page_content`` joined by blank lines) and a ``question`` (the input
    passed through unchanged), feeds both to ``chat_template``, sends the
    prompt to a ``ChatGoogleGenerativeAI`` model and parses the reply with
    ``StrOutputParser``.

    Args:
        retriever: Runnable that returns documents for a query.
        chat_template: Prompt template with ``context`` and ``question``
            variables.
        api_key: Google API key for the chat model.
        model: Chat model name.

    Returns:
        The composed runnable chain.
    """
    def _join_page_contents(docs):
        # Flatten the retrieved documents into one context string.
        return "\n\n".join(doc.page_content for doc in docs)

    prompt_inputs = {
        "context": retriever | _join_page_contents,
        "question": RunnablePassthrough(),
    }
    llm = ChatGoogleGenerativeAI(google_api_key=api_key, model=model)
    answer_stage = llm | StrOutputParser()
    return prompt_inputs | chat_template | answer_stage

def main(pdf_path='/content/RAG System.pdf', api_key_path="/content/api.txt"):
    """Index a PDF, ask the user for one question, and print the answer.

    Args:
        pdf_path: Path of the PDF to index.
        api_key_path: Path of a text file containing the Google API key.
    """
    # Read the key first so a missing or unreadable key file fails before any
    # PDF parsing or network calls happen.
    with open(api_key_path, encoding="utf-8") as f:
        api_key = f.read().strip()

    text_chunks = load_pdf_and_split(pdf_path)
    chunks = split_text_into_chunks(text_chunks, chunk_size=300)

    # One embedding client is enough. The vector store embeds the chunks
    # itself, so no separate embedding pass is made here.
    embedding_model = GoogleGenerativeAIEmbeddings(
        google_api_key=api_key, model="models/embedding-001"
    )
    db = create_vector_store(chunks, embedding_model)
    # Query the store that was just populated rather than constructing a
    # second Chroma instance and relying on it sharing the same collection.
    retriever = create_retriever(db)

    chat_template = ChatPromptTemplate.from_messages([
        SystemMessage(content="I'm a helpful AI assistant. I'll use the provided document to answer your questions."),
        HumanMessagePromptTemplate.from_template("""Answer the following question based on the provided context:

        Context:
        {context}

        Question:
        {question}

        Answer:""")
    ])
    model = "gemini-1.5-pro-latest"
    rag_chain = build_rag_chain(retriever, chat_template, api_key, model)

    user_question = input("Enter your question: ")
    logging.info("User question: %s", user_question)
    response = rag_chain.invoke(user_question)
    print(response)

# Run the interactive question-answering flow only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()


Enter your question: Can you explain the main contribution of the Leave No Context Behind paper?




## Leave No Context Behind: Main Contribution

The main contribution of the "Leave No Context Behind" paper is the introduction of **Infini-attention**, a novel attention mechanism designed to efficiently scale Transformer-based Large Language Models (LLMs) to handle infinitely long input sequences while maintaining bounded memory and computation requirements. 

Here's a breakdown of how Infini-attention achieves this:

* **Combines Local and Long-Term Attention:** Infini-attention integrates both masked local attention (focusing on recent context) and long-term linear attention (accessing information from the distant past) within a single Transformer block. This allows the model to capture both immediate and historical context effectively.
* **Compressive Memory:** The key innovation lies in incorporating a compressive memory system into the attention mechanism. Unlike traditional attention, which has memory requirements that grow quadratically with sequence length, compressive memory