In [5]:
!pip install chromadb
!pip install arxiv
!pip install requests
!pip install langchain
!pip install lanchain_chroma
!pip install langchain_community
!pip install langchain_openai
!pip install langchain-core
!langchain-text-splitters
!pip install pypdf
!pip install python-dotenv

import os
import requests
from dotenv import load_dotenv
import arxiv
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

load_dotenv()

pdf_url = "https://arxiv.org/abs/2210.03629"
local_path = "./react.pdf"

# Download the paper from Arxiv if it doesn't exist locally
if not os.path.exists(local_path):
    print("Downloading ReAct paper...")
    client = arxiv.Client()
    paper = next(client.results(arxiv.Search(id_list=["2210.03629"])))
    print(f"Downloading: {paper.title}")
    paper.download_pdf(filename=local_path, dirpath=".")
    print(f"PDF saved to {local_path}")

# Load and split PDF
documents = PyPDFLoader(local_path).load()
chunks = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
).split_documents(documents)

# Setup RAG chain
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory="db"
)

# Set up the prompt for documentsummarization
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant that creates concise and accurate "
                "summaries of documents. Use the following context to create a summary. "
                "If you don't know the answer, just say that you don't know.\n\n"
                "Context: {context}\n\n"
                "Please provide a clear and concise summary of the document."),
])

# Create the retrieval chain
chain = create_retrieval_chain(
    vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3}),
    create_stuff_documents_chain(
        llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
        prompt=prompt
    )
)

# Generate summary
response = chain.invoke({
    "input": "Please provide a comprehensive summary of this document."
})
print("\nDocument Summary:")
print("-" * 50)
print(response["answer"])


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[31mERROR: Could not find a version that satisfies the requirement lanchain_chroma (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for lanchain_chroma[0m[31m
[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
zsh:1: command not found: langchain-text-splitters
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Downloading PDF...


invalid pdf header: b'\n\n\n\n\n'
EOF marker not found


PdfStreamError: Stream has ended unexpectedly