In [1]:
import langchain_ollama
import os

from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA

from langchain_core.prompts import PromptTemplate

In [2]:
# Model configuration.
# Requires a local Ollama API instance (start it with `ollama serve`).
# The same model tag is used for both embeddings and chat; currently llama 3.2 3b.
# NOTE(review): a dedicated embedding model may retrieve better than a chat model -
# confirm before switching, since that would require rebuilding the vector store.
OLLAMA_MODEL = "llama3.2"

embeddings = langchain_ollama.OllamaEmbeddings(model=OLLAMA_MODEL)

llm = langchain_ollama.ChatOllama(
    model=OLLAMA_MODEL,
    temperature=0.0,
    num_predict=512,  # max number of tokens to generate
)

# Folder in which the Chroma vector store is persisted.
db_directory = "./test_chroma_db"

## Set up the Vector Store Retriever

In [3]:
# Build the vector store on the first run; on later runs reopen the persisted one.
# The existence of db_directory is the marker for "documents were already ingested",
# so it must be evaluated BEFORE the Chroma store is constructed below.
store_already_exists = os.path.exists(db_directory)

if not store_already_exists:
    # Load documents. Metadata tracks paper and page number; each page is a single document.
    pdf_directory = "./test_data/"
    loader = PyPDFDirectoryLoader(pdf_directory)
    docs = loader.load()

    # Fail before anything is written to disk. Otherwise an empty store would be
    # persisted and every later run would skip ingestion because db_directory exists.
    if not docs:
        raise FileNotFoundError(
            f"No PDF pages could be loaded from {pdf_directory!r}; "
            f"refusing to create an empty vector store at {db_directory!r}."
        )

    # Optional step: split the docs into smaller chunks to fit into the context window
    # of the model (model dependent, necessary for small models).
    # !! Test this: shorter chunks may lead to bad retrieval results !!
    # Potential remedy: use whole pages, but let the model summarise each page before
    # chaining it into the context.
    # text_splitter = RecursiveCharacterTextSplitter(
    #     chunk_size=1000,  # chunk size (characters)
    #     chunk_overlap=200,  # chunk overlap (characters)
    #     add_start_index=True,  # track index in original document
    #     )
    # docs = text_splitter.split_documents(docs)

# The vector store can get quite large (and takes time to initialize in memory),
# so a Chroma database persisted in db_directory is used to store the vectors.
vector_store = Chroma(
    collection_name="lit_helper_test",
    embedding_function=embeddings,
    persist_directory=db_directory,  # save data locally
)

if not store_already_exists:
    vector_store.add_documents(docs)  # embed and store the freshly loaded pages


# Expose the vector store as a retriever.
# MMR (Maximal Marginal Relevance) aims to diversify search results; the amount of
# diversification is set via lambda_mult. Make sure the number of documents passed
# on (k) fits into the context window of the model.
mmr_search_kwargs = {"k": 6, "fetch_k": 30, "lambda_mult": 0.3}

retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_kwargs,
)

In [4]:
# Test the retriever with a sample query; the cell output is the list of
# retrieved documents together with their source/page metadata.
retriever.invoke("Habermas") # test the retriever

[Document(metadata={'page': 5, 'source': 'test_data\\Jungherr & Schroeder 2021 Digital Transformations of the Public Arena.pdf'}, page_content='Contents\n1 Digital Transformations of the Public Arena 1\n2 The Public Arena: A Deﬁnition 3\n3 Variations 12\n4 Tensions 23\n5 Areas of Contention 48\n6 The Public Arena: Conditions, Consequences, and\nResponsibilities 53\n7 Coda 61\nReferences 63\nhttps://doi.org/10.1017/9781009064484\n Published online by Cambridge University Press'),
 Document(metadata={'page': 67, 'source': 'test_data\\Jungherr & Schroeder 2021 Digital Transformations of the Public Arena.pdf'}, page_content='scholars to overestimate the importance of the phenomenon of communication.\nSociology and political science are deeply attuned to the structures of social life,\nbut they neglect– or in any case have few tools to analyze– the role of media\nand communication. We have combined the two and presented an argument that\nputs both communications and its structural constrain

In [11]:
# Length (in characters) of the first document returned for the sample query.
# Helps to judge whether k whole pages fit into the model's context window.
top_hit = retriever.invoke("Habermas")[0]
len(top_hit.page_content)

3679

## Build a RAG Chain

In [5]:
# Prompt for the RAG chain. It is assembled line by line so that the trailing
# spaces at the end of the first two lines are visible; the resulting string is
# identical to the equivalent triple-quoted block.
# Placeholders: {context} (retrieved text) and {question} (user question).
template = (
    "You are a helpful assistant for finding relevant text passages in scientific literature. \n"
    "Use the following pieces of retrieved context to answer the question. "
    "When using the retrieved context, you will always provide the source and page number. \n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Helpful answer:"
)

rag_prompt = PromptTemplate.from_template(template)

This is overkill for a test case, but we use LangGraph anyway in case we want to reuse these components later or deploy the application in some other context (e.g. as a stream).

In [6]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    """State dictionary passed between the steps of the RAG graph.

    `question` is supplied by the caller, `context` is filled in by
    `retrieve` and `answer` is filled in by `generate`.
    """
    question: str  # the user question; read by retrieve() and generate()
    context: List[Document]  # documents returned by the retriever
    answer: str  # content of the LLM response


def retrieve(state: State):
    """Look up documents for the question in `state`.

    Returns a partial state update of the form {"context": [...]} holding
    whatever the module-level `retriever` returns for state["question"].
    """
    return {"context": retriever.invoke(state["question"])}


def _format_context_entry(doc):
    """Render one retrieved document as a citation header followed by its text."""
    # Metadata values are passed through exactly as stored by the loader.
    # NOTE(review): the stored page index may be 0-based - confirm before
    # presenting it to users as a printed page number.
    source = doc.metadata.get("source", "unknown source")
    page = doc.metadata.get("page", "unknown page")
    return f"[Source: {source} | Page: {page}]\n{doc.page_content}"


def generate(state: State):
    """Answer the question in `state` from the retrieved context.

    The prompt instructs the model to always provide the source and page
    number, so each context entry is prefixed with that metadata; the page
    text alone does not give the model anything to cite.

    Returns a partial state update of the form {"answer": <response text>}.
    """
    docs_content = "\n\n".join(_format_context_entry(doc) for doc in state["context"])
    messages = rag_prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

In [7]:
from langgraph.graph import START, StateGraph

# Assemble the two-step graph: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])  # nodes run in list order
graph_builder.add_edge(START, "retrieve")  # entry point of the graph
graph = graph_builder.compile()

In [8]:
# Sample question to run through the retrieve -> generate graph.
q = "What is the definition of the public sphere?"

# Only "question" is supplied; the result is read back via "context" and "answer".
result = graph.invoke({"question": q})

# Print the retrieved documents first so the answer can be checked against them.
print(f'Context: {result["context"]}\n\n')
print(f'Answer: {result["answer"]}')

Context: [Document(metadata={'page': 5, 'source': 'test_data\\Jungherr & Schroeder 2021 Digital Transformations of the Public Arena.pdf'}, page_content='Contents\n1 Digital Transformations of the Public Arena 1\n2 The Public Arena: A Deﬁnition 3\n3 Variations 12\n4 Tensions 23\n5 Areas of Contention 48\n6 The Public Arena: Conditions, Consequences, and\nResponsibilities 53\n7 Coda 61\nReferences 63\nhttps://doi.org/10.1017/9781009064484\n Published online by Cambridge University Press'), Document(metadata={'page': 36, 'source': 'test_data\\Jungherr & Schroeder 2021 Digital Transformations of the Public Arena.pdf'}, page_content='newsworthy information and feature it in their outlets. These top-down selec-\ntion decisions followed journalistic norms and brand identity. Today, these\ndecisions to feature stories in the news are important but additional processes\ndetermine which information gets ampliﬁed algorithmically and socially on\ndigital platforms. This has added an element of ins