In [1]:
import langchain_ollama
import os

from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA

from langchain_core.prompts import PromptTemplate

In [2]:
# Model configuration.
# A local Ollama API instance must be running (start it with `ollama serve`).
# Currently running llama 3.2 3b.
db_directory = "./test_chroma_db"  # folder in which the Chroma vector store is persisted

# chat model used for generation; num_predict caps the number of generated tokens
llm = langchain_ollama.ChatOllama(model="llama3.2", temperature=0.0, num_predict=512)

# embedding model used to vectorise both the document chunks and the queries
embeddings = langchain_ollama.OllamaEmbeddings(model="llama3.2")

## Set up the Vector Store Retriever

In [16]:
# Open (or create) the persistent Chroma collection exactly once.
# As the vector store can get quite large (and takes time to initialise in memory),
# a Chroma database on disk is used to store the vectors.
vector_store = Chroma(
    collection_name="lit_helper_test",
    embedding_function=embeddings,
    persist_directory=db_directory,  # save data locally
)

# Populate the collection only when it holds no entries yet. Checking the collection
# itself (rather than whether the folder exists) also covers a folder left behind by
# an interrupted or failed earlier run, which would otherwise yield an empty store.
if not vector_store.get(limit=1)["ids"]:
    # load documents; metadata tracks paper and page number, each page is a single document
    loader = PyPDFDirectoryLoader("./test_data/")
    pages = loader.load()

    # optional step: split the pages into smaller chunks to fit into the context window of the
    # model (model dependent, necessary for small models)
    # -!! test this, shorter chunks may lead to bad retrieval results !!-
    #           potential remedy: use whole pages, but use the model to summarise each page
    #           before chaining it into the context
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # chunk size (characters)
        chunk_overlap=200,  # chunk overlap (characters)
        add_start_index=True,  # track index in original document
    )
    docs = text_splitter.split_documents(pages)

    vector_store.add_documents(docs)  # embed and store the chunks


# number of docs returned by the retriever(s);
# make sure this many documents (k) fit into the context window
docs_returned = 6

# Turn the vector store into retriever(s).

# MMR (Maximal Marginal Relevance) aims to diversify search results;
# the amount of diversification is set via the lambda_mult parameter.
retriever_mmr = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=docs_returned, fetch_k=30, lambda_mult=0.8),
)

# Plain similarity score; optionally with threshold
# ("similarity_score_threshold" with "score_threshold" kwarg).
retriever_similarity = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=docs_returned),
)

In [17]:
# Smoke test for the MMR retriever: print every hit with its source file and page.
mmr_hits = retriever_mmr.invoke("Habermas")
for hit in mmr_hits:
    source, page = hit.metadata["source"], hit.metadata["page"]
    print(f'{source} p.{page}:\n{hit.page_content} \n\n')

test_data\Jungherr & Schroeder 2021 Digital Transformations of the Public Arena.pdf p.82:
About the Series
Cambridge Elements in Politics and Communication publishes research focused on the
intersection of media, technology, and politics. The series emphasizes forward-looking
reviews of theﬁeld, path-breaking theoretical and methodological innovations, and the
timely application of social-scientiﬁc theory and methods to current developments in
politics and communication around the world.
https://doi.org/10.1017/9781009064484
 Published online by Cambridge University Press 


test_data\Fraser 1990 Rethinking the Public Sphere.pdf p.25:
that draw on concepts of mdtissage, see Gloria Anzaldda, Borderlands: La Frontera (1987) and 
Frangoise Lionnet, Autobiographical Voices: Race, Gender, Self-Portraiture (Ithaca NY: Comell 
University Press, 1989). 
29. In these respects, the concept of a public differs from that of a community. "Community" suggests a bounded and fairly homogeneous group, 

In [18]:
# Smoke test for the similarity retriever: print every hit with its source file and page.
similarity_hits = retriever_similarity.invoke("Habermas")
for hit in similarity_hits:
    source, page = hit.metadata["source"], hit.metadata["page"]
    print(f'{source} p.{page}:\n{hit.page_content} \n\n')

test_data\Barberá et al 2019 Who Leads Who Follows.pdf p.7:
Democratic and Republican supporters, attentive pub-
lics, and the general public are to the expressed agenda of
Republicans and Democrats in Congress over the two-
year period studied. Higher coefﬁcients indicate that
groups tend to discuss the same issues.
These initial results show potential for corroborating
the presence of political responsiveness at the issue
attention level, and they seem to indicate that some
responsiveness models have a stronger explanatory
power than others. In particular, these results provide
stronger support for the Supporter and, to a lesser
extent, the Attentive models, than for the Downsian
argument. There is a positive, and in some cases large,
correlation between the agenda of members of Con-
gress and the issues discussed by their constituents.
Nevertheless, when paying attention to the coefﬁcients
for speciﬁc groups, we observe the highest correlations
to be between members and their party 

In [44]:
# number of characters in the first chunk returned for the query
top_hit = retriever_similarity.invoke("Habermas")[0]
len(top_hit.page_content)

956

## Build a RAG Chain

In [37]:
# Prompt for the answer-generation step. It has two placeholders:
#   {context}  - the retrieved passages, each prefixed with "Source" and "Page" lines
#   {question} - the user's question
# The model is told to cite source and page and to admit when it does not know the answer.
template = """You are a helpful assistant for finding relevant text passages in scientific literature. 
Use the following pieces of retrieved context to answer the question. Each piece of content starts with the indicators "Source" and "Page". Always provide these in your answer when using one of the retrieved passages.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}

Helpful answer:"""

# build the prompt object that the `generate` step invokes with "question" and "context"
rag_prompt = PromptTemplate.from_template(template)

This is overkill for a test case, but we'll use LangGraph in case we want to reuse these components later or deploy this application in some other context (e.g. as a stream).

In [39]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    """State dictionary handed from step to step in the RAG graph."""

    # the user's question; read by both `retrieve` and `generate`
    question: str
    # documents returned by the retriever; written by `retrieve`, read by `generate`
    context: List[Document]
    # text of the model's reply; written by `generate`
    answer: str


def retrieve(state: State):
    """Look up the documents matching the question with the similarity retriever.

    Reads ``state["question"]`` and returns a partial state update that stores
    the retrieved documents under ``"context"``.
    """
    question = state["question"]
    return {"context": retriever_similarity.invoke(question)}
    

def generate(state: State):
    """Answer the question from the retrieved documents.

    Every document in ``state["context"]`` is rendered as a "Source"/"Page"
    header followed by its text; the rendered passages are joined with blank
    lines, inserted into ``rag_prompt`` together with the question, and sent to
    the chat model. Returns a partial state update with the reply text under
    ``"answer"``.
    """
    passages = [
        f'Source: {doc.metadata["source"]}\nPage:{doc.metadata["page"]}\n{doc.page_content}'
        for doc in state["context"]
    ]
    prompt_value = rag_prompt.invoke(
        {"question": state["question"], "context": "\n\n".join(passages)}
    )
    reply = llm.invoke(prompt_value)
    return {"answer": reply.content}

In [40]:
from langgraph.graph import START, StateGraph

# Assemble the two-step graph: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])  # nodes are named after the functions
graph_builder.add_edge(START, "retrieve")  # entry point of the graph
graph = graph_builder.compile()

In [42]:
# Run the RAG graph on a sample question, then show the retrieved chunks and the answer.
q = "What is the definition of the public sphere?"
result = graph.invoke({"question": q})

for chunk in result["context"]:
    meta = chunk.metadata
    print(f'{meta["source"]} p.{meta["page"]}:\n{chunk.page_content} \n\n')

answer = result["answer"]
print(f'Answer: {answer}')

test_data\Barberá et al 2019 Who Leads Who Follows.pdf p.7:
Democratic and Republican supporters, attentive pub-
lics, and the general public are to the expressed agenda of
Republicans and Democrats in Congress over the two-
year period studied. Higher coefﬁcients indicate that
groups tend to discuss the same issues.
These initial results show potential for corroborating
the presence of political responsiveness at the issue
attention level, and they seem to indicate that some
responsiveness models have a stronger explanatory
power than others. In particular, these results provide
stronger support for the Supporter and, to a lesser
extent, the Attentive models, than for the Downsian
argument. There is a positive, and in some cases large,
correlation between the agenda of members of Con-
gress and the issues discussed by their constituents.
Nevertheless, when paying attention to the coefﬁcients
for speciﬁc groups, we observe the highest correlations
to be between members and their party 

## RAG with Summarisation Step

In [53]:
# Prompt for the answer-generation step of the summarisation variant.
# NOTE: this rebinds `template` and `rag_prompt` from the earlier RAG section, so any
# `generate` step run after this cell uses this wording.
# Placeholders:
#   {context}  - the per-passage summaries, each prefixed with "Source" and "Page" lines
#   {question} - the user's question
template = """You are a helpful assistant for finding relevant text passages in scientific literature. 
Use the following summaries of retrieved text passages to answer the question. Each summary starts with the indicators "Source" and "Page". Always provide these in your answer when using one of the retrieved passages.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}

Helpful answer:"""

# build the prompt object that the `generate` step invokes with "question" and "context"
rag_prompt = PromptTemplate.from_template(template)

In [55]:
# Prompt for the per-passage summarisation step. It has two placeholders:
#   {question} - the user's question, so the summary focuses on what is being asked
#   {text}     - the text of one retrieved passage
summarise_template = """Summarise the following text passage with regard to this question: {question}

{text}

Summary:"""

# build the prompt object that the `summarise` step invokes with "question" and "text"
summarise_prompt = PromptTemplate.from_template(summarise_template)

# f'Source: {source}\nPage: {page}\nSummary:{summary}'

In [144]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    """State dictionary handed from step to step in the RAG graph with summarisation."""

    # the user's question; read by `retrieve`, `summarise` and `generate`
    question: str
    # NOTE(review): not read or written by any step visible in this notebook - confirm it is still needed
    text: str
    # one summary Document per retrieved passage; written by `summarise`, read by `generate`
    summaries: List[Document]
    # documents returned by the retriever; written by `retrieve`, read by `summarise`
    context: List[Document]
    # text of the model's reply; written by `generate`
    answer: str


def retrieve(state: State):
    """Fetch the passages for the question via the similarity retriever.

    Reads ``state["question"]`` and returns a partial state update holding the
    retrieved documents under ``"context"``.
    """
    hits = retriever_similarity.invoke(state["question"])
    update = {"context": hits}
    return update
    

def summarise(state: State):
    """Summarise every retrieved passage with regard to the question.

    Each document in ``state["context"]`` is sent to the chat model through
    ``summarise_prompt``. The reply is wrapped in a new ``Document`` that keeps
    the "source" and "page" metadata of the passage it summarises. Returns a
    partial state update with the summaries, in retrieval order, under
    ``"summaries"``.
    """

    def condense(passage: Document) -> Document:
        # one model call per passage
        text = passage.page_content
        prompt_value = summarise_prompt.invoke({"question": state["question"], "text": text})
        reply = llm.invoke(prompt_value)
        # store the summary as a Document type object for consistency
        return Document(
            page_content=reply.content,
            metadata={"source": passage.metadata["source"], "page": passage.metadata["page"]},
        )

    return {"summaries": [condense(passage) for passage in state["context"]]}


def generate(state: State):
    """Answer the question from the passage summaries.

    Every document in ``state["summaries"]`` is rendered as a "Source"/"Page"
    header followed by the summary text; the rendered blocks are joined with
    blank lines, inserted into ``rag_prompt`` together with the question, and
    sent to the chat model. Returns a partial state update with the reply text
    under ``"answer"``.
    """
    rendered = []
    for doc in state["summaries"]:
        rendered.append(
            f'Source: {doc.metadata["source"]}\nPage:{doc.metadata["page"]}\n{doc.page_content}'
        )
    context_block = "\n\n".join(rendered)

    prompt_value = rag_prompt.invoke({"question": state["question"], "context": context_block})
    reply = llm.invoke(prompt_value)
    return {"answer": reply.content}


In [145]:
from langgraph.graph import START, StateGraph

# Assemble the three-step graph: START -> retrieve -> summarise -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, summarise, generate])  # nodes are named after the functions
graph_builder.add_edge(START, "retrieve")  # entry point of the graph
graph = graph_builder.compile()

In [153]:
# Run the summarising RAG graph on a sample question and show, for every retrieved
# passage, the original text next to its summary, followed by the final answer.
q = "What is the definition of the public sphere?"

result = graph.invoke({"question": q})

# `summarise` appends one summary per context document, in order, so the two lists
# can be walked in parallel instead of being indexed by position.
for passage, summary in zip(result["context"], result["summaries"]):
    print(
        f'{passage.metadata["source"]} p.{passage.metadata["page"]}.\n'
        f'Text:\n{passage.page_content}\n'
        f'Summary:\n{summary.page_content}\n\n'
    )

print(f'Answer: {result["answer"]}')


test_data\Barberá et al 2019 Who Leads Who Follows.pdf p.7.
Text:
Democratic and Republican supporters, attentive pub-
lics, and the general public are to the expressed agenda of
Republicans and Democrats in Congress over the two-
year period studied. Higher coefﬁcients indicate that
groups tend to discuss the same issues.
These initial results show potential for corroborating
the presence of political responsiveness at the issue
attention level, and they seem to indicate that some
responsiveness models have a stronger explanatory
power than others. In particular, these results provide
stronger support for the Supporter and, to a lesser
extent, the Attentive models, than for the Downsian
argument. There is a positive, and in some cases large,
correlation between the agenda of members of Con-
gress and the issues discussed by their constituents.
Nevertheless, when paying attention to the coefﬁcients
for speciﬁc groups, we observe the highest correlations
to be between members and their 