In [2]:
import langchain_ollama
import os

from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA

from langchain_core.prompts import PromptTemplate

In [3]:
# set model
# run ollama serve for local API instance
# currently running llama 3.2 3b
embeddings = langchain_ollama.OllamaEmbeddings(model = "llama3.2")

llm = langchain_ollama.ChatOllama(
    model = "llama3.2",
    temperature = 0.0,
    num_predict = 512, # max number of tokens to generate
    )

## Set up  the Vectorstore Retriever

In [48]:
# check if the folder db_directory already exists. If not, creat it and load the documents into the vector store. Else, use the existing vector store. Makes a new DB for shortened docs

db_directory = "./test_chroma_db" # directory to save the vector store

short_docs = True # shorten document from pages to chunks? (set chunk size below)

retriever_type = "mmr" # "similarity"  or "mmr"

if short_docs:
    db_directory += "_short"

if not os.path.exists(db_directory):
    # load documents
    loader = PyPDFDirectoryLoader(
    "./test_data/")
    docs = loader.load() # metadata tracks paper and page number; each page is a single document

    # optional step: split the docs into smaller chunks to fit into context window of the model (model dependant, necessary for small models) -!! test this, shorter chunks may lead to bad retrieval results !!-
    #           potential remedy: use whole pages, but use the model to summarise each page before chaining it into the context
    if short_docs:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,  # chunk size (characters)
            chunk_overlap=200,  # chunk overlap (characters)
            add_start_index=True,  # track index in original document
            )
        docs = text_splitter.split_documents(docs)

    # make doc vector store. as the vector store can get quite large (and takes time to initialize in memory), we use a chroma database to store the vectors    
    vector_store = Chroma(
        collection_name="lit_helper_test",
        embedding_function=embeddings,
        persist_directory=db_directory,  # save data locally
        )
    vector_store.add_documents(docs) # add docs

else:
    vector_store = Chroma(
        collection_name="lit_helper_test",
        embedding_function=embeddings,
        persist_directory=db_directory,  # save data locally
    )


docs_returned = 6 # number of docs returned by the retriever(s)

# turn the vector store into retriever(s)

if retriever_type == "mmr":
    retriever = vector_store.as_retriever(
        search_type="mmr", # MMR (Maximal Marginal Relevance) aims to diversify search results. the amount of diversification is set via the lambda_mult parameter
        search_kwargs={"k": docs_returned, "fetch_k": 30, "lambda_mult": 0.8}, # make sure the number of documents passed (k) fits into the context window
    )

if retriever_type == "similarity":
    retriever = vector_store.as_retriever(
        search_type="similarity", # similarity score; optionally with threshold ("similarity_score_threshold" with "score_threshold" kwarg)
        search_kwargs={"k": docs_returned}, # make sure the number of documents passed (k) fits into the context window
    )

In [5]:
# test the retriever
for doc in retriever.invoke("Habermas"):
    print(f'{doc.metadata["source"]} p.{doc.metadata["page"]}:\n{doc.page_content} \n\n')

test_data\Dahlberg 2011 Re-Constructing Digital Democracy.pdf p.2:
Dahlberg 857
third element refers to the specific ways in which digital media is seen to be supporting 
the development of the other two elements. I have chosen the term affordances as it 
broadly captures how all the positions tend to understand the human–technology 
relationship. In general terms, the relationship is one where the technology is seen to 
have certain features that enable (afford) particular democratic uses and outcomes. 1 
Finally, I explored how digital democracy commentary and practice was positioned in 
relation to these three immanent, interrelated, and mutually effecting elements. This 
enabled me to reconstruct  a number of positions, including the four outlined in this 
article.2 As interpretatively reconstructed representations of the articulations of empirical 
instances, the resulting positions are open to (and indeed invite) challenge and rewriting 
in digital democracy scholarship. 
The emp

In [None]:
len(retriever.invoke("Habermas")[0].page_content)

4646

## Build a RAG Chain

In [49]:
template = """You are a helpful assistant for finding relevant text passages in scientific literature. 
Use the following pieces of retrieved context to answer the question. Each piece of content starts with the indicators "Source" and "Page". Always provide these in your answer when using one of the retrieved passages.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}

Helpful answer:"""

rag_prompt = PromptTemplate.from_template(template)

In [50]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    question: str
    context: List[Document]
    answer: str
    prompt: str


def retrieve(state: State):
    retrieved_docs = retriever.invoke(state["question"])
    return {"context": retrieved_docs}
    

def generate(state: State):
    docs_content = "\n\n".join(f'Source: {doc.metadata["source"]}\nPage:{doc.metadata["page"]}\n{doc.page_content}' for doc in state["context"])
    messages = rag_prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content,
            "prompt": messages} # return the prompt for debugging purposes

In [51]:
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [52]:
q = "What is the definition of the public sphere?"

result = graph.invoke({"question": q})

for doc in result["context"]:
    print(f'{doc.metadata["source"]} p.{doc.metadata["page"]}:\n{doc.page_content} \n\n')

print(f'Answer: {result["answer"]}')

test_data\Dahlberg 2011 Re-Constructing Digital Democracy.pdf p.14:
Dahlberg L (2001a) Democracy via cyberspace: Examining the rhetorics and practices of three 
prominent camps. New Media & Society 3(2): 187–207. 
Dahlberg L (2001b) The internet and democratic discourse: Exploring the prospects of online 
deliberative forums extending the public sphere. Information, Communication & Society 4(4): 
615–633. 


test_data\Dahlberg 2011 Re-Constructing Digital Democracy.pdf p.15:
Graham T (2009) What’s wife swap got to do with it? Talking politics in the net-based public sphere. 
Doctoral Dissertation, University of Amsterdam, Amsterdam School of Communications.
Habermas J (1984) The Theory of Communicative Action  (trans. Thomas McCarthy). (V ol. 1, 
Reason and the Rationalization of Society). Boston, MA: Beacon Press. 


test_data\Jungherr & Schroeder 2021 Digital Transformations of the Public Arena.pdf p.78:
Party to the Anti-Trump Resistance. New York: Oxford University Press.
Slater, D

In [53]:
print(f'Prompt passed to the model after retreival and summarisation:\n{result["prompt"]} \n\nLength of the prompt: {len(str(result["prompt"]))}') 

Prompt passed to the model after retreival and summarisation:
text='You are a helpful assistant for finding relevant text passages in scientific literature. \nUse the following pieces of retrieved context to answer the question. Each piece of content starts with the indicators "Source" and "Page". Always provide these in your answer when using one of the retrieved passages.\nIf you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nSource: test_data\\Dahlberg 2011 Re-Constructing Digital Democracy.pdf\nPage:14\nDahlberg L (2001a) Democracy via cyberspace: Examining the rhetorics and practices of three \nprominent camps. New Media & Society 3(2): 187–207. \nDahlberg L (2001b) The internet and democratic discourse: Exploring the prospects of online \ndeliberative forums extending the public sphere. Information, Communication & Society 4(4): \n615–633.\n\nSource: test_data\\Dahlberg 2011 Re-Constructing Digital Democracy.pdf\nPage:15\nGraham T (2009

### With structured Output

In [54]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict, Annotated


class AnswerWithSources(TypedDict):
    """An answer to the question, with sources."""

    answer: str
    sources: Annotated[
        List[str],
        ...,
        "List of sources used to answer the question",
    ]


class State(TypedDict):
    question: str
    context: List[Document]
    answer: AnswerWithSources
    prompt: str


def retrieve(state: State):
    retrieved_docs = retriever.invoke(state["question"])
    return {"context": retrieved_docs}
    

def generate(state: State):
    docs_content = "\n\n".join(f'Source: {doc.metadata["source"]}\nPage:{doc.metadata["page"]}\n{doc.page_content}' for doc in state["context"])
    messages = rag_prompt.invoke({"question": state["question"], "context": docs_content})
    structured_llm = llm.with_structured_output(AnswerWithSources)
    response = structured_llm.invoke(messages)
    return {"answer": response,
            "prompt": messages} # return the prompt for debugging purposes

In [55]:
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [56]:
q = "What is the definition of the public sphere?"

result = graph.invoke({"question": q})

for doc in result["context"]:
    print(f'{doc.metadata["source"]} p.{doc.metadata["page"]}:\n{doc.page_content} \n\n')

print(f'Answer: {result["answer"]}')

test_data\Dahlberg 2011 Re-Constructing Digital Democracy.pdf p.14:
Dahlberg L (2001a) Democracy via cyberspace: Examining the rhetorics and practices of three 
prominent camps. New Media & Society 3(2): 187–207. 
Dahlberg L (2001b) The internet and democratic discourse: Exploring the prospects of online 
deliberative forums extending the public sphere. Information, Communication & Society 4(4): 
615–633. 


test_data\Dahlberg 2011 Re-Constructing Digital Democracy.pdf p.15:
Graham T (2009) What’s wife swap got to do with it? Talking politics in the net-based public sphere. 
Doctoral Dissertation, University of Amsterdam, Amsterdam School of Communications.
Habermas J (1984) The Theory of Communicative Action  (trans. Thomas McCarthy). (V ol. 1, 
Reason and the Rationalization of Society). Boston, MA: Beacon Press. 


test_data\Jungherr & Schroeder 2021 Digital Transformations of the Public Arena.pdf p.78:
Party to the Anti-Trump Resistance. New York: Oxford University Press.
Slater, D

In [57]:
print(f'Prompt passed to the model after retreival and summarisation:\n{result["prompt"]} \n\nLength of the prompt: {len(str(result["prompt"]))}') 

Prompt passed to the model after retreival and summarisation:
text='You are a helpful assistant for finding relevant text passages in scientific literature. \nUse the following pieces of retrieved context to answer the question. Each piece of content starts with the indicators "Source" and "Page". Always provide these in your answer when using one of the retrieved passages.\nIf you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nSource: test_data\\Dahlberg 2011 Re-Constructing Digital Democracy.pdf\nPage:14\nDahlberg L (2001a) Democracy via cyberspace: Examining the rhetorics and practices of three \nprominent camps. New Media & Society 3(2): 187–207. \nDahlberg L (2001b) The internet and democratic discourse: Exploring the prospects of online \ndeliberative forums extending the public sphere. Information, Communication & Society 4(4): \n615–633.\n\nSource: test_data\\Dahlberg 2011 Re-Constructing Digital Democracy.pdf\nPage:15\nGraham T (2009

In [58]:
import json

print(json.dumps(result["answer"], indent=2)) # the sources provided here are not in the provided context - that is, they are made up and not part of the RAG pipeline!

null


The structured output prompting tool seems to lead to considerable hallucinations in the model. Rather than naming the provided sources, as in the non-structured output, the model makes up new sources not provided in the context window. In other words, the sources referred to by the model are NOT part of the RAG pipeline!
Additionally note that, when provided with chunks rather than full pages, the model fails completey, not giving any answer (none/null).

## RAG with Summarisation Step

In [14]:
# check if the folder db_directory already exists. If not, creat it and load the documents into the vector store. Else, use the existing vector store. Makes a new DB for shortened docs

db_directory = "./test_chroma_db" # directory to save the vector store

short_docs = False # for the summarisation pipeline, we can use unshortened docs

retriever_type = "mmr" # "similarity"  or "mmr"

if short_docs:
    db_directory += "_short"

if not os.path.exists(db_directory):
    # load documents
    loader = PyPDFDirectoryLoader(
    "./test_data/")
    docs = loader.load() # metadata tracks paper and page number; each page is a single document

    # optional step: split the docs into smaller chunks to fit into context window of the model (model dependant, necessary for small models) -!! test this, shorter chunks may lead to bad retrieval results !!-
    #           potential remedy: use whole pages, but use the model to summarise each page before chaining it into the context
    if short_docs:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,  # chunk size (characters)
            chunk_overlap=200,  # chunk overlap (characters)
            add_start_index=True,  # track index in original document
            )
        docs = text_splitter.split_documents(docs)

    # make doc vector store. as the vector store can get quite large (and takes time to initialize in memory), we use a chroma database to store the vectors    
    vector_store = Chroma(
        collection_name="lit_helper_test",
        embedding_function=embeddings,
        persist_directory=db_directory,  # save data locally
        )
    vector_store.add_documents(docs) # add docs

else:
    vector_store = Chroma(
        collection_name="lit_helper_test",
        embedding_function=embeddings,
        persist_directory=db_directory,  # save data locally
    )


docs_returned = 6 # number of docs returned by the retriever(s)

# turn the vector store into retriever(s)

if retriever_type == "mmr":
    retriever = vector_store.as_retriever(
        search_type="mmr", # MMR (Maximal Marginal Relevance) aims to diversify search results. the amount of diversification is set via the lambda_mult parameter
        search_kwargs={"k": docs_returned, "fetch_k": 30, "lambda_mult": 0.8}, # make sure the number of documents passed (k) fits into the context window
    )

if retriever_type == "similarity":
    retriever = vector_store.as_retriever(
        search_type="similarity", # similarity score; optionally with threshold ("similarity_score_threshold" with "score_threshold" kwarg)
        search_kwargs={"k": docs_returned}, # make sure the number of documents passed (k) fits into the context window
    )

In [21]:
template = """You are a helpful assistant for finding relevant text passages in scientific literature. 
You can use the following summaries of retrieved text passages to answer the question. Each summary starts with the indicators "Source" and "Page". Always provide these in your answer if using one of the retrieved passages. 
Only use summaries that are relevant to the question at hand.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}

Helpful answer:"""

rag_prompt = PromptTemplate.from_template(template)


summarise_template = """Summarise the following text passage with regard to this question: {question}

{text}

Summary:"""

summarise_prompt = PromptTemplate.from_template(summarise_template)

In [39]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    question: str
    text: str
    summaries: List[Document]
    context: List[Document]
    answer: str
    prompt: str


def retrieve(state: State):
    retrieved_docs = retriever.invoke(state["question"])
    return {"context": retrieved_docs}
    

def summarise(state: State):
    summaries = []
    for doc in state["context"]:
        text = doc.page_content
        messages = summarise_prompt.invoke({"question": state["question"], "text": text})
        summary = Document( # store the summary as a Document type object for consistency
            page_content = llm.invoke(messages).content,
            metadata = {"source": doc.metadata["source"], "page": doc.metadata["page"]})
        summaries.append(summary)
    return {"summaries": summaries}


def generate(state: State):
    summaries = "\n\n".join(f'Source: {doc.metadata["source"]}\nPage:{doc.metadata["page"]}\n{doc.page_content}' for doc in state["summaries"])
    messages = rag_prompt.invoke({"question": state["question"], "context": summaries})
    response = llm.invoke(messages)
    return {"answer": response.content, 
            "prompt": messages} # return the prompt for debugging purposes


In [40]:
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([retrieve, summarise, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [41]:
q = "What is the definition of the public sphere?"

result = graph.invoke({"question": q}) # due to the summarisation step, this will take longer than the previous pipeline

for i in range(len(result["context"])):
    print(f'{result["context"][i].metadata["source"]} p.{result["context"][i].metadata["page"]}.\nText:\n{result["context"][i].page_content}\nSummary:\n{result["summaries"][i].page_content}\n\n')

print(f'Answer: {result["answer"]}')


test_data\Dahlberg 2011 Re-Constructing Digital Democracy.pdf p.2.
Text:
Dahlberg 857
third element refers to the specific ways in which digital media is seen to be supporting 
the development of the other two elements. I have chosen the term affordances as it 
broadly captures how all the positions tend to understand the human–technology 
relationship. In general terms, the relationship is one where the technology is seen to 
have certain features that enable (afford) particular democratic uses and outcomes. 1 
Finally, I explored how digital democracy commentary and practice was positioned in 
relation to these three immanent, interrelated, and mutually effecting elements. This 
enabled me to reconstruct  a number of positions, including the four outlined in this 
article.2 As interpretatively reconstructed representations of the articulations of empirical 
instances, the resulting positions are open to (and indeed invite) challenge and rewriting 
in digital democracy scholarship. 
T

In [43]:
print(f'Prompt passed to the model after retreival and summarisation:\n{result["prompt"]} \n\nLength of the prompt: {len(str(result["prompt"]))}') 

Prompt passed to the model after retreival and summarisation:
text='You are a helpful assistant for finding relevant text passages in scientific literature. \nYou can use the following summaries of retrieved text passages to answer the question. Each summary starts with the indicators "Source" and "Page". Always provide these in your answer if using one of the retrieved passages. \nOnly use summaries that are relevant to the question at hand.\nIf you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\n\nSource: test_data\\Dahlberg 2011 Re-Constructing Digital Democracy.pdf\nPage:2\nThe text passage does not explicitly define the public sphere, but it discusses the concept of "digital democracy" and its relationship with various theoretical positions.\n\nHowever, based on the context, it can be inferred that the author is discussing the concept of digital democracy in relation to the public sphere. The public sphere refers to a space where citizens e

In [34]:
result


{'question': 'What is the definition of the public sphere?',
 'summaries': [Document(metadata={'source': 'test_data\\Dahlberg 2011 Re-Constructing Digital Democracy.pdf', 'page': 2}, page_content='The text passage does not explicitly define the public sphere, but it discusses the concept of "digital democracy" and its relationship with various theoretical positions.\n\nHowever, based on the context, it can be inferred that the author is discussing the concept of digital democracy in relation to the public sphere. The public sphere refers to a space where citizens engage in public discourse, debate, and participation in democratic processes.\n\nThe passage suggests that digital media has enabled new forms of democratic engagement, but also notes that different theoretical positions have emerged to understand this relationship. These positions include liberal-individualist, deliberative, counter-publics, and autonomist Marxist, among others.\n\nWhile the passage does not explicitly defin