In [1]:
!python -m pip install --upgrade pip




In [2]:
!pip install --upgrade --quiet langchain langchain-community langchain-openai

In [3]:
!pip install --upgrade --quiet chromadb pypdf streamlit python-dotenv

## Import packages

In [4]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv

In [6]:
load_dotenv()

True

In [7]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Define LLM


In [8]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
#llm.invoke("Tell me a joke about cats")

## Process PDF document

### Load PDF document

In [9]:
loader = PyPDFLoader("data/test_rag2.pdf")
pages = loader.load()


In [10]:
pages

[Document(metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'Elsevier', 'creationdate': '2024-12-05T14:24:32+00:00', 'crossmarkdomains[1]': 'elsevier.com', 'crossmarkmajorversiondate': '2010-04-23', 'creationdate--text': '5th December 2024', 'elsevierwebpdfspecifications': '7.0.1', 'robots': 'noindex', 'moddate': '2024-12-05T14:24:42+00:00', 'author': 'Katri Lahti', 'doi': '10.1016/j.chiabu.2024.107140', 'title': 'Victimization, immigration status, and psychosocial well-being: A representative study among finnish adolescents', 'keywords': 'Adolescence,Immigration,Victimization,Response integrity', 'subject': 'Child Abuse & Neglect, 158 (2024) 107140. doi:10.1016/j.chiabu.2024.107140', 'crossmarkdomains[2]': 'sciencedirect.com', 'crossmarkdomainexclusive': 'true', 'source': 'data/test_rag2.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}, page_content='Victimization, immigration status, and psychosocial well-being: A \nrepresentative study among finnish adolesc

### split document into chunks

In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000,
                                               chunk_overlap=200,
                                               length_function=len,
                                               separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

### create embedding


In [12]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002", openai_api_key = OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()
#test_vector = embedding_function.embed_query("cat")

In [13]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator="embedding_distance",
                           embeddings=embedding_function)

#evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

### Create vector database

In [14]:
import uuid

# create a new Chroma database from the documents
def create_vectorstore(chunks, embedding_function, vectorstore_path):
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

    # ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []
    
    unique_chunks = []
    for chunk, id in zip(chunks, ids):
        if id not in unique_ids:
            unique_ids.add(id)
            unique_chunks.append(chunk)
            
    vectorstore = Chroma.from_documents(documents=unique_chunks,
                                        ids=list(unique_ids),
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)
    vectorstore.persist()
    return vectorstore

In [15]:
# create vectorstore
vectorstore = create_vectorstore(chunks=chunks,
                                 embedding_function=embedding_function,
                                 vectorstore_path="vectorstore_chroma")

  vectorstore.persist()


## 2. Query for relevant data

In [16]:
# load vectorstore
vectorstore = Chroma(persist_directory= "vectorstore_chroma", embedding_function=embedding_function)

  vectorstore = Chroma(persist_directory= "vectorstore_chroma", embedding_function=embedding_function)


In [17]:
# create retriever and get relevant chunks
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("What is the title of the article?")
for chunk in relevant_chunks:
    print(chunk)

page_content='Received 30 June 2024; Received in revised form 2 November 2024; Accepted 4 November 2024  
Child Abuse & Neglect 158 (2024) 107140 
Available online 19 November 2024 
0145-2134/© 2024 The Authors. Published by Elsevier Ltd. This is an open access article under the CC BY license 
( http://creativecommons.org/licenses/by/4.0/ ).' metadata={'author': 'Katri Lahti', 'creationdate': '2024-12-05T14:24:32+00:00', 'creationdate--text': '5th December 2024', 'creator': 'Elsevier', 'crossmarkdomainexclusive': 'true', 'crossmarkdomains[1]': 'elsevier.com', 'crossmarkdomains[2]': 'sciencedirect.com', 'crossmarkmajorversiondate': '2010-04-23', 'doi': '10.1016/j.chiabu.2024.107140', 'elsevierwebpdfspecifications': '7.0.1', 'keywords': 'Adolescence,Immigration,Victimization,Response integrity', 'moddate': '2024-12-05T14:24:42+00:00', 'page': 0, 'page_label': '1', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'robots': 'noindex', 'source': 'data/test_rag2.pdf', 'subject': 'Child Abuse

In [18]:
# prompt template
PROMPT_TEMPLATE = """ 
You are an assistant for question_answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

In [28]:
# concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# create prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text,
                                question="What is the abstract?")

print(prompt)


Human:  
You are an assistant for question_answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

Received 30 June 2024; Received in revised form 2 November 2024; Accepted 4 November 2024  
Child Abuse & Neglect 158 (2024) 107140 
Available online 19 November 2024 
0145-2134/© 2024 The Authors. Published by Elsevier Ltd. This is an open access article under the CC BY license 
( http://creativecommons.org/licenses/by/4.0/ ).

---

European Journal of Criminology, 9 (1), 23 – 37. https://doi.org/10.1177/1477370811421644
Waehrer, G. M., Miller, T. R., Silverio Marques, S. C., Oh, D. L., & Burke Harris, N. (2020). Disease burden of adverse childhood experiences across 14 states. PLoS One, 
15 (1), Article e0226134. https://doi.org/10.1371/journal.pone.0226134
Xie, M., & Baumer, E. P. (2020). Immigrant status, citizenship, and victimization risk in the United States: New findings 

## 3.Generate responses

In [29]:
llm.invoke(prompt)

AIMessage(content="I don't know.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 1004, 'total_tokens': 1009, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_7fcd609668', 'finish_reason': 'stop', 'logprobs': None}, id='run-827380c3-7bc7-479b-b428-8b5804621729-0', usage_metadata={'input_tokens': 1004, 'output_tokens': 5, 'total_tokens': 1009, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

### Using Langchain expression language

In [31]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    |llm
)
rag_chain.invoke("what is the second paragraph?")

AIMessage(content="The second paragraph discusses the inclusion of meta-questions regarding parental corporal punishment and sexual abuse in the study's questionnaire. It mentions that these questions were presented to every respondent, regardless of their earlier answers about victimization experiences. Participants who denied the target victimization question were included in further analyses, and if a respondent answered a question about physical abuse, the corresponding meta-question was presented later. The questions were answered in a dichotomous manner (yes or no).", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 93, 'prompt_tokens': 612, 'total_tokens': 705, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_709714d

### Generate structured responses

In [32]:
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    paper_title: str= Field(description="Title of the paper")
    paper_summary: str= Field(description="Summary of the paper")
    publication_year: int = Field(description="Year of publication of the paper")
    paper_authors: str = Field(description="Names of the authors of the paper")

In [33]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    |prompt_template
    |llm.with_structured_output(ExtractedInfo, strict=True)
)

rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")



ExtractedInfo(paper_title='rates of post-traumatic stress disorder in trauma-exposed children and adolescents: meta-analysis', paper_summary='This work was supported by the Foundation for Pediatric Research and the Helsinki University Hospital Research Funds, focusing on the rates of post-traumatic stress disorder in trauma-exposed children and adolescents.', publication_year=2024, paper_authors='Alisic, E., Zalta, A. K., van Wesel, F., Larsen, S. E., Hafstad, G. S., Hassanpour, K., & Smid, G. E.')