In [2]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb 
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

In [8]:
# Construct path to .env file in parent directory
#dotenv_path_parent = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

False

In [13]:
# Load environment variables from .env file 
#by default search in the same directory
load_dotenv() #if printing out True: can see the path; if False: cannot see

True

## Define our LLM

In [14]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
#"gpt-4o-mini" is a smaller, cheap model, suitable for smaller tasks
#llm.invoke("Tell me a joke about cats")

# Part 1 RAG: Process

## Process PDF document

### Load PDF document

In [15]:
loader = PyPDFLoader("data/han_qsar.pdf")
pages = loader.load()
pages #load the text per pages
#returns a list of document objects. Each document object represents a page in the pdf.

[Document(metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'Elsevier', 'creationdate': '2023-06-19T03:00:26+00:00', 'crossmarkdomains[1]': 'elsevier.com', 'crossmarkmajorversiondate': '2010-04-23', 'creationdate--text': '19th June 2023', 'elsevierwebpdfspecifications': '7.0', 'robots': 'noindex', 'moddate': '2023-06-19T06:09:43+00:00', 'author': 'Han Ngoc Bao Nguyen', 'doi': '10.1016/j.rechem.2023.100888', 'title': 'Quantitative Structure-Activity Relationship (QSAR) modelling of the activity of anti-colorectal cancer agents featuring quantum chemical predictors and interaction terms', 'keywords': 'Anti-colorectal cancer agents,Interaction terms,Logistic regression,QSAR,Quantum chemical predictors', 'subject': 'Results in Chemistry, 5 (2023) 100888. doi:10.1016/j.rechem.2023.100888', 'crossmarkdomains[2]': 'sciencedirect.com', 'crossmarkdomainexclusive': 'true', 'source': 'data/han_qsar.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='Results in

* returns a list of document objects. Each document object represents a page in the pdf.
* a single document can be very long. cannot pass the giant chunk of text to LLM (1) there is a token limit to the requests we make on LLM 
* for document QA task, the answer to users' questions only lie in some parts of the document. we only need to put those relevant information as context to LLM to make it give the correct answer

### Split document

* split the document into even smaller chunks
* RecursiveCharacterTextSplitter: available from Langchain
    * chunk_size: how many characters does each chunk contain
    * separators: chunks are broken either double lines, line, or space
    * the function will do the best to split according to our contents
    * output: a list of chunks
    * play with this function to create larger or smaller chunks
    * if too big: too much irrelvant information
    * if too small: not enough context
    

In [17]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

In [18]:
len(chunks)

32

### Create embeddings

* TExt embeddings are ways of representing words and documents as numerical vectors that capture their meaning 
* the format that computers can contain and work with
* the vector is a huge vector - a coordinate in a very high dimensional space
* similar pieces of text will have vectors that are closer to each other in terms of meaning in this high dimensional space
* the distance between those vectors can be calculated using eucledian distance/ cosine similarity 
* many text embedding models are available. Good embedding -> better at capturing meaning of the test

In [9]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002", #is an embedding function from OpenAI
        openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()
test_vector = embedding_function.embed_query("cat") #turn the word cat into a vector

In [10]:
from langchain.evaluation import load_evaluator #use langchain to calculate difference between 2 pieces of text

evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

{'score': 0.17440875566198188}

In [13]:
evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

{'score': 0.22417909850229667}

* paris and coffeshop are less similar than amsterdam and coffeeshop.

### Create vector database

* Need a way to create, manage and query the embedding vector in a good way
* A vector database: like a library. We store chunks of information represented in vectors. WE use Chroma vector database in this function.
* How does this vector database query information?
- question: "What is the conclusion of the paper"
- database creates the embedding question
- database scans through all the embedding vectors to return the most relevant chunk to the question based on the similarity distance we calculated.
- this chunk is fed into LLM to generate answers

In [None]:
# create a Chroma database
vectorstore = Chroma.from_documents(documents=chunks,
                                    embedding= embedding_function,
                                    persist_diretory = "vectorstore" #a diretory path to save the vectordatabase later
                                    )
vectorstore.persis() #the vector store should be saved directly, but we can force save using this 

One problem
* if you happen to create vector embeddings for the same document twice, Chroma will save as 2 different chunks
* so Chroma will contain duplicated chunks
* so we implement a filter here, and only add unique documents with unique content to the database

In [18]:
import uuid
#wrap the whole thing in the function that takes chunks, embedding function and the path to vector store
def create_vectorstore(chunks, embedding_function, vectorstore_path):

    # Create a list of unique ids for each document based on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]
    
    # Ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []
    
    unique_chunks = [] 
    for chunk, id in zip(chunks, ids):     
        if id not in unique_ids:       
            unique_ids.add(id)
            unique_chunks.append(chunk) 

    # Create a new Chroma database from the documents
    vectorstore = Chroma.from_documents(documents=unique_chunks, #take our chunks
                                        ids=list(unique_ids),
                                        embedding=embedding_function, #embedding function for each chunk
                                        persist_directory = vectorstore_path)

    vectorstore.persist()
    
    return vectorstore

In [19]:
# Create vectorstore
vectorstore = create_vectorstore(chunks=chunks, 
                                 embedding_function=embedding_function, 
                                 vectorstore_path="vectorstore_test")

## 2. Query for relevant data

In [59]:
# Load vectorstore
vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

In [21]:
# Create retriever and get relevant chunks
retriever = vectorstore.as_retriever(search_type="similarity") #search_type: we are using the cosine distance to determine similarity
relevant_chunks = retriever.invoke("What is the title of the paper?") #for example, if we ask
relevant_chunks #we get back the relevant chunks - feed this to the LLM to create a high quality response

[Document(metadata={'page': 1, 'source': 'data/1995-watanabe.pdf'}, page_content='Am. J. Physiol. 250, G85~91. \nWatanabe S. and Dawes C. (1988) The effects of different \nfoods and concentrations of citric acid on the flow rate of \nwhole saliva in man. Archs oral Biol. 33, 1-5. \nWatanabe S. and Dawes C. (1990) Salivary flow rates and \nsalivary film thickness in five-year-old children. J. dent. \nRes. 69, 1150-1153. \nWatanabe S. (1992) Salivary clearance from different re- \ngions of the mouth in children. Caries Res. 26, 423-427.'),
 Document(metadata={'page': 1, 'source': 'data/1995-watanabe.pdf'}, page_content='grants-in-aid from the Ministry of Education, Science, and \nCulture of Japan, Grants 02807189, 05671719 (to S.W.), and \nby a Grant from the Research Foundation of the Health \nSciences University of Hokkaido (to S.W.). \nREFERENCES \nBecks H. and Wainwright W. W. (1943) Human saliva XIII. \nRate of flow of resting saliva of healthy individuals. \nJ. dent. Res. 22, 391-3

In [None]:
# Prompt template: create a system prompt for LLM
# placeholder for the context: chunks of information from the retriever
# question: the aacutal question
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

## 3 - Generate response in the simple way:

In [62]:
# Concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, 
                                question="What is the title of the paper?")
print(prompt) #the prompt object will have the entire prompt, with the question, the query, all chunks of information

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

Copyright#2005 John Wiley & Sons, Ltd. Appl. Cognit. Psychol. 20: 139–156 (2006)

---

language?’ 86.4% of the sample admitted to having done so. Nearly two-thirds answered yes to the question, ‘When you write an essay, do you turn to the thesaurusto choose words that are more complex to give the impression that the content is morevalid or intelligent?’Copyright#2005 John Wiley & Sons, Ltd.*Correspondence to: D. M. Oppenheimer, Department of Psychology, Princeton University, Green Hall Room2-S-8, Princeton, NJ 08540, USA. E-mail: doppenhe@princeton.edu

---

APPLIED COGNITIVE PSYCHOLOGYAppl. Cognit. Psychol.20: 139–156 (2006)Published online 31 October 2005 in Wiley InterScience(www.interscience.wiley.com) DOI: 10.1002/acp.1178Consequences of Erudite Vernacular Utilized Irrespecti

In [65]:
llm.invoke(prompt)

AIMessage(content='The title of the paper is "Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 773, 'total_tokens': 805}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_483d39d857', 'finish_reason': 'stop', 'logprobs': None}, id='run-e2017e06-5717-4a90-a7dc-47634ed2728f-0', usage_metadata={'input_tokens': 773, 'output_tokens': 32, 'total_tokens': 805})

## 3-2. Generate responses in the Langchain way: Langchain Expression Language
* Chain all steps and functions together

### Using Langchain Expression Language

In [66]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
rag_chain.invoke("What's the title of this paper?")

AIMessage(content='The title of the paper is "Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 767, 'total_tokens': 799}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_483d39d857', 'finish_reason': 'stop', 'logprobs': None}, id='run-b478d845-c13f-4ee4-868d-a199f8534a9f-0', usage_metadata={'input_tokens': 767, 'output_tokens': 32, 'total_tokens': 799})

### Generate structured responses

In [73]:
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")
    
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    paper_title: AnswerWithSources
    paper_summary: AnswerWithSources
    publication_year: AnswerWithSources
    paper_authors: AnswerWithSources

In [74]:
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )

rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")

ExtractedInfo(paper_title=AnswerWithSources(answer='Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly', sources='Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly', reasoning='The title is explicitly mentioned in the context provided.'), paper_summary=AnswerWithSources(answer='Most texts on writing style encourage authors to avoid overly-complex words. However, a majority of undergraduates admit to deliberately increasing the complexity of their vocabulary so as to give the impression of intelligence. This paper explores the extent to which this strategy is effective. Experiments 1–3 manipulate complexity of texts and find a negative relationship between complexity and judged intelligence.', sources='Most texts on writing style encourage authors to avoid overly-complex words. However, a majority of undergraduates admit to deliberately increasing the complexity o

### Transform response into a dataframe

In [75]:
structured_response = rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")
df = pd.DataFrame([structured_response.dict()])

# Transforming into a table with two rows: 'answer' and 'source'
answer_row = []
source_row = []
reasoning_row = []

for col in df.columns:
    answer_row.append(df[col][0]['answer'])
    source_row.append(df[col][0]['sources'])
    reasoning_row.append(df[col][0]['reasoning'])

# Create new dataframe with two rows: 'answer' and 'source'
structured_response_df = pd.DataFrame([answer_row, source_row, reasoning_row], columns=df.columns, index=['answer', 'source', 'reasoning'])
structured_response_df

Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors
answer,Consequences of Erudite Vernacular Utilized Ir...,The paper explores the negative relationship b...,2006,Daniel M. Oppenheimer
source,"Copyright#2005 John Wiley & Sons, Ltd. Appl. C...",Most texts on writing style encourage authors ...,Appl. Cognit. Psychol. 20: 139–156 (2006),"Correspondence to: D. M. Oppenheimer, Departme..."
reasoning,The title is explicitly mentioned at the begin...,The summary is derived from the overall conten...,The publication year is indicated in the citat...,The author’s name is provided in the correspon...
