In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
load_dotenv()

True

In [5]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [6]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
llm.invoke("Tell me a joke about cats")

AIMessage(content='Why was the cat sitting on the computer?\n\nBecause it wanted to keep an eye on the mouse!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 13, 'total_tokens': 33, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_74ba47b4ac', 'finish_reason': 'stop', 'logprobs': None}, id='run-ef85c9f8-0686-4753-8ac1-85c41e4089bf-0', usage_metadata={'input_tokens': 13, 'output_tokens': 20, 'total_tokens': 33})

## Process PDFs

### Load PDF document

In [7]:
loader = PyPDFLoader("data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf")
pages = loader.load()
pages

Ignoring wrong pointing object 18 0 (offset 0)


[Document(metadata={'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf', 'page': 0}, page_content='APPLIED COGNITIVE PSYCHOLOGYAppl. Cognit. Psychol.20: 139–156 (2006)Published online 31 October 2005 in Wiley InterScience(www.interscience.wiley.com) DOI: 10.1002/acp.1178Consequences of Erudite Vernacular Utilized Irrespectiveof Necessity: Problems with Using Long Words NeedlesslyDANIEL M. OPPENHEIMER*Princeton University, USASUMMARYMost texts on writing style encourage authors to avoid overly-complex words. However, a majorityof undergraduates admit to deliberately increasing the complexity of their vocabulary so as to givethe impression of intelligence. This paper explores the extent to which this strategy is effective.Experiments 1–3 manipulate complexity of texts and ﬁnd a negative relationship between complex-ity and judged intelligence. This relationship held regardless of the quality of the original essay, andirrespective of the participants’ prior expectations of 

### Split document

In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

### Create embeddings

In [11]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()
test_vector = embedding_function.embed_query("cat")


In [13]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator="embedding_distance", 
                            embeddings=embedding_function)

evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

{'score': 0.17454945643404907}

## Create vector database

In [18]:
import uuid 

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]
    
    unique_ids = set()
    unique_chunks = []
    
    for chunk, id in zip(chunks, ids):
        if id not in unique_ids:
            unique_ids.add(id)
            unique_chunks.append(chunk)
    
    vectorstore = Chroma.from_documents(documents=unique_chunks, 
                                        ids=list(unique_ids),
                                        embedding=embedding_function, 
                                        persist_directory=vectorstore_path)
    vectorstore.persist()
    return vectorstore

In [19]:
vectorstore = create_vectorstore(chunks, embedding_function, "vectorstore_chroma")

## Query relevant data

In [20]:
vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

  vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)


In [21]:
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("What is the title of the article")
relevant_chunks

[Document(metadata={'page': 1, 'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf'}, page_content='Copyright#2005 John Wiley & Sons, Ltd. Appl. Cognit. Psychol. 20: 139–156 (2006)'),
 Document(metadata={'page': 0, 'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf'}, page_content='language?’ 86.4% of the sample admitted to having done so. Nearly two-thirds answered yes to the question, ‘When you write an essay, do you turn to the thesaurusto choose words that are more complex to give the impression that the content is morevalid or intelligent?’Copyright#2005 John Wiley & Sons, Ltd.*Correspondence to: D. M. Oppenheimer, Department of Psychology, Princeton University, Green Hall Room2-S-8, Princeton, NJ 08540, USA. E-mail: doppenhe@princeton.edu'),
 Document(metadata={'page': 1, 'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf'}, page_content='surprising readers with the relative disﬂuency of the text.Both the experts and prevailing wisdom p

In [22]:
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

## Generate response

In [23]:
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, 
                                question="What is the title of the paper?")
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

Copyright#2005 John Wiley & Sons, Ltd. Appl. Cognit. Psychol. 20: 139–156 (2006)

---

language?’ 86.4% of the sample admitted to having done so. Nearly two-thirds answered yes to the question, ‘When you write an essay, do you turn to the thesaurusto choose words that are more complex to give the impression that the content is morevalid or intelligent?’Copyright#2005 John Wiley & Sons, Ltd.*Correspondence to: D. M. Oppenheimer, Department of Psychology, Princeton University, Green Hall Room2-S-8, Princeton, NJ 08540, USA. E-mail: doppenhe@princeton.edu

---

surprising readers with the relative disﬂuency of the text.Both the experts and prevailing wisdom present plausible views, but which (if either) iscorrect? The present paper provides an empirical investigation of the strategy 

In [24]:
llm.invoke(prompt)

AIMessage(content='The title of the paper is "Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 773, 'total_tokens': 805, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_f85bea6784', 'finish_reason': 'stop', 'logprobs': None}, id='run-a17543d2-e8c0-4ecf-b109-7abf385ff55d-0', usage_metadata={'input_tokens': 773, 'output_tokens': 32, 'total_tokens': 805})

## Using Langchain Expression language

In [25]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
rag_chain.invoke("What's the title of this paper?")

AIMessage(content='The title of the paper is "Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 767, 'total_tokens': 799, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_f85bea6784', 'finish_reason': 'stop', 'logprobs': None}, id='run-4591d8b8-f3a7-4cdc-b772-6d8d56d87a6c-0', usage_metadata={'input_tokens': 767, 'output_tokens': 32, 'total_tokens': 799})

## Generate structured response

In [27]:
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")
    
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    paper_title: AnswerWithSources
    paper_summary: AnswerWithSources
    publication_year: AnswerWithSources
    paper_authors: AnswerWithSources

In [28]:
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )

rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")

ExtractedInfo(paper_title=AnswerWithSources(answer='Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly', sources='SUMMARYMost texts on writing style encourage authors to avoid overly-complex words...', reasoning='The title is explicitly mentioned in the retrieved context.'), paper_summary=AnswerWithSources(answer='Most texts on writing style encourage authors to avoid overly-complex words. However, a majority of undergraduates admit to deliberately increasing the complexity of their vocabulary so as to give the impression of intelligence. This paper explores the extent to which this strategy is effective.', sources='SUMMARYMost texts on writing style encourage authors to avoid overly-complex words. However, a majority...', reasoning='The summary is derived from the first few sentences of the retrieved context, which outline the main focus of the paper.'), publication_year=AnswerWithSources(answer='2006', sources='Appl. Cogni

## Transform response into a dataframe

In [29]:
structured_response = rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")
df = pd.DataFrame([structured_response.dict()])

# Transforming into a table with two rows: 'answer' and 'source'
answer_row = []
source_row = []
reasoning_row = []

for col in df.columns:
    answer_row.append(df[col][0]['answer'])
    source_row.append(df[col][0]['sources'])
    reasoning_row.append(df[col][0]['reasoning'])

# Create new dataframe with two rows: 'answer' and 'source'
structured_response_df = pd.DataFrame([answer_row, source_row, reasoning_row], columns=df.columns, index=['answer', 'source', 'reasoning'])
structured_response_df

Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors
answer,Consequences of Erudite Vernacular Utilized Ir...,Most texts on writing style encourage authors ...,2005,Daniel M. Oppenheimer
source,Consequences of Erudite Vernacular Utilized Ir...,SUMMARYMost texts on writing style encourage a...,Published online 31 October 2005 in Wiley Inte...,"DANIEL M. OPPENHEIMER*Princeton University, USA"
reasoning,The title is explicitly mentioned in the conte...,The summary captures the main findings and pur...,The publication date is explicitly mentioned a...,The author's name is directly stated in the co...
