# Doing RAG on PDFs using File Search in the Responses API

https://cookbook.openai.com/examples/file_search_responses

## Set up

In [1]:
!pip install PyPDF2 pandas tqdm openai -q

In [4]:
import base64
import concurrent
import getpass
import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import PyPDF2
from IPython.display import Markdown, display
from openai import OpenAI
from tqdm import tqdm


In [5]:
def _set_env(var: str):
    """Make sure the environment variable ``var`` holds a non-empty value.

    When ``var`` is absent from ``os.environ`` (or set to an empty string), the
    value is requested through a hidden ``getpass`` prompt labelled ``"<var>: "``
    and stored in ``os.environ``. An existing non-empty value is left untouched.
    """
    if os.environ.get(var):
        # Already configured: do not prompt again.
        return
    os.environ[var] = getpass.getpass(f"{var}: ")

In [6]:
# Prompt for the OpenAI API key only if it is not already set in the environment.
_set_env("OPENAI_API_KEY")

In [7]:
# Build the API client from the key stored in the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Bare expression: displays the client's repr as the cell output.
client

<openai.OpenAI at 0x10dc51b90>

In [9]:
dir_pdfs = 'openai_blog_pdfs' # have those PDFs stored locally here
# Keep only PDF files: os.listdir returns every directory entry (hidden/system
# files such as .DS_Store, sub-directories, ...), which would otherwise be
# handed to the PDF reader and the uploader further down.
pdf_files = [
    os.path.join(dir_pdfs, f)
    for f in os.listdir(dir_pdfs)
    if f.lower().endswith('.pdf')
]



In [10]:
# Inspect the list of local PDF paths.
pdf_files

['openai_blog_pdfs/Virtual_Agent_Chatbot_using_Open_Artificial_Intelligence_Final_.pdf',
 'openai_blog_pdfs/deep_research_blog.pdf',
 'openai_blog_pdfs/agentic-ai-the-new-frontier-in-genai-an-executive-playbook.pdf']

### Creating Vector Store with our PDFs

In [11]:
def upload_single_pdf(file_path: str, vector_store_id: str):
    """Upload one local file to OpenAI and attach it to a vector store.

    Args:
        file_path: Path of the local file to upload.
        vector_store_id: ID of the vector store the uploaded file is attached to.

    Returns:
        ``{"file": <basename>, "status": "success"}`` when both API calls succeed,
        otherwise ``{"file": <basename>, "status": "failed", "error": <message>}``.
        Exceptions are caught, printed and reported in the returned dict instead
        of being raised, so one bad file does not abort a batch of uploads.
    """
    file_name = os.path.basename(file_path)
    try:
        # The context manager guarantees the handle is closed, including when
        # the upload raises.
        with open(file_path, 'rb') as pdf_file:
            file_response = client.files.create(file=pdf_file, purpose="assistants")
        client.vector_stores.files.create(
            vector_store_id=vector_store_id,
            file_id=file_response.id
        )
        return {"file": file_name, "status": "success"}
    except Exception as e:
        print(f"Error with {file_name}: {str(e)}")
        return {"file": file_name, "status": "failed", "error": str(e)}

def upload_pdf_files_to_vector_store(vector_store_id: str):
    """Upload every PDF found in ``dir_pdfs`` to a vector store, in parallel.

    Args:
        vector_store_id: ID of the vector store that receives the files.

    Returns:
        A dict with ``total_files``, ``successful_uploads``, ``failed_uploads``
        and ``errors`` (the result dicts of the uploads that failed).
    """
    # Only entries ending in ".pdf" are uploaded; os.listdir also returns
    # hidden files and sub-directories that must not be sent to the API.
    pdf_files = [
        os.path.join(dir_pdfs, f)
        for f in os.listdir(dir_pdfs)
        if f.lower().endswith(".pdf")
    ]
    stats = {"total_files": len(pdf_files), "successful_uploads": 0, "failed_uploads": 0, "errors": []}

    print(f"{len(pdf_files)} PDF files to process. Uploading in parallel...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(upload_single_pdf, file_path, vector_store_id) for file_path in pdf_files]
        # as_completed yields each future as soon as it finishes, so the
        # progress bar advances per completed upload rather than in submit order.
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(pdf_files)):
            # upload_single_pdf reports failures in its return value instead of raising.
            result = future.result()
            if result["status"] == "success":
                stats["successful_uploads"] += 1
            else:
                stats["failed_uploads"] += 1
                stats["errors"].append(result)

    return stats

def create_vector_store(store_name: str) -> dict:
    """Create a vector store called ``store_name`` and summarise it.

    Returns:
        A dict with the keys ``id``, ``name``, ``created_at`` and ``file_count``
        (the store's completed-file count) on success. If anything raises, the
        error is printed and an empty dict is returned instead.
    """
    try:
        store = client.vector_stores.create(name=store_name)
        details = dict(
            id=store.id,
            name=store.name,
            created_at=store.created_at,
            file_count=store.file_counts.completed,
        )
        print("Vector store created:", details)
        return details
    except Exception as e:
        print(f"Error creating vector store: {e}")
        return {}

In [12]:
# Create a new vector store, then upload the local PDFs into it.
store_name = "openai_blog_store"
vector_store_details = create_vector_store(store_name)
# NOTE: create_vector_store returns {} on failure, in which case this ["id"] lookup raises KeyError.
upload_pdf_files_to_vector_store(vector_store_details["id"])

Vector store created: {'id': 'vs_67d1e4c26488819181015ff2bd06b2d4', 'name': 'openai_blog_store', 'created_at': 1741808834, 'file_count': 0}
3 PDF files to process. Uploading in parallel...


100%|██████████| 3/3 [00:13<00:00,  4.46s/it]


{'total_files': 3, 'successful_uploads': 3, 'failed_uploads': 0, 'errors': []}

### Standalone vector search

[Vector Search API](https://platform.openai.com/docs/api-reference/vector-stores/search)

In [13]:
# Search the vector store created above directly with a natural-language query.
query = "What's Deep Research?"
search_results = client.vector_stores.search(
    vector_store_id=vector_store_details['id'],
    query=query
)

In [14]:
# One line per hit: size of its first content chunk, the source file and the score.
for hit in search_results.data:
    chunk_length = len(hit.content[0].text)
    print(f"{chunk_length} of character of content from {hit.filename} with a relevant score of {hit.score}")

3487 of character of content from deep_research_blog.pdf with a relevant score of 0.980603075780475
3378 of character of content from deep_research_blog.pdf with a relevant score of 0.9208701391312681
3639 of character of content from deep_research_blog.pdf with a relevant score of 0.8995781351795552
3034 of character of content from deep_research_blog.pdf with a relevant score of 0.8966293288642615
3187 of character of content from deep_research_blog.pdf with a relevant score of 0.8354302461389134
3300 of character of content from deep_research_blog.pdf with a relevant score of 0.7951640428186445
3228 of character of content from deep_research_blog.pdf with a relevant score of 0.758416484494299
2706 of character of content from deep_research_blog.pdf with a relevant score of 0.7188645402872771
1960 of character of content from deep_research_blog.pdf with a relevant score of 0.6969411311345265
3147 of character of content from deep_research_blog.pdf with a relevant score of 0.691387735

### Integrating search results with LLM in a single API call

In [15]:
query = "What's Deep Research?"
response = client.responses.create(
    input= query,
    model="gpt-4o-mini",
    tools=[{
        "type": "file_search",
        "vector_store_ids": [vector_store_details['id']],
    }]
)

# Locate the assistant message by its type rather than assuming it sits at
# output[1]: the file_search call only precedes it when the model used the tool.
message = next(item for item in response.output if item.type == "message")
text_part = message.content[0]

# Extract annotations (file citations) from the response
annotations = text_part.annotations

# Names of the files cited in the answer (de-duplicated)
retrieved_files = {annotation.filename for annotation in annotations}

print(f'Files used: {retrieved_files}')
print('Response:')
print(text_part.text)

Files used: {'deep_research_blog.pdf'}
Response:
Deep Research is an advanced capability introduced in ChatGPT by OpenAI. It functions as an agent that can autonomously conduct extensive multi-step research tasks on the internet. Here’s a summary of its features:

- **Complex Research Execution**: Deep Research can synthesize information from hundreds of online sources, completing tasks in a fraction of the time it would take a human.

- **Target Audience**: It's designed for professionals in fields like finance, science, and engineering, as well as consumers needing thorough, precise information for informed decisions.

- **Functionality**: Users can request specific analyses or reports, and Deep Research outputs a comprehensive report complete with citations and summaries, making it easy to verify the information.

- **Learning Model**: It utilizes a version of OpenAI’s upcoming model optimized for data analysis and web browsing, capable of reasoning and adapting as it processes new 

In [16]:
# Render the model's answer as formatted Markdown instead of plain text.
display(Markdown(response.output_text))

Deep Research is an advanced capability introduced in ChatGPT by OpenAI. It functions as an agent that can autonomously conduct extensive multi-step research tasks on the internet. Here’s a summary of its features:

- **Complex Research Execution**: Deep Research can synthesize information from hundreds of online sources, completing tasks in a fraction of the time it would take a human.

- **Target Audience**: It's designed for professionals in fields like finance, science, and engineering, as well as consumers needing thorough, precise information for informed decisions.

- **Functionality**: Users can request specific analyses or reports, and Deep Research outputs a comprehensive report complete with citations and summaries, making it easy to verify the information.

- **Learning Model**: It utilizes a version of OpenAI’s upcoming model optimized for data analysis and web browsing, capable of reasoning and adapting as it processes new information.

- **Limitations**: While it's designed to be efficient, it may still make mistakes, such as hallucinatory facts, and struggles with confidence calibration in its outputs.

Overall, Deep Research represents a significant step towards achieving more autonomous capabilities in AI, enhancing productivity and decision-making through advanced data synthesis.

In [17]:
query = "What's Agentic AI?"
response = client.responses.create(
    input= query,
    model="gpt-4o-mini",
    tools=[{
        "type": "file_search",
        "vector_store_ids": [vector_store_details['id']],
    }]
)

# Locate the assistant message by its type rather than assuming it sits at
# output[1]: the file_search call only precedes it when the model used the tool.
message = next(item for item in response.output if item.type == "message")

# Extract annotations (file citations) from the response
annotations = message.content[0].annotations

# Names of the files cited in the answer (de-duplicated)
retrieved_files = {annotation.filename for annotation in annotations}

print(f'Files used: {retrieved_files}')
print('Response:')
display(Markdown(response.output_text))

Files used: {'agentic-ai-the-new-frontier-in-genai-an-executive-playbook.pdf'}
Response:


Agentic AI generally refers to advanced AI systems that possess the ability to make autonomous decisions and take actions to achieve specific goals with limited or no direct human intervention. Key characteristics include:

1. **Autonomy**: These systems can operate independently, making decisions based on their programming and environmental inputs.
2. **Goal-oriented Behavior**: They are designed to pursue specific objectives and optimize actions for desired outcomes.
3. **Environmental Interaction**: Agentic AI interacts with its surroundings, perceiving changes and adapting strategies.
4. **Learning Capability**: Many systems use machine learning techniques to improve performance over time.
5. **Workflow Optimization**: They enhance business processes by integrating language understanding with reasoning and decision-making.
6. **Multi-agent Communication**: Agentic AI facilitates communication among different agents to build complex workflows.

This technology is transforming various sectors such as healthcare, finance, and retail by automating routine tasks, enhancing decision-making, and improving customer experiences.

In [18]:
query = "How can businesses use agentic AI solutions for business operations?"
response = client.responses.create(
    input= query,
    model="gpt-4o-mini",
    tools=[{
        "type": "file_search",
        "vector_store_ids": [vector_store_details['id']],
    }]
)

# Locate the assistant message by its type rather than assuming it sits at
# output[1]: the file_search call only precedes it when the model used the tool.
message = next(item for item in response.output if item.type == "message")

# Extract annotations (file citations) from the response
annotations = message.content[0].annotations

# Names of the files cited in the answer (de-duplicated)
retrieved_files = {annotation.filename for annotation in annotations}

print(f'Files used: {retrieved_files}')
print('Response:')
display(Markdown(response.output_text))

Files used: {'agentic-ai-the-new-frontier-in-genai-an-executive-playbook.pdf'}
Response:


Businesses can leverage agentic AI solutions in various ways to enhance their operations and overall performance:

1. **Process Automation**: Agentic AI can automate routine tasks, allowing organizations to reduce operational costs and boost efficiency. For instance, in customer service, AI can manage common inquiries, enabling human agents to focus on more complex issues.

2. **Data Analysis and Insights**: These systems can analyze vast amounts of data quickly and provide actionable insights. Companies can use these insights for better decision-making, market trend prediction, and optimizing revenue and operations.

3. **Enhanced Customer Experience**: AI-powered chatbots and virtual assistants can offer personalized and instant support, improving customer engagement and satisfaction. For example, e-commerce platforms often utilize AI to recommend products based on user behavior, which can lead to increased sales.

4. **Service-as-a-Software Model**: This innovative model allows businesses to outsource specific tasks to AI agents, paying only for outcomes rather than software licenses or subscriptions. This approach can significantly reduce costs and enhance operational scale.

5. **Agility and Responsiveness**: By automating workflows, businesses can enhance their agility, allowing them to adapt quickly to changing market demands and operational challenges.

6. **Improve Decision-Making**: With real-time data analysis, agentic AI can facilitate improved decision-making processes, affecting sectors like finance and healthcare positively by streamlining operations and enhancing accuracy.

7. **Integration with Existing Systems**: These AI systems can integrate seamlessly with current tools and workflows, optimizing resource allocation and enhancing communication and collaboration across an organization.

8. **Ethical and Responsible Use**: Businesses should ensure their AI systems are developed and implemented ethically, considering transparency and accountability in decision-making processes.

In summary, agentic AI solutions facilitate enhanced operational efficiency, better customer service, insightful data analysis, and more strategic decision-making, positioning businesses advantageously in competitive environments.

In [19]:
query = "Define the following terms: Generative AI, LLMs, and Langchain"
response = client.responses.create(
    input= query,
    model="gpt-4o-mini",
    tools=[{
        "type": "file_search",
        "vector_store_ids": [vector_store_details['id']],
    }]
)

# Locate the assistant message by its type rather than assuming it sits at
# output[1]: the file_search call only precedes it when the model used the tool.
message = next(item for item in response.output if item.type == "message")

# Extract annotations (file citations) from the response
annotations = message.content[0].annotations

# Names of the files cited in the answer (de-duplicated)
retrieved_files = {annotation.filename for annotation in annotations}

print(f'Files used: {retrieved_files}')
print('Response:')
display(Markdown(response.output_text))

Files used: {'Virtual_Agent_Chatbot_using_Open_Artificial_Intelligence_Final_.pdf'}
Response:


Here are the definitions of the requested terms:

### Generative AI
Generative AI refers to algorithms that can generate novel content, rather than just analyzing or acting on existing data. This includes applications like text generation, image synthesis, and even code generation. Generative AI is seen as capable of exhibiting a form of creativity, pushing technology into realms previously reserved for human creativity.

### LLMs (Large Language Models)
LLMs are advanced AI models designed to understand and generate human language. They capture the structure of language through vast amounts of training data, allowing them to perform various natural language processing tasks such as text generation, translation, summarization, and sentiment analysis. Their capabilities span multiple domains, including content creation and code development.

### LangChain
LangChain is an open-source Python framework created to facilitate the development of applications powered by large language models. It provides developers with reusable components that connect language models with external data sources. Its modular architecture simplifies the integration of LLMs, allowing developers to build complex workflows, manage conversational context, and incorporate memory systems for applications like chatbots.

In [20]:
query = "How to build a virtual agent(chatbot) using OpenAI?"
response = client.responses.create(
    input= query,
    model="gpt-4o-mini",
    tools=[{
        "type": "file_search",
        "vector_store_ids": [vector_store_details['id']],
    }]
)

# Locate the assistant message by its type rather than assuming it sits at
# output[1]: the file_search call only precedes it when the model used the tool.
message = next(item for item in response.output if item.type == "message")

# Extract annotations (file citations) from the response
annotations = message.content[0].annotations

# Names of the files cited in the answer (de-duplicated)
retrieved_files = {annotation.filename for annotation in annotations}

print(f'Files used: {retrieved_files}')
print('Response:')
display(Markdown(response.output_text))

Files used: {'agentic-ai-the-new-frontier-in-genai-an-executive-playbook.pdf', 'Virtual_Agent_Chatbot_using_Open_Artificial_Intelligence_Final_.pdf'}
Response:


To create a virtual agent (chatbot) using OpenAI, you can follow these key steps based on the relevant methods outlined in the document you provided:

### 1. **Select a Framework and Libraries**
   - Use **LangChain** as the main framework, which simplifies integration with various language models (LLMs) such as Google's Gemini Pro or OpenAI's models. It provides a modular architecture for building LLM applications.
   - For the front-end, utilize **Streamlit**, a Python library that allows for easy creation of web applications.

### 2. **Set Up Your Environment**
   - Install the necessary packages:
     ```bash
     pip install streamlit langchain-google-genai
     ```

### 3. **Code Structure**
   Create a Python script that initializes the chatbot. Here is a simplified pseudo-code:

   ```python
   import streamlit as st
   from langchain_google_genai import ChatGoogleGenerativeAI

   # Initialize the model
   llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key="YOUR_API_KEY")

   def generate_response(user_input):
       return llm.invoke(user_input).content

   def main():
       st.title("Virtual Agent (Chatbot)")
       user_input = st.chat_input("Ask away!")
       if user_input:
           bot_response = generate_response(user_input)
           st.chat_message("assistant").markdown(bot_response)

   if __name__ == "__main__":
       main()
   ```

### 4. **Handle User Input and Responses**
   - Use Streamlit's session state to track conversation history. This way, you can maintain context across user interactions.
   - Display previous messages and respond to user inputs in real-time.

### 5. **Deploy Your Application**
   - Streamlit allows you to deploy your app easily. You might need to deploy it using free hosting, such as Streamlit Sharing, after signing up.

### 6. **Test and Iterate**
   - Ensure that your chatbot can handle various queries and provide meaningful answers. Iterate on your design based on user feedback and performance insights.

### 7. **Future Enhancements**
   - Consider adding multimodal capabilities to handle text, images, and audio interactions in future iterations.
   - Look into making it a decentralized application if you are concerned about privacy and data security.

### Conclusion
By leveraging frameworks like LangChain and Streamlit, you can effectively build and deploy a functional virtual agent. The use of external APIs such as Google's Gemini enhances the chatbot's capabilities.

### Evaluating performance
#### Generating questions

We will create functions that read through the PDFs we have locally and generate, for each one, a question that can only be answered from that document. This gives us an evaluation dataset that we can use afterwards.

In [25]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages whose extraction yields nothing are skipped, and page texts are joined
    without any separator. If opening or parsing fails, the error is printed and
    whatever was collected before the failure (possibly ``""``) is returned.
    """
    page_texts = []
    try:
        with open(pdf_path, "rb") as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                extracted = page.extract_text()
                if extracted:
                    page_texts.append(extracted)
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)

def generate_questions(pdf_path):
    """Ask gpt-4o for a question answerable only from the PDF at ``pdf_path``.

    The text extracted from the PDF is appended to a fixed instruction and sent
    as a single prompt; the text of the first content part of the first output
    item is returned.
    """
    document_text = extract_text_from_pdf(pdf_path)
    prompt = f"Can you generate a question that can only be answered from this document?:\n{document_text}\n\n"

    completion = client.responses.create(input=prompt, model="gpt-4o")
    return completion.output[0].content[0].text

In [26]:
# Try question generation on the first PDF as a quick check.
generate_questions(pdf_files[0])

'What was the main obstacle faced by Muthana Alsaadi, Ammar Abdulzahra Alabbassi, and their team in developing their virtual agent (chatbot) project, and how did they overcome it?'

We can now generate all the questions for all the PDFs we've got stored locally.

In [27]:
# Map each PDF's base filename to the question generated from its content
questions_dict = {
    os.path.basename(pdf_path): generate_questions(pdf_path)
    for pdf_path in pdf_files
}

In [28]:
# Show the generated question for each file.
questions_dict

{'Virtual_Agent_Chatbot_using_Open_Artificial_Intelligence_Final_.pdf': 'What major obstacle did the authors face when trying to use Microsoft Azure services for their chatbot project in Iraq?',
 'deep_research_blog.pdf': 'What is the name of the newly launched agentic capability in ChatGPT mentioned in this document?',
 'agentic-ai-the-new-frontier-in-genai-an-executive-playbook.pdf': 'What are the phases of evolution for agentic frameworks as outlined in the document?'}

We'll convert our dictionary into a list of rows (one per question) and run each question through file search with gpt-4o-mini, checking whether the expected file appears among the files retrieved for the answer.

In [29]:
# Build one evaluation row per generated question. "_id" is the source filename
# without its ".pdf" extension; process_query appends the extension back.
rows = [
    {
        "query": question,
        # Strip only a trailing ".pdf": str.replace would also delete ".pdf"
        # occurring earlier in the name, so the rebuilt filename would not match.
        "_id": filename[:-len(".pdf")] if filename.endswith(".pdf") else filename,
    }
    for filename, question in questions_dict.items()
]

# Metrics evaluation parameters
k = 5
total_queries = len(rows)
correct_retrievals_at_k = 0
reciprocal_ranks = []
average_precisions = []

def _find_message_item(response):
    """Return the assistant message item of ``response.output``, or ``None`` if absent."""
    for item in response.output:
        if getattr(item, "type", None) == "message":
            return item
    # No item is typed as a message: fall back to the positional convention
    # (file_search call first, message second) when that position exists.
    if len(response.output) > 1:
        return response.output[1]
    return None


def process_query(row):
    """Run one evaluation query through file search and score the cited files.

    Args:
        row: Dict with ``query`` (the question) and ``_id`` (the expected
            filename without its ``.pdf`` extension).

    Returns:
        A tuple ``(correct, rr, avg_precision)``: whether the expected file is
        among the first ``k`` annotations, its reciprocal rank (0 when absent)
        and the average precision over those annotations. ``(False, 0, 0)`` is
        returned when the response carries no annotations at all.
    """
    query = row['query']
    expected_filename = row['_id'] + '.pdf'
    # Call file_search via Responses API
    response = client.responses.create(
        input=query,
        model="gpt-4o-mini",
        tools=[{
            "type": "file_search",
            "vector_store_ids": [vector_store_details['id']],
            "max_num_results": k,
        }],
        tool_choice="required" # it will force the file_search, while not necessary, it's better to enforce it as this is what we're testing
    )
    # Extract annotations from the response. The message is looked up instead of
    # indexing output[1] directly, which raised IndexError on shorter outputs.
    message = _find_message_item(response)
    annotations = None
    if message is not None:
        if hasattr(message, 'content') and message.content:
            annotations = message.content[0].annotations
        elif hasattr(message, 'annotations'):
            annotations = message.annotations

    if annotations is None:
        print(f"No annotations for query: {query}")
        return False, 0, 0

    # Filenames of the first k annotations, in the order they appear
    retrieved_files = [result.filename for result in annotations[:k]]
    correct = expected_filename in retrieved_files
    # Reciprocal rank of the first occurrence of the expected file (0 if missing)
    rr = 1 / (retrieved_files.index(expected_filename) + 1) if correct else 0

    # Calculate Average Precision: mean of precision@i over every position i
    # that holds the expected file.
    precisions = []
    num_relevant = 0
    for i, fname in enumerate(retrieved_files):
        if fname == expected_filename:
            num_relevant += 1
            precisions.append(num_relevant / (i + 1))
    avg_precision = sum(precisions) / len(precisions) if precisions else 0

    if not correct:
        print("Expected file NOT found in the retrieved files!")

    # Log the details whenever the expected file is not ranked first.
    if retrieved_files and retrieved_files[0] != expected_filename:
        print(f"Query: {query}")
        print(f"Expected file: {expected_filename}")
        print(f"First retrieved file: {retrieved_files[0]}")
        print(f"Retrieved files: {retrieved_files}")
        print("-" * 50)

    return correct, rr, avg_precision

In [30]:
# Sanity-check the evaluation on a single row; returns (correct, reciprocal rank, average precision).
process_query(rows[0])

(True, 1.0, 1.0)

Recall and precision are both 1 for this example, and the expected file ranked first, so MRR and MAP are also 1.

We can now execute this processing on our set of questions.

In [31]:
# Evaluate every query concurrently; executor.map yields results in the order of `rows`.
with ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(process_query, rows), total=total_queries))

# Split the per-query (correct, reciprocal rank, average precision) tuples.
correct_retrievals_at_k = sum(1 for correct, _, _ in results if correct)
reciprocal_ranks = [rr for _, rr, _ in results]
average_precisions = [avg_precision for _, _, avg_precision in results]

# Aggregate over all queries.
recall_at_k = correct_retrievals_at_k / total_queries
precision_at_k = recall_at_k  # In this context, same as recall
mrr = sum(reciprocal_ranks) / total_queries
map_score = sum(average_precisions) / total_queries

100%|██████████| 3/3 [00:07<00:00,  2.51s/it]


In [32]:
# Print the metrics with k, one per line, formatted to four decimals
metric_lines = [
    f"Metrics at k={k}:",
    f"Recall@{k}: {recall_at_k:.4f}",
    f"Precision@{k}: {precision_at_k:.4f}",
    f"Mean Reciprocal Rank (MRR): {mrr:.4f}",
    f"Mean Average Precision (MAP): {map_score:.4f}",
]
print("\n".join(metric_lines))

Metrics at k=5:
Recall@5: 1.0000
Precision@5: 1.0000
Mean Reciprocal Rank (MRR): 1.0000
Mean Average Precision (MAP): 1.0000
