# PDF Analysis and Querying using LLMSherpa and OpenAI
This notebook demonstrates the process of analyzing a PDF document using LLMSherpa and LlamaIndex, integrated with OpenAI's API for natural language processing.

In [None]:
!pip install llmsherpa
!pip install llama-index

In [2]:
import os
from getpass import getpass

import llmsherpa
import openai
from IPython.core.display import display, HTML
from llama_index import VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.readers.schema.base import Document
from llmsherpa.readers import LayoutPDFReader

In [3]:
# Document to analyze: a remote URL here, but a local file path can be used too.
pdf_url = "https://omscs.gatech.edu/sites/default/files/documents/Other_docs/spring_2023_orientation_document.pdf"
# LLMSherpa parsing endpoint handed to LayoutPDFReader.
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"

# Read the PDF through the reader; `doc` is the object the later cells work on.
pdf_reader = LayoutPDFReader(llmsherpa_api_url)
doc = pdf_reader.read_pdf(pdf_url)

In [4]:
# Resolve the OpenAI API key without writing it into the notebook: prefer the
# OPENAI_API_KEY environment variable and fall back to a hidden interactive
# prompt, so the secret never ends up in a saved cell or in version control.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Fail here with a clear message rather than later inside the first API call.
if not openai.api_key:
    raise ValueError("An OpenAI API key is required: set OPENAI_API_KEY or enter it at the prompt.")

## Method 1: Manually Select a Specific Section and Feed It to ChatGPT

In [None]:
# Pick the target section by title. The title is expected to match more than
# once, and the second match is the one kept (presumably the first match is the
# table-of-contents entry -- verify against the document).
SECTION_TITLE = 'SECTION B. FOUNDATIONAL COURSE REQUIREMENT'
SECTION_OCCURRENCE = 1  # zero-based index among the matches: 1 = second match

matching_sections = [
    section for section in doc.sections() if section.title == SECTION_TITLE
]

# Without this check a missing section would leave nothing selected and the
# cell would fail with an unhelpful AttributeError on `None`.
if len(matching_sections) <= SECTION_OCCURRENCE:
    raise ValueError(
        f"Expected at least {SECTION_OCCURRENCE + 1} sections titled "
        f"{SECTION_TITLE!r}, found {len(matching_sections)}"
    )
selected_section = matching_sections[SECTION_OCCURRENCE]

# Render the section, including its nested children, as the cell output.
HTML(selected_section.to_html(include_children = True, recurse = True))

In [None]:
# Ask the LLM a question about the selected section, passing the section's
# HTML (children included) as the context inside the prompt.
question = "list all the tasks discussed and one line about each task"
context = selected_section.to_html(include_children=True, recurse=True)
prompt = f"read this text and answer question: {question}:{context}"

llm = OpenAI()
resp = llm.complete(prompt)
print(resp.text)

## Method 2: Vector Search and RAG with Smart Chunking

In [None]:
# Wrap the context text of every LLMSherpa chunk in a llama-index Document.
chunk_documents = [
    Document(text=chunk.to_context_text(), extra_info={})
    for chunk in doc.chunks()
]

# Build the index from all documents in one call instead of creating an empty
# index and growing it with one `insert` per chunk.
index = VectorStoreIndex.from_documents(chunk_documents)
query_engine = index.as_query_engine()

# Retrieval-augmented query over the indexed chunks.
response = query_engine.query("what are some key points in foundational course requirement")
print(response)

In [None]:
# Second query against the same query engine, this time asking about a table.
systems_question = "what are the systems in the table for SECTION I. SYSTEMS YOU WILL BE USING AND WHY"
response = query_engine.query(systems_question)
print(response)

## Extra: Parse through tables

In [None]:
# Table parsing method (not always perfect)
# Position of the target table within doc.tables(); specific to this PDF.
TABLE_INDEX = 11

tables = doc.tables()
if TABLE_INDEX >= len(tables):
    raise IndexError(
        f"Table index {TABLE_INDEX} is out of range: the document has {len(tables)} tables"
    )

# Convert the table to HTML once and reuse it for both display and the prompt.
table_html = tables[TABLE_INDEX].to_html()

# A bare HTML(...) expression only renders when it is the last line of a cell,
# so the table is shown explicitly with display().
display(HTML(table_html))

context = table_html
resp = OpenAI().complete(f"read this table and answer question: what are the systems in the table for SECTION I. SYSTEMS YOU WILL BE USING AND WHY:\n{context}")
print(resp.text)