## Form Parser 

In [1]:
import os, json
from llama_parse import LlamaParse
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage
)

from llama_index.core.workflow import (
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Event,
    Context
)
from helper import get_openai_api_key, get_llama_cloud_api_key
from IPython.display import display, HTML
from helper import extract_html_content
from llama_index.utils.workflow import draw_all_possible_flows

In [2]:
# Patch asyncio so that event loops can be nested. The notebook kernel already
# runs a loop; presumably the synchronous helpers used below (e.g.
# parser.load_data) need to start their own inside it - TODO confirm.
import nest_asyncio
nest_asyncio.apply()

In [3]:
# Load API credentials through the project's helper module instead of hard-coding them.
# NOTE(review): openai_api_key is not referenced again in the visible cells;
# presumably the helper also exposes it to the OpenAI client some other way - confirm.
llama_cloud_api_key = get_llama_cloud_api_key()
openai_api_key = get_openai_api_key()

In [4]:
# Set up the parser used to read the blank form.
# The two instruction strings ask the parser to return only a bulleted list
# of the fields that have to be filled in, formatted as markdown.
parser = LlamaParse(
    api_key=llama_cloud_api_key,    
    result_type="markdown",
    content_guideline_instruction="This is a form. Create a list of all the fields that need to be filled in.",
    formatting_instruction="Return a bulleted list of the fields ONLY."
)

In [5]:
# Parse the input form to get the list of fields to be populated.
# load_data returns a list; only its first element is kept here.
result = parser.load_data("annual_report_form.pdf")[0]

Started parsing the file under job_id b5e25f59-9da3-44dc-85d0-60c1db2e1c8a


In [6]:
# Show the parsed form text (expected to be the bulleted list of field names).
print(result.text)

- Organization Name
- Corporate Address
- Vision
- Total Revenue
- Total Expenses
- Total Profit
- Net Profit
- Market Capital
- Current CEO
- Current Board of Directors
- Report Summary
- Future Plans


In [8]:
# Use an LLM to convert the human-readable field list into a JSON object
# of the form {"fields": [...]}, so that it can be loaded with json.loads below.
# The doubled braces in the f-string are literal braces, not placeholders.
llm = OpenAI(model="gpt-4o-mini")
raw_json = llm.complete(
    f"""
    This is a parsed form.
    Convert it into a JSON object containing only the list 
    of fields to be filled in, in the form {{ fields: [...] }}. 
    <form>{result.text}</form>. 
    Return JSON ONLY, no markdown."""
)

In [22]:
# Inspect the raw completion text before parsing it as JSON.
print(raw_json.text)

{"fields":["Organization Name","Corporate Address","Vision","Total Revenue","Total Expenses","Total Profit","Net Profit","Market Capital","Current CEO","Current Board of Directors","Report Summary","Future Plans"]}


In [10]:
# Decode the completion as JSON and pull out the list stored under "fields".
# json.loads raises if the model returned anything other than bare JSON.
fields = json.loads(raw_json.text)["fields"]
for field in fields:
    print(field)

Organization Name
Corporate Address
Vision
Total Revenue
Total Expenses
Total Profit
Net Profit
Market Capital
Current CEO
Current Board of Directors
Report Summary
Future Plans


In [15]:
# Custom events that extend the annual-report workflow with form filling.
class ParseFormEvent(Event):
    """Emitted by the set-up step to trigger parsing of the report form."""

    # The form file to parse; forwarded unchanged from the workflow's start event.
    report_form: str

class QueryEvent(Event):
    """Carries one form field together with the question to ask about it."""

    # Natural-language question that is sent to the query engine.
    query: str
    # Name of the form field the question refers to.
    field: str

class ResponseEvent(Event):
    """Carries the answer retrieved for a single form field."""

    # Name of the form field this answer belongs to. Declared explicitly because
    # the workflow constructs this event with ``field=...`` and later reads
    # ``.field`` when assembling the final list of answers.
    field: str
    # Answer text produced by the query engine for that field.
    response: str

In [19]:
# Enhancing the Workflow 
class RAGWorkflow(Workflow):
    """Event-driven workflow that fills in a report form from an annual report.

    Steps, in the order events flow through them:

    1. ``set_up``       - validate the inputs, build or reload the vector index
       and create the query engine.
    2. ``parse_form``   - parse the form, extract its field names and emit one
       ``QueryEvent`` per field.
    3. ``ask_question`` - answer the query for a single field against the report.
    4. ``fill_in_form`` - once a response has arrived for every field, ask the
       LLM to merge them into the final result carried by the ``StopEvent``.

    ``run()`` must be called with the keyword arguments ``annual_report_file``
    and ``report_form``.

    Note: when ``storage_dir`` already exists the persisted index is reused and
    ``annual_report_file`` is not parsed again.
    """

    storage_dir = "./storage"
    # The same embedding model has to be used to build the index and to embed
    # the queries that are run against it, so it is defined in one place.
    embed_model_name = "text-embedding-3-small"
    llm: OpenAI
    # self.query_engine (the object returned by index.as_query_engine()) is
    # assigned in set_up().

    @staticmethod
    def _parse_fields(raw_text: str) -> list:
        """Extract the list of field names from the LLM's JSON reply.

        Args:
            raw_text: Completion text expected to hold ``{"fields": [...]}``.

        Returns:
            The field names as a list of strings (possibly empty).

        Raises:
            ValueError: If the text is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError``) or ``fields`` is not a list.
            KeyError: If the JSON object has no ``fields`` key.
        """
        cleaned = raw_text.strip()
        # Models sometimes wrap the JSON in a Markdown code fence even when
        # told not to; drop the fence and an optional "json" language tag.
        if cleaned.startswith("```"):
            cleaned = cleaned.strip("`").strip()
            if cleaned.lower().startswith("json"):
                cleaned = cleaned[len("json"):]
        fields = json.loads(cleaned)["fields"]
        if not isinstance(fields, list):
            raise ValueError("Expected 'fields' to be a list of field names")
        return [str(name) for name in fields]

    @step
    async def set_up(self, ctx: Context, ev: StartEvent) -> ParseFormEvent:
        """Validate the inputs and prepare the LLM, the index and the query engine.

        Raises:
            ValueError: If ``annual_report_file`` or ``report_form`` is missing or empty.
        """
        # getattr with a default, so that an argument omitted from run() is
        # reported through the ValueError below rather than an attribute error.
        annual_report_file = getattr(ev, "annual_report_file", None)
        report_form = getattr(ev, "report_form", None)

        if not annual_report_file:
            raise ValueError("No Annual Report file provided")

        if not report_form:
            raise ValueError("No Report Form provided")

        # define the LLM to work with
        self.llm = OpenAI(model="gpt-4o-mini")
        embed_model = OpenAIEmbedding(model_name=self.embed_model_name)

        # Ingest the data and set up the query engine
        if os.path.exists(self.storage_dir):
            # the annual report has already been ingested: reload the index,
            # passing the embed model so queries are embedded the same way
            # the stored vectors were
            storage_context = StorageContext.from_defaults(persist_dir=self.storage_dir)
            index = load_index_from_storage(storage_context, embed_model=embed_model)
        else:
            # parse and load the annual report (awaited so the event loop is
            # not blocked while the parse job runs)
            documents = await LlamaParse(
                api_key=llama_cloud_api_key,
                result_type="markdown",
                content_guideline_instruction="This is a corporate annual report, gather related facts together and format it as bullet points with headers"
            ).aload_data(annual_report_file)
            # embed and index the documents, then persist for later runs
            index = VectorStoreIndex.from_documents(
                documents,
                embed_model=embed_model
            )
            index.storage_context.persist(persist_dir=self.storage_dir)

        # create a query engine
        self.query_engine = index.as_query_engine(llm=self.llm, similarity_top_k=5)

        # Once the report is indexed, hand over to the step that parses
        # the form that is to be populated
        return ParseFormEvent(report_form=report_form)

    @step
    async def parse_form(self, ctx: Context, ev: ParseFormEvent) -> QueryEvent:
        """Parse the form and dispatch one ``QueryEvent`` per field.

        The events are sent with ``ctx.send_event`` and the step itself returns
        ``None``; the return annotation names the event type the step emits.

        Raises:
            ValueError: If the form yields no parsed content or no fields.
        """
        parser = LlamaParse(
            api_key=llama_cloud_api_key,
            result_type="markdown",
            content_guideline_instruction="This is a form. Create a list of all the fields that need to be filled in.",
            formatting_instruction="Return a bulleted list of the fields ONLY."
        )

        parsed_documents = await parser.aload_data(ev.report_form)
        if not parsed_documents:
            raise ValueError("The Report Form could not be parsed")
        result = parsed_documents[0]

        # get the LLM to convert the parsed form into JSON
        raw_json = await self.llm.acomplete(
            f"""
            This is a parsed form. 
            Convert it into a JSON object containing only the list 
            of fields to be filled in, in the form {{ fields: [...] }}. 
            <form>{result.text}</form>. 
            Return JSON ONLY, no markdown.
            """)
        fields = self._parse_fields(raw_json.text)
        if not fields:
            # Without any query events the workflow would only end by timing out.
            raise ValueError("No fields found in the Report Form")

        # Store the number of fields BEFORE dispatching the queries, so the
        # collecting step can never observe a response without the count.
        await ctx.set("total_fields", len(fields))

        for field in fields:
            ctx.send_event(QueryEvent(
                field=field,
                query=f"How would you answer this question from the report ? {field}"
            ))
            print("Getting answer for the ", field)

        return None

    # Get the result for the question from the Report
    @step
    async def ask_question(self, ctx: Context, ev: QueryEvent) -> ResponseEvent:
        """Answer the query for one field using the report's query engine."""
        response = await self.query_engine.aquery(
            f"This is a question about the specific report: {ev.query}"
        )
        # str() so that a missing answer still yields text; presumably a None
        # value would otherwise be rejected by ResponseEvent's `response: str`.
        return ResponseEvent(field=ev.field, response=str(response))

    # Step to fill in the Report Form
    @step
    async def fill_in_form(self, ctx: Context, ev: ResponseEvent) -> StopEvent:
        """Combine all field responses into the final, filled-in form.

        Returns ``None`` until a ``ResponseEvent`` has been collected for every
        field; after that, returns a ``StopEvent`` whose ``result`` is the
        LLM's completion.
        """
        # Get the total number of fields present in the form
        total_fields = await ctx.get("total_fields")

        responses = ctx.collect_events(ev, [ResponseEvent] * total_fields)
        if responses is None:
            return None  # not all responses have arrived yet

        # Join the responses received into one block of text
        response_list = "\n".join(
            f"Field: {r.field}\nResponse: {r.response}" for r in responses
        )

        result = await self.llm.acomplete(f"""
            You are given a list of fields in a report form and responses to
            questions about those fields from a corporate annual report. Combine the two into a list of
            fields and succinct, factual answers to fill in those fields.

            <responses>
            {response_list}
            </responses>
        """)
        return StopEvent(result=result)

In [20]:
# Execute the workflow end to end (top-level await works in a notebook cell).
# The keyword arguments are the values the set-up step reads from its start event.
# NOTE: this rebinds `result`, which earlier cells used for the parsed form document.
w = RAGWorkflow(timeout=120, verbose=False)
result = await w.run(
    annual_report_file="igl_annual_report.pdf",
    report_form="annual_report_form.pdf"
)
print(result)

Started parsing the file under job_id 364063bd-1aca-4172-b16c-fed1e930cafa
Getting answer for the  Organization Name
Getting answer for the  Corporate Address
Getting answer for the  Vision
Getting answer for the  Total Revenue
Getting answer for the  Total Expenses
Getting answer for the  Total Profit
Getting answer for the  Net Profit
Getting answer for the  Market Capital
Getting answer for the  Current CEO
Getting answer for the  Current Board of Directors
Getting answer for the  Report Summary
Getting answer for the  Future Plans
Here is the combined list of fields and succinct, factual answers based on the provided responses:

1. **Organization Name**: Indraprastha Gas Limited
2. **Corporate Address**: Not explicitly mentioned; refer to the company's official website or customer service for details.
3. **Vision**: Commitment to delivering safe, reliable, and clean energy solutions with a focus on sustainability, aiming to be a leader in the sustainable energy industry.
4. **Total

In [21]:
# Write a diagram of the workflow's possible event flows to an HTML file and
# embed it in the notebook output.
# NOTE(review): extract_html_content is a project helper; presumably it returns
# the file's HTML as a string suitable for IPython's HTML() - confirm.
WORKFLOW_FILE = "report_form_workflow.html"
draw_all_possible_flows(w, filename=WORKFLOW_FILE)
html_content = extract_html_content(WORKFLOW_FILE)
display(HTML(html_content), metadata=dict(isolated=True))

report_form_workflow.html
