# Set Up LlamaParse (Imports and API Key)

In [1]:
import nest_asyncio
import os

from IPython.utils import docs
from llama_parse import LlamaParse


# Patch asyncio so event loops can be nested; presumably required because the
# notebook kernel already runs a loop while LlamaParse drives its own.
nest_asyncio.apply()

# Prefer a key that is already exported in the environment so that running
# this cell never wipes it out. Paste a key between the quotes only as a local
# fallback, and do not commit it.
LLAMA_CLOUD_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY") or ''
os.environ["LLAMA_CLOUD_API_KEY"] = LLAMA_CLOUD_API_KEY

# Parser

In [106]:
from typing import List, Tuple, Union

from enum import Enum

import re
import pandas as pd


class PDFPartsTypes(Enum):
    """
    Represents the types of PDF parts.
    """

    TEXT = 0   # free-form Markdown text, carried as ``str``
    TABLE = 1  # Markdown table, carried as ``pandas.DataFrame``


# Text fragments of this many characters or fewer are skipped as long as no
# longer fragment has been collected yet (see MDParser.parse_md_doc).
MIN_TEXT_SIZE = 100


class MDParser:
    """
    Splits Markdown pages into text parts and table parts.

    Tables are converted to Pandas DataFrames whose cells stay strings.
    Consecutive tables with the same column count and the same inferred
    column types are treated as one table split across pages and are merged.
    """

    # A table is a run of consecutive lines that start and end with '|'.
    # Rows are anchored to the start of a line so that a pipe in the middle
    # of a sentence cannot open a table; blanks around a row are tolerated.
    _TABLE_PATTERN = re.compile(
        r'((?:^[ \t]*\|[^\n]+\|[ \t\r]*\n)+)',
        re.MULTILINE
    )
    # One cell of the delimiter row under the header: '---', ':--', ':-:' ...
    _SEPARATOR_CELL = re.compile(r':?-+:?')

    @staticmethod
    def _split_md_row(line: str) -> List[str]:
        """
        Splits one table line into stripped cells, keeping empty cells.

        :param line: single Markdown table line, e.g. ``| a | b |``
        :return: cell texts between the outer pipes
        """
        return [cell.strip() for cell in line.strip().split("|")[1:-1]]

    @staticmethod
    def _is_separator_row(line: str) -> bool:
        """
        Tells whether a line is the header delimiter row (``|---|:--:|``).

        :param line: single Markdown table line
        :return: True if every cell consists of dashes with optional colons
        """
        cells = MDParser._split_md_row(line)
        return bool(cells) and all(
            MDParser._SEPARATOR_CELL.fullmatch(cell) for cell in cells
        )

    @staticmethod
    def _parse_md_table(md_table: str) -> Union[pd.DataFrame, None]:
        """
        Parses a Markdown table string and converts it into a Pandas DataFrame.

        The first line supplies the column names. The second line is skipped
        only when it is a delimiter row; otherwise it is kept as data. Empty
        cells are preserved as empty strings so values stay in their columns.

        :param md_table: Markdown string that represents a table
        :return: Pandas DataFrame -- parsed table, or None if the rows do not
                 fit the header (e.g. a row has more cells than columns)
        """
        lines = md_table.strip().split("\n")
        headers = MDParser._split_md_row(lines[0])

        body_lines = lines[1:]
        # Skip the delimiter row only when it really is one, so a table
        # fragment that lacks it does not silently lose its second row
        if body_lines and MDParser._is_separator_row(body_lines[0]):
            body_lines = body_lines[1:]

        data = []
        for line in body_lines:
            cells = MDParser._split_md_row(line)
            # Tolerate stray trailing pipes: drop surplus empty end cells
            while len(cells) > len(headers) and not cells[-1]:
                cells.pop()
            data.append(cells)

        try:
            return pd.DataFrame(data, columns=headers)
        except Exception:
            # Deliberate best effort: the caller falls back to plain text.
            # Unlike a bare except this lets KeyboardInterrupt propagate.
            return None

    @staticmethod
    def _infer_column_types(df: pd.DataFrame) -> List[str]:
        """
        Classifies every column as 'num', 'dt' or 'str' by trial conversion.

        Columns are addressed by position, so duplicated column names are
        classified individually.

        :param df: table whose cells are strings
        :return: one type tag per column, in column order
        """
        inferred_types = []
        for position in range(df.shape[1]):
            column = df.iloc[:, position]
            try:
                pd.to_numeric(column)
            except Exception:
                pass  # not numeric -- try the next candidate type
            else:
                inferred_types.append('num')
                continue
            try:
                pd.to_datetime(column, format='mixed')
            except Exception:
                inferred_types.append('str')
            else:
                inferred_types.append('dt')
        return inferred_types

    @staticmethod
    def _merge_tables(tables: List[pd.DataFrame]) -> List[pd.DataFrame]:
        """
        Merges neighbouring tables that look like pieces of one table.

        Two neighbours are merged when they have the same number of columns
        and the same inferred column types. The input list and its
        DataFrames are left unmodified.

        !!! WARNING !!! It is assumed that the first table carries the right
        column names and that a following similar table has hallucinated
        ones, so the names of the first table win.

        :param tables: tables in document order
        :return: new list with continuation tables concatenated
        """
        merged: List[pd.DataFrame] = []
        # Types of merged[-1]; computed on demand and kept across merges,
        # because merging requires both sides to have identical types
        last_types = None

        for table in tables:
            if not merged or merged[-1].shape[1] != table.shape[1]:
                merged.append(table)
                last_types = None
                continue

            if last_types is None:
                last_types = MDParser._infer_column_types(merged[-1])
            table_types = MDParser._infer_column_types(table)

            if table_types != last_types:
                merged.append(table)
                last_types = table_types
                continue

            # Rename on a copy so the caller's DataFrame keeps its columns
            continuation = table.copy()
            continuation.columns = merged[-1].columns
            merged[-1] = pd.concat([merged[-1], continuation], ignore_index=True)

        return merged

    @staticmethod
    def parse_md_page(md_text: str) -> List[Tuple[Union[pd.DataFrame, str], PDFPartsTypes]]:
        """
        Extracts tables and text from Markdown in the same order they appear.
        Tables are returned as Pandas DataFrames, and text parts as strings.

        A lone pipe-delimited line is kept as text, and so is any table that
        cannot be converted into a DataFrame.

        :param md_text: Markdown string that represents a PDF document or page
        :return: List[Tuple[Union[pd.DataFrame, str], PDFPartsTypes]]
        """
        content = []

        # Splitting on a pattern with one capturing group alternates the
        # result: even positions are text, odd positions are matched tables
        pieces = MDParser._TABLE_PATTERN.split(md_text + "\n")

        for position, piece in enumerate(pieces):
            piece = piece.strip()
            if not piece:
                continue

            if position % 2 == 1 and "\n" in piece:  # multi-line table match
                df = MDParser._parse_md_table(piece)
                if df is not None:
                    content.append((df, PDFPartsTypes.TABLE))
                    continue

            content.append((piece, PDFPartsTypes.TEXT))

        return content

    @staticmethod
    def parse_md_doc(md_texts: List[str]) -> List[Tuple[Union[str, pd.DataFrame], PDFPartsTypes]]:
        """
        Parses all pages of a document into one text part plus merged tables.

        Text fragments of MIN_TEXT_SIZE characters or fewer are skipped until
        the first longer fragment has been kept; from then on every fragment
        is appended. Each kept fragment is followed by a newline.

        :param md_texts: Markdown strings, one per page, in document order
        :return: the combined text part first (only if non-empty), followed
                 by the merged tables in document order
        """
        text_fragments = []
        collected_tables = []

        for md_text in md_texts:
            for pdata, pd_type in MDParser.parse_md_page(md_text):
                if pd_type == PDFPartsTypes.TABLE:
                    collected_tables.append(pdata)
                elif pd_type == PDFPartsTypes.TEXT:
                    pdata = pdata.strip()
                    # Leading small pieces are usually redundant (page
                    # headers and the like), so they are not kept
                    if text_fragments or len(pdata) > MIN_TEXT_SIZE:
                        text_fragments.append(pdata)

        combined_md_text = "".join(fragment + "\n" for fragment in text_fragments)
        collected_tables = MDParser._merge_tables(collected_tables)

        text_parts_processed = []
        if combined_md_text:
            text_parts_processed.append((combined_md_text, PDFPartsTypes.TEXT))
        text_parts_processed.extend(
            (df_table, PDFPartsTypes.TABLE) for df_table in collected_tables
        )

        return text_parts_processed

# Testing "single-page-diff-u_s_tables.pdf"

In [89]:
# Parse the PDF through a vendor multimodal model and get Markdown back.
# Alternative model name tried earlier: "openai-gpt-4o-mini".
doc = LlamaParse(
    vendor_multimodal_model_name="openai-gpt4o",
    use_vendor_multimodal_model=True,
    result_type="markdown",
).load_data("./_data_test_pdfs_custom/single-page-diff-u_s_tables.pdf")
print(f"Result docs number: {len(doc)}")

Started parsing the file under job_id 00e6c69d-17d7-41d4-9212-7691a672151d
Result docs number: 8


In [107]:
# Hand the parser the Markdown text of every returned document, in order
res = MDParser.parse_md_doc([page.text for page in doc])

In [108]:
# Summarise every part: tables by (rows, columns), text by character count.
# Text parts are plain strings and have no .shape, so they need their own branch.
for payload, part_type in res:
    size = payload.shape if part_type is PDFPartsTypes.TABLE else len(payload)
    print(part_type, size)

PDFPartsTypes.TABLE (32, 5)
PDFPartsTypes.TABLE (46, 11)
PDFPartsTypes.TABLE (21, 6)


In [109]:
# Preview the first rows of the first part (a table in this run)
res[0][0].head()

Unnamed: 0,Date,Description,Revenue (USD),Expenses (USD),Net Profit (USD)
0,2023-12-01,Daily Transactions,"$1,200",$955,$245
1,2023-12-02,Daily Transactions,"$1,500","$1,082",$418
2,2023-12-03,Daily Transactions,"$1,100",$861,$239
3,2023-12-04,Daily Transactions,"$1,342","$1,068",$274
4,2023-12-05,Daily Transactions,"$1,410","$1,180",$230


In [110]:
# Preview the first rows of the second part (a table in this run)
res[1][0].head()

Unnamed: 0,Product ID,Product Name,Description,Quantity in Stock,Unit Price,Selling Price,Expiry Date,Date of Purchase,Date of Last Sale,Supplier,Discount
0,1,Apple,"Fresh, crisp apples sourced from local orchard...",94,$1.00,1.24,02-01-2024,26-11-2023,01-12-2023,Fresh Fruit Co.,0.117748952
1,2,Banana,"Ripe, sweet bananas imported from tropical reg...",68,$0.50,0.62,05-12-2023,19-11-2023,01-12-2023,Fresh Fruit Co.,0.081272608
2,3,Bread,"Artisanal bread made with locally-sourced, hig...",10,$2.00,2.48,17-12-2023,25-11-2023,19-11-2023,Bakery Delights,0.158817645
3,4,Milk,"Fresh, creamy milk from pasture-raised cows fo...",95,$3.00,3.72,31-12-2023,24-11-2023,01-12-2023,Dairy Farm,0.011311604
4,5,Egg,"Farm-fresh eggs from free-range chickens, ensu...",91,$0.25,0.31,09-12-2023,23-11-2023,20-11-2023,Eggcellent Suppliers,0.015065837


In [111]:
# Preview the first rows of the third part (a table in this run)
res[2][0].head()

Unnamed: 0,Employee,Phone,Email,Role,Hire Date,Salary
0,Sarah Lee,(415) 555-2671,sarahlee@email.com,Cashier,3-Jan-2018,"$2,000"
1,Tom Johnson,(212) 555-3456,tomjohnson@email.com,Cashier,15-Feb-2019,"$2,000"
2,Jane Smith,(312) 555-7890,janesmith@email.com,Cashier,10-Apr-2017,"$2,000"
3,Mike Johnson,(213) 555-4567,mikejohnson@email.com,Cashier,22-May-2018,"$2,000"
4,John Smith,(305) 555-6789,johnsmith@email.com,CEO,30-Jun-2019,"$20,000"


# Testing "consum_order_payment.pdf"

In [112]:
# Parse the PDF through a vendor multimodal model and get Markdown back.
# Alternative model name tried earlier: "openai-gpt-4o-mini".
doc = LlamaParse(
    vendor_multimodal_model_name="openai-gpt4o",
    use_vendor_multimodal_model=True,
    result_type="markdown",
).load_data("./_data_test_pdfs_custom/consum_order_payment.pdf")
print(f"Result docs number: {len(doc)}")

Started parsing the file under job_id 110410e1-2f31-4cdf-9871-7f534319fbb4
Result docs number: 3


In [113]:
# Show the raw Markdown of the first returned document
print(doc[0].text)


| Payment ID | Order ID | Loyalty Card | Loyalty Amount Applied | Customer       | Amount Payed | Date       | Type   | Card       | Invoice ID | Status     |
|------------|----------|--------------|------------------------|----------------|--------------|------------|--------|------------|------------|------------|
| 209864     | 1        | -            |                        | John Smith     | $50.00       | 01-May-21  | Cash   |            | 656625     | Completed  |
| 615068     | 3        | 401364       | 5                      | Bob Johnson    | $15          | 03-May-21  | Online | Visa       | 129687     | Failed     |
| 889065     | 4        | -            |                        | Mary Brown     | $30.00       | 04-May-21  | Cash   |            | 429780     | Completed  |
| 743665     | 5        | 245888       |                        | David Lee      | $15.00       | 05-May-21  | Cash   |            | 960672     | Pending    |
| 541305     | 8        | 417384       | 3   

In [114]:
# Show the raw Markdown of the second returned document
print(doc[1].text)


| Order ID | Status      | Customer        | Cashier     | Total | Date      | Items                          | Notes                                           |
|----------|-------------|-----------------|-------------|-------|-----------|--------------------------------|-------------------------------------------------|
| 1        | Completed   | John Smith      | Sarah Lee   | $50   | 01-May-21 | Apples, Bananas, Bread         |                                                 |
| 2        | Completed   | Jane Doe        | Tom Johnson | $75   | 02-May-21 | Milk, Eggs, Cheese, Yogurt     |                                                 |
| 3        | In Progress | Bob Johnson     | Sarah Lee   | $20   | 03-May-21 | Chips, Salsa                   |                                                 |
| 4        | Completed   | Mary Brown      | Tom Johnson | $30   | 04-May-21 | Chicken, Broccoli              |                                                 |
| 5        | Completed   | 

In [115]:
# Show the raw Markdown of the third returned document
print(doc[2].text)


|    | Status      | Customer Name   | Salesperson Name | Amount | Date      | Items                      | Notes                                 |
|----|-------------|-----------------|------------------|--------|-----------|----------------------------|---------------------------------------|
| 29 | Pending     | Ava Williams    | Sarah Lee        | $160   | 09-Jan-22 | Coffee, Sugar, Cream       |                                       |
| 30 | Completed   | Michael Davis   | Tom Johnson      | $110   | 10-Jan-22 | Tea, Honey, Lemon          |                                       |
| 31 | Completed   | Sarah Lee       | Tom Johnson      | $150   | 01-Jan-22 | Apples, Milk, Bread        |                                       |
| 32 | In Progress | Jane Smith      | Mike Johnson     | $200   | 01-Feb-22 | Eggs, Butter, Cheese       |                                       |
| 33 | Delivered   | Mike Johnson    | Sarah Lee        | $120   | 01-Mar-22 | Rice, Chicken, Oranges     |    

In [116]:
# Hand the parser the Markdown text of every returned document, in order
res = MDParser.parse_md_doc([page.text for page in doc])

In [118]:
# Summarise every part: tables by (rows, columns), text by character count
for inx, (payload, part_type) in enumerate(res):
    size = payload.shape if part_type is PDFPartsTypes.TABLE else len(payload)
    print(f"{inx}:", part_type, size)

0: PDFPartsTypes.TABLE (25, 11)
1: PDFPartsTypes.TABLE (50, 8)


In [119]:
# Preview the first rows of the first part (a table in this run)
res[0][0].head()

Unnamed: 0,Payment ID,Order ID,Loyalty Card,Loyalty Amount Applied,Customer,Amount Payed,Date,Type,Card,Invoice ID,Status
0,209864,1,-,,John Smith,$50.00,01-May-21,Cash,,656625,Completed
1,615068,3,401364,5.0,Bob Johnson,$15,03-May-21,Online,Visa,129687,Failed
2,889065,4,-,,Mary Brown,$30.00,04-May-21,Cash,,429780,Completed
3,743665,5,245888,,David Lee,$15.00,05-May-21,Cash,,960672,Pending
4,541305,8,417384,3.0,Emily Davis,$22,08-May-21,Online,MasterCard,892412,Pending


In [121]:
# Preview the first rows of the second part (a table in this run)
res[1][0].head()

Unnamed: 0,Order ID,Status,Customer,Cashier,Total,Date,Items,Notes
0,1,Completed,John Smith,Sarah Lee,$50,01-May-21,"Apples, Bananas, Bread",
1,2,Completed,Jane Doe,Tom Johnson,$75,02-May-21,"Milk, Eggs, Cheese, Yogurt",
2,3,In Progress,Bob Johnson,Sarah Lee,$20,03-May-21,"Chips, Salsa",
3,4,Completed,Mary Brown,Tom Johnson,$30,04-May-21,"Chicken, Broccoli",
4,5,Completed,David Lee,Sarah Lee,$15,05-May-21,"Soda, Chips",


# Testing "consum_combined.pdf"

In [122]:
# Parse the PDF through a vendor multimodal model and get Markdown back.
# Alternative model name tried earlier: "openai-gpt-4o-mini".
doc = LlamaParse(
    vendor_multimodal_model_name="openai-gpt4o",
    use_vendor_multimodal_model=True,
    result_type="markdown",
).load_data("./_data_test_pdfs_custom/consum_combined.pdf")
print(f"Result docs number: {len(doc)}")

Started parsing the file under job_id e9141a04-93da-4152-9d4e-ad0e43e256e8
Result docs number: 3


In [123]:
# Show the raw Markdown of the first returned document
print(doc[0].text)

# Consum Worker Policy

1. **Introduction**

   Welcome to Consum! Our Worker Policy ensures a supportive, efficient, and safe work environment. We are committed to upholding professionalism, respect, and excellence in all aspects of our operations.

2. **Equal Opportunity and Diversity**

   Consum is an equal opportunity employer. We embrace diversity and strive to create an inclusive environment for all employees.

3. **Health and Safety**

   Employee health and safety are paramount. Adherence to safety protocols, including food handling and hygiene practices, is required.

4. **Professional Conduct**

   Employees are expected to maintain professionalism, respect, and a customer-first approach at all times.

5. **Attendance and Punctuality**

   Reliable attendance and punctuality are essential. Timely communication is required in case of delays or absences.

6. **Confidentiality and Data Protection**

   All employees must respect the confidentiality of company and customer infor

In [124]:
# Show the raw Markdown of the second returned document
print(doc[1].text)

Annual Leave: Employees are entitled to 24 days of paid vacation annually.

Public Holidays: All national public holidays are observed with paid leave.

Personal Days: 7 paid personal days per year for personal matters or emergencies.

### 14. Special Benefits

- **Health and Wellness Programs:** Access to gym memberships, yoga classes, and wellness workshops.
- **Professional Growth Fund:** An annual allowance for courses or seminars related to personal and professional development.
- **Family Support:** Enhanced maternity/paternity leave and flexible scheduling for parents.
- **Employee Discount:** Generous discounts on all Consum products.
- **Team Building Activities:** Regular company outings and team-building events to foster a sense of community.
- **Sustainability Incentives:** Rewards for employees who contribute to our sustainability initiatives, like recycling and energy conservation.

### 15. Retirement Plan

- **Pension Scheme:** A competitive pension plan to support your 

In [125]:
# Show the raw Markdown of the third returned document
print(doc[2].text)

| Patricia Garcia   | (333) 444-5555 | patriciagarcia@email.com     |
|-------------------|----------------|-----------------------------|
| Paul Lewis        | (888) 999-0000 | paullewis@email.com         |
| Susan Walker      | (999) 000-1111 | susanwalker@email.com       |
| Xander Blackwood  | (415) 555-2672 | xander.blackwood@email.com  |
| Luna Nightingale  | (212) 555-3453 | luna.nightingale@email.com  |
| Phoenix Blaze     | (312) 555-7891 | phoenix.blaze@email.com     |
| Nova Starlight    | (213) 555-4563 | nova.starlight@email.com    |

# Table of Consum Employee Salaries, Their Roles and Hire Date.

| Employee          | Role                        | Hire Date   | Salary  |
|-------------------|-----------------------------|-------------|---------|
| Sarah Lee         | Cashier                     | 3-Jan-2018  | $2,000  |
| Tom Johnson       | Cashier                     | 15-Feb-2019 | $2,000  |
| Jane Smith        | Cashier                     | 10-Apr-2017 | $2,000  |
|

In [126]:
# Hand the parser the Markdown text of every returned document, in order
res = MDParser.parse_md_doc([page.text for page in doc])

In [127]:
# Summarise every part: tables by (rows, columns), text by character count
for inx, (payload, part_type) in enumerate(res):
    size = payload.shape if part_type is PDFPartsTypes.TABLE else len(payload)
    print(f"{inx}:", part_type, size)

0: PDFPartsTypes.TEXT 3151
1: PDFPartsTypes.TABLE (20, 3)
2: PDFPartsTypes.TABLE (21, 4)


In [128]:
# Print the payload of the first part (the combined text part in this run)
print(res[0][0])

# Consum Worker Policy

1. **Introduction**

   Welcome to Consum! Our Worker Policy ensures a supportive, efficient, and safe work environment. We are committed to upholding professionalism, respect, and excellence in all aspects of our operations.

2. **Equal Opportunity and Diversity**

   Consum is an equal opportunity employer. We embrace diversity and strive to create an inclusive environment for all employees.

3. **Health and Safety**

   Employee health and safety are paramount. Adherence to safety protocols, including food handling and hygiene practices, is required.

4. **Professional Conduct**

   Employees are expected to maintain professionalism, respect, and a customer-first approach at all times.

5. **Attendance and Punctuality**

   Reliable attendance and punctuality are essential. Timely communication is required in case of delays or absences.

6. **Confidentiality and Data Protection**

   All employees must respect the confidentiality of company and customer infor

In [129]:
# Preview the first rows of the second part (a table in this run)
res[1][0].head()

Unnamed: 0,Employee,Phone,Email
0,Sarah Lee,(415) 555-2671,sarahlee@email.com
1,Tom Johnson,(212) 555-3456,tomjohnson@email.com
2,Jane Smith,(312) 555-7890,janesmith@email.com
3,Mike Johnson,(213) 555-4567,mikejohnson@email.com
4,John Smith,(305) 555-6789,johnsmith@email.com


In [131]:
# Preview the first rows of the third part (a table in this run)
res[2][0].head()

Unnamed: 0,Employee,Role,Hire Date,Salary
0,Sarah Lee,Cashier,3-Jan-2018,"$2,000"
1,Tom Johnson,Cashier,15-Feb-2019,"$2,000"
2,Jane Smith,Cashier,10-Apr-2017,"$2,000"
3,Mike Johnson,Cashier,22-May-2018,"$2,000"
4,John Smith,CEO,30-Jun-2019,"$20,000"


# Testing "autozaz_details_orders - orders_real.pdf"

In [132]:
# Parse the PDF through a vendor multimodal model and get Markdown back.
# Alternative model name tried earlier: "openai-gpt-4o-mini".
doc = LlamaParse(
    vendor_multimodal_model_name="openai-gpt4o",
    use_vendor_multimodal_model=True,
    result_type="markdown",
).load_data("./_data_test_pdfs_custom/autozaz_details_orders - orders_real.pdf")
print(f"Result docs number: {len(doc)}")

Started parsing the file under job_id d4b22a85-4b7d-4743-ab72-df8e50fc7f72
Result docs number: 3


In [133]:
# Show the raw Markdown of the first returned document
print(doc[0].text)


| order_id | product_name       | price_per_unit | units_sold | amount_of_discount_total | deal_date   | customer_name  |
|----------|--------------------|----------------|------------|--------------------------|-------------|----------------|
| 1        | Alternator         | 350            | 5          | 157.5                    | 01/01/2024  | Emily Johnson  |
| 2        | Engine Block       | 500            | 5          | 100                      | 01/02/2024  | Emily Johnson  |
| 3        | Engine Block       | 500            | 4          | 120                      | 01/03/2024  | Scott Nilson   |
| 4        | Clutch Plate       | 180            | 1          | 7.2                      | 01/04/2024  | Scott Nilson   |
| 5        | Transmission Gear  | 300            | 9          | 189                      | 01/05/2024  | Scott Nilson   |
| 6        | Steering Wheel     | 100            | 3          | 27                       | 01/06/2024  | Scott Nilson   |
| 7        | Exhaust Ma

In [134]:
# Show the raw Markdown of the second returned document
print(doc[1].text)

|    | Item               | Price | Quantity | Total | Date       | Employee      |
|----|--------------------|-------|----------|-------|------------|---------------|
| 38 | Clutch Plate       | 180   | 6        | 64.8  | 02/16/2023 | Emily Johnson |
| 39 | Brake Pad          | 75    | 2        | 7.5   | 02/17/2023 | Emily Johnson |
| 40 | Steering Wheel     | 100   | 3        | 6     | 02/18/2023 | Emily Johnson |
| 41 | Exhaust Manifold   | 220   | 5        | 55    | 02/19/2023 | Emily Johnson |
| 42 | Exhaust Manifold   | 220   | 7        | 46.2  | 02/20/2023 | Emily Johnson |
| 43 | Engine Block       | 500   | 4        | 180   | 02/21/2023 | Emily Johnson |
| 44 | Transmission Gear  | 300   | 3        | 18    | 02/22/2023 | Emily Johnson |
| 45 | Engine Block       | 500   | 3        | 90    | 02/23/2023 | Emily Johnson |
| 46 | Suspension Spring  | 150   | 1        | 3     | 02/24/2023 | Emily Johnson |
| 47 | Exhaust Manifold   | 220   | 2        | 17.6  | 05/06/2023 | Emily Jo

In [135]:
# Show the raw Markdown of the third returned document
print(doc[2].text)

|    | Item               | Price | Quantity | Total  | Date       | Salesperson    |
|----|--------------------|-------|----------|--------|------------|----------------|
| 76 | Brake Pad          | 75    | 1        | 6      | 06/03/2023 | Tom Yam        |
| 77 | Alternator         | 350   | 4        | 70     | 06/04/2023 | Tom Yam        |
| 78 | Radiator           | 200   | 3        | 42     | 06/05/2023 | Tom Yam        |
| 79 | Clutch Plate       | 180   | 3        | 43.2   | 06/06/2023 | Tom Yam        |
| 80 | Suspension Spring  | 150   | 2        | 15     | 06/07/2023 | Tom Yam        |
| 81 | Exhaust Manifold   | 220   | 9        | 158.4  | 06/08/2023 | Tom Yam        |
| 82 | Engine Block       | 500   | 3        | 150    | 06/09/2023 | Tom Yam        |
| 83 | Engine Block       | 500   | 9        | 405    | 06/10/2023 | Emily Johnson  |
| 84 | Engine Block       | 500   | 6        | 180    | 06/11/2023 | Emily Johnson  |
| 85 | Radiator           | 200   | 5        | 70     

In [136]:
# Hand the parser the Markdown text of every returned document, in order
res = MDParser.parse_md_doc([page.text for page in doc])

In [137]:
# Summarise every part: tables by (rows, columns), text by character count
for inx, (payload, part_type) in enumerate(res):
    size = payload.shape if part_type is PDFPartsTypes.TABLE else len(payload)
    print(f"{inx}:", part_type, size)

0: PDFPartsTypes.TABLE (100, 7)


In [138]:
# Preview the first rows of the first part (the single merged table in this run)
res[0][0].head()

Unnamed: 0,order_id,product_name,price_per_unit,units_sold,amount_of_discount_total,deal_date,customer_name
0,1,Alternator,350,5,157.5,01/01/2024,Emily Johnson
1,2,Engine Block,500,5,100.0,01/02/2024,Emily Johnson
2,3,Engine Block,500,4,120.0,01/03/2024,Scott Nilson
3,4,Clutch Plate,180,1,7.2,01/04/2024,Scott Nilson
4,5,Transmission Gear,300,9,189.0,01/05/2024,Scott Nilson
