# Load Parsing Data

In [1]:
import nest_asyncio
import os

from IPython.utils import docs
from llama_parse import LlamaParse


nest_asyncio.apply()

LLAMA_CLOUD_API_KEY = 'llx-DfP7SdMqvrHBMQZ9LhPVZvABLjFMO1evLL79ncWk7JVEBwgw'
os.environ["LLAMA_CLOUD_API_KEY"] = LLAMA_CLOUD_API_KEY

In [2]:
doc = LlamaParse(
    result_type="markdown",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="openai-gpt-4o-mini",
    # vendor_multimodal_model_name="openai-gpt4o",
).load_data("../PDF_parsing_libs_recap/_data_test_pdfs/single-page-diff-u_s_tables.pdf")
print(f"Result docs number: {len(doc)}")

Error while parsing the file '../PDF_parsing_libs_recap/_data_test_pdfs/single-page-diff-u_s_tables.pdf': [Errno 2] No such file or directory: 'D:/documents/python_projects/py_libs_recap/Markdown_parsing/bugs/../PDF_parsing_libs_recap/_data_test_pdfs/single-page-diff-u_s_tables.pdf'
Result docs number: 0


# Parser

In [3]:
from typing import List, Tuple, Union

from enum import Enum

import re
import pandas as pd


class PDFPartsTypes(Enum):
    """
    Represents the types of PDF parts.
    """

    TEXT = 0
    TABLE = 1
    

MIN_TEXT_SIZE = 100


class MDParser:
    @staticmethod
    def _parse_md_table(md_table: str) -> pd.DataFrame:
        """
        Parses a Markdown table string and converts it into a Pandas DataFrame.

        :param md_table: Markdown string which represent a table
        :return: Pandas DataFrame -- parsed table
        """
        lines = md_table.strip().split("\n")
        headers = [h.strip() for h in re.split(r'\s*\|\s*', lines[0]) if h]  # Split header row by '|'
        rows = [re.split(r'\s*\|\s*', line) for line in lines[2:]]  # Split data rows by '|'
        data = [[cell.strip() for cell in row if cell] for row in rows]  # Clean up cells
        
        # fix columns and data inconsistency
        # it if weak fixing of possible bug
        if data and len(headers) != len(data[0]):
            headers = headers[:len(data[0])]
        return pd.DataFrame(data, columns=headers)
    
    @staticmethod
    def _merge_tables(tables: List[pd.DataFrame]) -> List[pd.DataFrame]:
        def infer_column_types(df):
            inferred_types = []
            for col in df.columns:
                try:
                    # Try numeric conversion
                    pd.to_numeric(df[col])
                    inferred_types.append('num')
                except:
                    try:
                        # Try datetime conversion
                        pd.to_datetime(df[col])
                        inferred_types.append('dt')
                    except:
                        inferred_types.append('str')
            return inferred_types
        
        inx = 0
        
        while (inx + 1) < len(tables):
            # try to merge tables[inx] and tables[inx + 1]
            # check if they have the same number of columns
            # check if their columns have the similar type
            # !!! WARNING !!! It was supposed that the first table has the right columns names
            # !!! WARNING !!! and following similar table has hallucinated columns names
            if tables[inx].shape[1] == tables[inx + 1].shape[1]:
                col_types_t0 = infer_column_types(tables[inx])
                col_types_t1 = infer_column_types(tables[inx + 1])
                if col_types_t0 == col_types_t1:
                    tables[inx] = pd.concat([tables[inx], tables[inx + 1]], ignore_index=True).reset_index(drop=True)
                    del tables[inx + 1]
                    continue # prevent considering the next table
            inx += 1
        return tables

    @staticmethod
    def parse_md_page(md_text: str) -> List[Tuple[Union[pd.DataFrame, str], PDFPartsTypes]]:
        """
        Extracts tables and text from Markdown in the same order they appear.
        Tables are returned as Pandas DataFrames, and text parts as strings.

        :param md_text: Markdown string which represent PDF-document or PDF-page
        :return: List[Tuple[Union[pd.DataFrame, str], PDFPartsTypes]]
        """
        content = []

        # Regular expression to match Markdown tables, allowing for optional spaces and missing cells
        table_pattern = re.compile(
            r'(\|[^\n]+\|\n(?:\|[^\n]+\|\n)*)',
            re.MULTILINE
        )
        
        # Split the text by tables
        parts = table_pattern.split(md_text)

        for part in parts:
            part = part.strip()
            if table_pattern.match(part):  # It's a table
                df = MDParser._parse_md_table(part)
                content.append((df, PDFPartsTypes.TABLE))
            elif part:  # Non-empty text
                content.append((str(part), PDFPartsTypes.TEXT))

        return content

    @staticmethod
    def parse_md_doc(md_texts: List[str]) -> List[Tuple[Union[str, pd.DataFrame], PDFPartsTypes]]:
        text_parts_processed = []
        page_parts = []
        combined_md_text = ""
        collected_tables = []
        
        # Here might be a table parsing logic which use table position in text,
        # so I keep pages after collecting tables
        for md_text in md_texts:
            page_parts.append(MDParser.parse_md_page(md_text))

        # Cleaning redundant text parts and table combining procedure
        for page_part in page_parts:
            for pdata, pd_type in page_part:
                if pd_type == PDFPartsTypes.TEXT:
                    # DO NOT add to final text too small pieces of text
                    # usually they are redundant
                    pdata = pdata.strip()
                    combined_md_text += pdata + "\n" if (len(combined_md_text) > 0 or len(pdata) > MIN_TEXT_SIZE) else ""
                elif pd_type == PDFPartsTypes.TABLE:
                    collected_tables.append(pdata)
        collected_tables = MDParser._merge_tables(collected_tables)
        
        if combined_md_text:
            text_parts_processed.append((combined_md_text, PDFPartsTypes.TEXT))
        text_parts_processed.extend([(df_table, PDFPartsTypes.TABLE) for df_table in collected_tables])
        
        return text_parts_processed

In [4]:
res = MDParser.parse_md_doc([doc[inx].text for inx in range(len(doc))])

  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])
  pd.to_datetime(df[col])


In [5]:
for res_single in res:
    print(res_single[0].shape)

(32, 5)
(8, 11)
(9, 11)
(8, 10)
(9, 11)
(8, 11)
(4, 11)
(21, 6)


In [None]:
my_list = [1, 2, 1, 1]

if all(2 == x for x in my_list):
    print("All elements are the same.")
else:
    print("Elements are different.")

In [None]:
print(doc[4].text)