### Mega Parse

In [1]:
from pathlib import Path
import sys
sys.path.append('..')
from megaparse.Converter import MegaParse
import os 

api_key: str | None = os.getenv("LLAMA_CLOUD_API_KEY")

# The same PDF is converted three times. Each run is described by:
#   (MegaParse constructor kwargs, convert() kwargs, markdown output path)
# The first two runs pass the LlamaParse API key; the last run passes no key
# and its output file is labelled "unstructured_parse".
input_pdf = "../megaparse/tests/input_tests/MegaFake_report.pdf"
conversion_runs = [
    (
        {"llama_parse_api_key": api_key},
        {},
        "../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md",
    ),
    (
        {"llama_parse_api_key": api_key},
        {"gpt4o_cleaner": True},
        "../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse_gptcleaner.md",
    ),
    (
        {},
        {},
        "../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md",
    ),
]

for init_kwargs, convert_kwargs, output_md in conversion_runs:
    # A fresh converter per run, exactly as in the original copy-pasted sequence.
    converter = MegaParse(file_path=input_pdf, **init_kwargs)
    md_content = converter.convert(**convert_kwargs)
    converter.save_md(md_content, Path(output_md))


Started parsing the file under job_id e5e0367d-2f83-4e4d-84e5-4d5df7119516
Started parsing the file under job_id 0b5d66aa-bbab-454b-b256-82495d20f91f


Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### LlamaParse

In [2]:
from typing import List
from llama_index.core.schema import Document
import nest_asyncio

nest_asyncio.apply()
#GET LLAMA_CLOUD_API_KEY
import os
from llama_parse import LlamaParse
from llama_parse.utils import ResultType, Language

api_key: str | None = os.getenv("LLAMA_CLOUD_API_KEY")
if not api_key:
    # Without this check, str(None) would pass the literal string "None" to
    # LlamaParse as the API key; stop here with a clear message instead.
    raise RuntimeError("LLAMA_CLOUD_API_KEY environment variable is not set")

parsing_instructions = "Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. Keep the same format for similar tables."

parser = LlamaParse(
    api_key=api_key,
    result_type=ResultType.MD,
    gpt4o_mode=True,
    verbose=True,
    language=Language.FRENCH,
    parsing_instruction=parsing_instructions,  # Optionally you can define a parsing instruction
)
# sync
documents: List[Document] = parser.load_data("../megaparse/tests/input_tests/MegaFake_report.pdf")
if not documents:
    # Make an empty result explicit rather than failing with a bare IndexError below.
    raise RuntimeError("LlamaParse returned no documents for MegaFake_report.pdf")

# Only the first returned document is written. UTF-8 is forced because the
# evaluation cell reads this file back with encoding='utf-8'.
with open("../megaparse/tests/output_tests/MegaFake_report_llama.md", "w", encoding="utf-8") as f:
    f.write(documents[0].get_content())


Started parsing the file under job_id f78ee794-ffde-4e0a-938d-987f1b22cfcb


### Unstructured

In [3]:
from langchain_community.document_loaders import UnstructuredPDFLoader
# Parse the benchmark PDF with the "hi_res" strategy and table-structure
# inference enabled; `data` holds the loaded documents for the next cell.
loader = UnstructuredPDFLoader(
    "../megaparse/tests/input_tests/MegaFake_report.pdf",
    strategy="hi_res",
    infer_table_structure=True,
)
data = loader.load()

In [4]:
# Save the text of the first loaded document. UTF-8 is forced because the
# evaluation cell reads this file back with encoding='utf-8'; relying on the
# platform default encoding could write bytes that read_file cannot decode.
with open("../megaparse/tests/output_tests/MegaFake_report_unstructured.md", "w", encoding="utf-8") as f:
    f.write(data[0].page_content)

### Evaluation with difflib

In [11]:
import difflib
def read_file(file_path):
    """Return the lines of a UTF-8 text file, line endings included."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()


def compare_files(source_path, target_path, with_formatting=False):
    """Count the lines that differ between two text files.

    Args:
        source_path: Path of the first file (the generated output in this notebook).
        target_path: Path of the second file (the reference in this notebook).
        with_formatting: When False (default), every "*" character is removed
            from both files before comparing, so differences that consist only
            of asterisks are ignored.

    Returns:
        The number of lines that a line-based diff would remove from the source
        plus the number it would add from the target. A changed line therefore
        counts twice (one removal, one addition). Identical inputs give 0.
    """
    source_lines = read_file(source_path)
    target_lines = read_file(target_path)
    if not with_formatting:
        source_lines = [line.replace("*", "") for line in source_lines]
        target_lines = [line.replace("*", "") for line in target_lines]

    # Count straight from the matcher's opcodes instead of parsing unified-diff
    # text. Telling "---"/"+++" header lines apart from content by prefix also
    # drops real changes whose content starts with "--" or "++" (for example a
    # removed Markdown rule "---" is rendered as "----" in a unified diff).
    matcher = difflib.SequenceMatcher(None, source_lines, target_lines)
    modifications = 0
    for tag, src_start, src_end, tgt_start, tgt_end in matcher.get_opcodes():
        if tag != 'equal':
            # replace/delete contribute removed lines; replace/insert contribute added lines.
            modifications += (src_end - src_start) + (tgt_end - tgt_start)

    return modifications
    
# Every generated file is scored against the same reference markdown file.
reference_md = "../megaparse/tests/output_tests/MegaFake_report.md"

diff_megaparse_unstructured = compare_files(
    "../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md",
    reference_md,
)
diff_megaparse_llama_gptcleaner = compare_files(
    "../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse_gptcleaner.md",
    reference_md,
)
diff_megaparse_llama = compare_files(
    "../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md",
    reference_md,
)
diff_llamaparse = compare_files(
    "../megaparse/tests/output_tests/MegaFake_report_llama.md",
    reference_md,
)
diff_unstructured = compare_files(
    "../megaparse/tests/output_tests/MegaFake_report_unstructured.md",
    reference_md,
)
# NOTE(review): this input file is not written by any cell visible in this
# notebook — confirm where MegaFake_report_llm_megaparse.md comes from.
diff_megaparse_llm = compare_files(
    "../megaparse/tests/output_tests/MegaFake_report_llm_megaparse.md",
    reference_md,
)

In [12]:
# Map the label shown in the README table to the diff count of each parser.
diff_results = {
    "**Megaparse**": diff_megaparse_unstructured,
    "Megaparse with LLamaParse": diff_megaparse_llama,
    "Megaparse with LLamaParse and GPTCleaner": diff_megaparse_llama_gptcleaner,
    "LMM megaparse": diff_megaparse_llm,
    "LLama Parse": diff_llamaparse
}

# Sort the results: smallest diff count first
sorted_diff_results = sorted(diff_results.items(), key=lambda x: x[1])

# Generate a markdown table with the results. A generator expression is used so
# no loop variable leaks into the notebook namespace (a `parser` loop variable
# would overwrite the LlamaParse instance of the same name).
benchmark_results = "| Parser | Diff |\n|---|---|\n" + "".join(
    f"| {parser_name} | {diff_count} |\n"
    for parser_name, diff_count in sorted_diff_results
)

# Update README.md file (explicit UTF-8 so the result does not depend on the
# platform's default encoding).
with open("../README.md", "r", encoding="utf-8") as readme_file:
    readme_content = readme_file.read()

start_marker = "<!---BENCHMARK-->"
end_marker = "<!---END_BENCHMARK-->"

# str.find returns -1 when a marker is absent; slicing with that value would
# silently write a corrupted README, so refuse to continue instead.
start_marker_index = readme_content.find(start_marker)
if start_marker_index == -1:
    raise ValueError(f"{start_marker} not found in ../README.md")
start_index = start_marker_index + len(start_marker)

# Look for the end marker only after the start marker so the replaced region
# is always the one between the two markers.
end_index = readme_content.find(end_marker, start_index)
if end_index == -1:
    raise ValueError(f"{end_marker} not found after {start_marker} in ../README.md")

# Keep both markers in place and replace only what lies between them.
updated_readme_content = readme_content[:start_index] + "\n" + benchmark_results + readme_content[end_index:]

with open("../README.md", "w", encoding="utf-8") as readme_file:
    readme_file.write(updated_readme_content)