### Mega Parse

In [10]:
from pathlib import Path
import sys
sys.path.append('..')
from megaparse.Converter import MegaParse
import os 

api_key: str | None = os.getenv("LLAMA_CLOUD_API_KEY")

converter = MegaParse(file_path="../megaparse/tests/input_tests/MegaFake_report.pdf", llama_parse_api_key=api_key)
md_content = converter.convert()
converter.save_md(md_content, Path("../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md"))


converter = MegaParse(file_path="../megaparse/tests/input_tests/MegaFake_report.pdf")
md_content = converter.convert()
converter.save_md(md_content, Path("../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md"))


Started parsing the file under job_id febd501f-09c4-497a-9152-5c36d12db1cf
Parsing table <table><thead><th>My Mega fake</th><th>report</th><th rowspan="2">|#1756394 31/05/2024</th></thead><thead><th></th><th></th><th></th></thead></table>
Table | My Mega fake | report | |#1756394 31/05/2024 |
|--------------|--------|----------------------|
|              |        |                      |

 improved
Parsing Title
{'type': 'Title', 'element_id': '816d843b751910db0a06ba8f1ffd7fc8', 'text': 'Why Mega Parse might be the best ?', 'metadata': {'detection_class_prob': 0.6813323497772217, 'coordinates': {'points': ((199.20834350585938, 463.5521545410156), (199.20834350585938, 543.973876953125), (1387.9010177670898, 543.973876953125), (1387.9010177670898, 463.5521545410156)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-06-01T21:53:28', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': '../megaparse/tests/inp

### LLama Parse

In [2]:
from typing import List
from llama_index.core.schema import Document
import nest_asyncio

nest_asyncio.apply()
#GET LLAMA_CLOUD_API_KEY
import os
from llama_parse import LlamaParse
from llama_parse.utils import ResultType, Language

api_key: str | None = os.getenv("LLAMA_CLOUD_API_KEY")

parsing_instructions = "Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. Keep the same format for similar tables."

parser = LlamaParse(
    api_key=str(api_key), 
    result_type=ResultType.MD,
    gpt4o_mode=True,
    verbose=True,
    language=Language.FRENCH,
    parsing_instruction=parsing_instructions,  # Optionally you can define a parsing instruction
)
# sync
documents: List[Document] = parser.load_data("../megaparse/tests/input_tests/MegaFake_report.pdf")

with open("../megaparse/tests/output_tests/MegaFake_report_llama.md", "w") as f:
        f.write(documents[0].get_content())


Started parsing the file under job_id 7c1d3024-8724-488e-87ff-625275e1824d


### Unstructured

In [3]:
from langchain_community.document_loaders import UnstructuredPDFLoader
loader = UnstructuredPDFLoader("../megaparse/tests/input_tests/MegaFake_report.pdf", strategy="hi_res", infer_table_structure=True,
)
data = loader.load()

In [4]:
with open("../megaparse/tests/output_tests/MegaFake_report_unstructured.md", "w") as f:
        f.write(data[0].page_content)

### Evaluation with Diff Lib

In [7]:
import difflib

def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()

def compare_files(source_path, target_path):
    source_lines = read_file(source_path)
    target_lines = read_file(target_path)

    diff = difflib.unified_diff(
    source_lines,
    target_lines,
    fromfile='target.md',
    tofile='generated.md',
    lineterm=''
    )

    modifications = 0
    for line in diff:
        #print(line)
        if line.startswith('+') and not line.startswith('+++'):
            modifications += 1
        elif line.startswith('-') and not line.startswith('---'):
            modifications += 1

    return modifications
    
diff_megaparse_unstructured = compare_files("../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md", "../megaparse/tests/output_tests/MegaFake_report.md")
diff_megaparse_llama = compare_files("../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md", "../megaparse/tests/output_tests/MegaFake_report.md")
diff_llamaparse = compare_files("../megaparse/tests/output_tests/MegaFake_report_llama.md", "../megaparse/tests/output_tests/MegaFake_report.md")
diff_unstructured = compare_files("../megaparse/tests/output_tests/MegaFake_report_unstructured.md", "../megaparse/tests/output_tests/MegaFake_report.md")

In [9]:
print(f"Diff megaparse unstructured: {diff_megaparse_unstructured}")
print(f"Diff megaparse llama: {diff_megaparse_llama}")
print(f"Diff llama parse: {diff_llamaparse}")



Diff megaparse unstructured: 120
Diff megaparse llama: 26
Diff llama parse: 31
