In [1]:
from mongoengine import connect, disconnect
from docai.documents import BillOfLading
from loguru import logger
import os
import pandas as pd 

# Connection settings for the local MongoDB instance.
MONGO_HOST = "mongodb://localhost:27020"

# Keyword arguments unpacked into mongoengine.connect(); the database name
# and the connection alias are both "docai".
MONGO_DB_CONFIG = dict(
    host=MONGO_HOST,
    db="docai",
    alias="docai",
)

def connect_to_mongo():
    """Register the MongoDB connection described by ``MONGO_DB_CONFIG``.

    The connection is left registered on success so that later cells can
    query through it.

    Raises:
        ConnectionError: if ``connect`` raises. The underlying exception is
            chained as ``__cause__`` so the real reason stays visible.
    """
    try:
        connect(**MONGO_DB_CONFIG)
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit are not converted into ConnectionError.
        raise ConnectionError("Could not connect to mongo") from exc

    # No disconnect() here: the previous `finally: disconnect()` ran on the
    # success path as well and, called without an alias, targeted the
    # "default" connection rather than the one registered above.
    # NOTE(review): the underlying client may connect lazily, in which case
    # this message only confirms registration, not reachability — confirm.
    logger.info("Connection to Mongo Successful")

# Run the connection helper now, before the cells that query BillOfLading.
connect_to_mongo()

[32m2024-08-05 08:55:39.893[0m | [1mINFO    [0m | [36m__main__[0m:[36mconnect_to_mongo[0m:[36m14[0m - [1mConnection to Mongo Successful[0m


In [2]:
# File names of the documents to compare.
documents = [
    "60212192-Bill-of-Lading.pdf",
    "258379298-BL-12-20-Box.pdf",
    "429616980-135085879-Bill-of-Lading-Copy-pdf.pdf",
]

# Values matched against meta_info.doc_parser_model in the queries below.
doc_parsers = [
    "pymupdf",
    "llama_parse",
]

# Values matched against meta_info.entity_extractor_model in the queries below.
entity_extractor_models = [
    "llama3.1",
    "gpt-4o",
    "gpt-4o-mini",
]

In [3]:
# Export every stored extraction result as JSON, one file per
# (document, parser, extractor model) combination, written to
# outputs/<doc_parser>/<entity_extractor_model>/<document>.json.
for document in documents:
    for doc_parser in doc_parsers:
        for entity_extractor_model in entity_extractor_models:
            bol = BillOfLading.objects(
                meta_info__document_name=document,
                meta_info__doc_parser_model=doc_parser,
                meta_info__entity_extractor_model=entity_extractor_model,
            ).first()

            # .first() yields None when nothing matches; skip that combination
            # instead of failing on None.to_json() and aborting the whole export.
            if bol is None:
                logger.warning(
                    "No BillOfLading found for {} / {} / {}",
                    document,
                    doc_parser,
                    entity_extractor_model,
                )
                continue

            output_dir = f"outputs/{doc_parser}/{entity_extractor_model}/"
            os.makedirs(output_dir, exist_ok=True)
            file_path = f"{output_dir}{document}.json"
            # Explicit encoding so the result does not depend on the platform default.
            with open(file_path, "w", encoding="utf-8") as json_file:
                json_file.write(bol.to_json())

In [4]:
# Build a comparison table of selected extracted fields, one row per
# (document, parser, extractor model) combination.
OUTPUT_COLUMNS = [
    "document_name",
    "document_parser",
    "entity_extractor_model",
    "shipper_name",
    "shipper_address",
    "port_of_loading",
    "port_of_discharge",
]

values = []
for document in documents:
    for doc_parser in doc_parsers:
        for entity_extractor_model in entity_extractor_models:
            bol = BillOfLading.objects(
                meta_info__document_name=document,
                meta_info__doc_parser_model=doc_parser,
                meta_info__entity_extractor_model=entity_extractor_model,
            ).first()

            # .first() yields None when nothing matches. Keep the row with
            # empty fields so the gap is visible in the table rather than
            # crashing on attribute access.
            if bol is None:
                logger.warning(
                    "No BillOfLading found for {} / {} / {}",
                    document,
                    doc_parser,
                    entity_extractor_model,
                )
                values.append([document, doc_parser, entity_extractor_model, None, None, None, None])
                continue

            # NOTE(review): assumes `shipper` can be unset on a stored record;
            # getattr keeps the row in that case — confirm against the schema.
            shipper = bol.shipper
            values.append([
                document,
                doc_parser,
                entity_extractor_model,
                getattr(shipper, "name", None),
                getattr(shipper, "address", None),
                bol.port_of_loading,
                bol.port_of_discharge,
            ])

# `df` stays bound to the same frame, as before; the earlier empty placeholder
# frame was dropped because it was overwritten here without being used.
output_df = df = pd.DataFrame(values, columns=OUTPUT_COLUMNS)

output_df

Unnamed: 0,document_name,document_parser,entity_extractor_model,shipper_name,shipper_address,port_of_loading,port_of_discharge
0,60212192-Bill-of-Lading.pdf,pymupdf,llama3.1,"JULISSA CASTING, CORP",AV. MONSERRATE FR 11 VILLA FONTANA,"VERACRUZ, MEXICO","VERACRUZ, MEXICO"
1,60212192-Bill-of-Lading.pdf,pymupdf,gpt-4o,"JULISSA CASTING, CORP",AV. MONSERRATE FR 11 VILLA FONTANA,"VERACRUZ, MEXICO","VERACRUZ, MEXICO"
2,60212192-Bill-of-Lading.pdf,pymupdf,gpt-4o-mini,"NESTOR REYES, INC","EDIFICIO ILA, 1055 JF KENNEDY AVE. STE. 801","SAN JUAN, PUERTO RICO","VERACRUZ, MEXICO"
3,60212192-Bill-of-Lading.pdf,llama_parse,llama3.1,JUANA CARACHURE VENCES,"APOLINAR MENDOZA 65, SAN JOSE DEL JARAL","VERACRUZ, MEXICO","SAN JUAN, PUERTO RICO"
4,60212192-Bill-of-Lading.pdf,llama_parse,gpt-4o,JUANA CARACHURE VENCES,"APOLINAR MENDOZA 65, SAN JOSE DEL JARAL","VERACRUZ, MEXICO","SAN JUAN, PUERTO RICO"
5,60212192-Bill-of-Lading.pdf,llama_parse,gpt-4o-mini,JUANA CARACHURE VENCES,"APOLINAR MENDOZA 65, SAN JOSE DEL JARAL","VERACRUZ, MEXICO","SAN JUAN, PUERTO RICO"
6,258379298-BL-12-20-Box.pdf,pymupdf,llama3.1,MIRAGE GRANITO CERAMICO S.P.A.,VIA GIARDINI NORD 225 MO,"LIVORNO, ITALY","MARACAIBO, VENEZUELA"
7,258379298-BL-12-20-Box.pdf,pymupdf,gpt-4o,MIRAGE GRANITO CERAMICO S.P.A.,VIA GIARDINI NORD 225 MO,"LIVORNO, ITALY","MARACAIBO, VENEZUELA"
8,258379298-BL-12-20-Box.pdf,pymupdf,gpt-4o-mini,MIRAGE GRANITO CERAMICO S.P.A.,VIA GIARDINI NORD 225 MO,GENOA,"LIVORNO, ITALY"
9,258379298-BL-12-20-Box.pdf,llama_parse,llama3.1,MIRAGE GRANITO CERAMICO S.P.A.,VIA GIARDINI NORD 225 MO41026 PAVULLO,"LIVORNO, ITALY","MARACAIBO, VENEZUELA"


In [5]:
# Write the comparison table to Excel. The target directory is created first
# because to_excel does not create missing parent directories, so this cell
# would otherwise fail if outputs/ does not exist yet.
os.makedirs("outputs", exist_ok=True)
output_df.to_excel("outputs/entity_extractor_fields_of_interest.xlsx", index=False)