Background: Read this [article](https://medium.com/@shivarama/layoutlmv3-from-zero-to-hero-part-1-85d05818eec4) and the [HuggingFace documentation](https://huggingface.co/docs/transformers/en/model_doc/layoutlmv3) to learn about our Layoutlmv3 model architecture

Here is the link to the shared project folder: https://drive.google.com/drive/folders/12W7B73S5dwu9HEEsS0iJ6wu9maHXYLdg

Please add the link as a folder in /content/drive/MyDrive/

# Setup

## Imports

In [2]:
# Create requirements.txt file
with open('requirements.txt', 'w') as f:
    f.write('''pandas
numpy
matplotlib
scikit-learn
opencv-python
pytesseract
lxml
shapely
Pillow
pdf2image
tqdm
seaborn
jupyter
ipykernel
evaluate
git+https://github.com/huggingface/transformers.git
datasets
seqeval
accelerate
gdown
''')

# Install dependencies using pip
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
#!pip install -q pandas numpy tqdm shapely pillow pytesseract lxml scikit-learn datasets evaluate

In [3]:
#!sudo apt-get update
#!sudo apt-get install tesseract-ocr

In [3]:
import os
import json
import glob
import ast
import re
import pandas as pd
import numpy as np
import datasets
import gdown
from tqdm import tqdm
from lxml import etree
from PIL import Image
from pathlib import Path
from pytesseract import pytesseract
from pytesseract import TesseractError
from shapely.geometry import Polygon
from datasets.features import ClassLabel
from sklearn.model_selection import train_test_split

In [2]:
pytesseract.tesseract_cmd = '/usr/bin/tesseract'

## Load in Data

In [5]:
import os
import requests
import zipfile
from pathlib import Path
import gdown

# Google Drive folder URL
folder_url = "https://drive.google.com/drive/folders/12W7B73S5dwu9HEEsS0iJ6wu9maHXYLdg?usp=sharing"

# Create a directory for the project if it doesn't exist
project_dir = Path.cwd() / "project_data"
project_dir.mkdir(exist_ok=True)

# Extract folder ID from the URL
folder_id = folder_url.split('/')[-1]

# Download the folder using gdown
print("Downloading project folder...")
gdown.download_folder(url=folder_url,
                     output=str(project_dir),
                     quiet=False,
                     use_cookies=False)

# Change working directory to the project directory
os.chdir(project_dir)
print(f"\nWorking directory set to: {os.getcwd()}")

# List downloaded contents
print("\nDownloaded contents:")
for item in os.listdir():
    print(f"- {item}")

Downloading project folder...


Retrieving folder contents


Processing file 1zOTT3tTAujWrRfF_bXQU1bCnS1zv0LkP invoices_labeling.json
Processing file 1BwmIXDrJWH7wK-hNhqHTkbqHhZYkkGPV img
Retrieving folder 1yH5Uqfq-w3tfg52u9VB-awvMwLtVc7gl layoutlmv3_hocr_output
Processing file 1-4BQT-3sHj2gB-JXkF1CHFp1gPFsqlFO invoice_1.hocr
Processing file 1-MgK3QgPBG1ap-F_cxnNdj0X0YytDUxU invoice_3.hocr
Processing file 1-SeqcSb8Z4SBZ2Ce99u2Xlu8Evba-Yeq invoice_4.hocr
Processing file 1-Wn69Hg6CfO9W1s7SErFG0F7ssV2gi81 invoice_5.hocr
Processing file 1-XRJIkFpwHc-8NZ0wwOuHlTbR3EC43a- invoice_6.hocr
Processing file 1-__3g5ufwTcS6VsYs2oGRK9SJdNkz4yo invoice_7.hocr
Processing file 1-f1licNyPKyASvwR2DhZYbMw-YTaOXt1 invoice_8.hocr
Processing file 1-hKAJiAkUeTMMd-ZYuBMosLVDjGrQLYY invoice_9.hocr
Processing file 1-kGFDAoeC7RFPqYrM0KuQzyA13uT21PF invoice_10.hocr
Processing file 1-lXn-b7RPqrAZxU5CW7mtyyuPxV7EM-2 invoice_11.hocr
Processing file 1-xsINNMyk3i0iVmDKDmHiyfCYX6XZ_st invoice_12.hocr
Processing file 101W_B-5Se50N_cbm_E8c4fWw0CDsYUNY invoice_13.hocr
Processing fil

Retrieving folder contents completed
Building directory structure
Building directory structure completed


FileURLRetrievalError: Failed to retrieve file url:

	Only the owner and editors can download this file. If you'd like to
	download it, please contact the owner.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1zOTT3tTAujWrRfF_bXQU1bCnS1zv0LkP

but Gdown can't. Please check connections and permissions.

In [6]:
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
#labeled_images_path = '/content/drive/MyDrive/SharedStatsProject/invoices_labeling.json'
#images_path = '/content/drive/MyDrive/SharedStatsProject/img'

current_dir = os.getcwd()
labeled_images_path = f'{current_dir}/invoices_labeling.json'
images_path = f'{current_dir}/img'

print(f"Current directory: {current_dir}")


Current directory: c:\Users\TMesa\OneDrive\IE University\Y3\S2\Statistical Learning and Prediction\Project


In [5]:
f = open(labeled_images_path)
label_studio_data = json.load(f)

## Define Custom Functinos


In [6]:
def calculate_iou(box_1, box_2):
    poly_1 = Polygon(box_1)
    poly_2 = Polygon(box_2)
    iou = poly_1.intersection(poly_2).area
    min_area = min(poly_1.area,poly_2.area)
    return iou/min_area


def hocr_to_dataframe(fp):
    doc = etree.parse(fp)
    words = []
    wordConf = []
    coords_list = []
    for path in doc.xpath('//*'):
        if 'ocrx_word' in path.values():
            coord_text = path.values()[2].split(';')[0].split(' ')[1:]
            word_coord = list(map(int, coord_text)) #x1, y1, x2, y2
            conf = [x for x in path.values() if 'x_wconf' in x][0]
            wordConf.append(int(conf.split('x_wconf ')[1]))
            words.append(path.text)
            coords_list.append(word_coord)

    dfReturn = pd.DataFrame({'word' : words,
                             'coords': coords_list,
                             'confidence' : wordConf})
    return(dfReturn)

## Process JSON & create train and test files

In [7]:
document_data = dict()
document_data['file_name'] = []
document_data['labelled_bbox']= []

for i in range(len(label_studio_data)):
    row = label_studio_data[i]
    file_name = os.path.basename(row['data']['image'])
    label_list, labels, bboxes = [], [], []

    for label_ in row['annotations'][0]['result']:
        label_value = label_['value']
        x, y, w, h = label_value['x'], label_value['y'], label_value['width'], label_value['height']
        original_w , original_h = label_['original_width'], label_['original_height']

        x1 = int((x * original_w) / 100)
        y1 = int((y * original_h) / 100)
        x2 = x1 + int(original_w*w / 100)
        y2 = y1 + int(original_h*h / 100)

        label = label_value['rectanglelabels']
        label_list.append((label, (x1,y1,x2,y2), original_h, original_w))

    document_data['file_name'].append(file_name)
    document_data['labelled_bbox'].append(label_list)

custom_dataset = pd.DataFrame(document_data)
print(custom_dataset)

                     file_name  \
0       51640635-invoice_0.jpg   
1       f27e6c2a-invoice_1.jpg   
2       e0e5f116-invoice_2.jpg   
3       ece38d84-invoice_3.jpg   
4       6d553669-invoice_4.jpg   
..                         ...   
121  2f3434d3-invoice_1261.jpg   
122   7ed3cff9-invoice_128.jpg   
123   9e2e6044-invoice_129.jpg   
124   c0963b08-invoice_130.jpg   
125   719e48d2-invoice_131.jpg   

                                         labelled_bbox  
0    [([Invoice number], (125, 64, 541, 109), 2339,...  
1    [([Invoice number], (112, 59, 553, 118), 2339,...  
2    [([Invoice number], (125, 62, 540, 114), 2339,...  
3    [([Invoice number], (118, 65, 552, 127), 2339,...  
4    [([Invoice number], (112, 56, 556, 128), 2339,...  
..                                                 ...  
121  [([Invoice number], (27, 64, 156, 87), 820, 63...  
122  [([Invoice number], (28, 195, 223, 215), 904, ...  
123  [([Invoice number], (369, 85, 534, 100), 800, ...  
124  [([Invoice numbe

In [8]:
#defined label to ID pipeline
label2id = {"Invoice number": 0, "Invoice date": 1, "Due date": 2,
            "Issuer name": 3, "Recipient name": 4, "Total amount": 5}

In [9]:
#write our class_list.txt file for later use
shared_folder = current_dir
labels_sorted_by_id = sorted(label2id, key=label2id.get)
labels_line = ",".join(labels_sorted_by_id)
class_list_path = os.path.join(shared_folder, "class_list.txt")
with open(class_list_path, "w") as f:
    f.write(labels_line)

In [10]:
def clean_filename(file_name):
    #clean up, since label studio outputs a random 8 digit string before the image name
    return re.sub(r'^[0-9a-fA-F]{8}-', '', file_name)
custom_dataset['file_name'] = custom_dataset['file_name'].apply(clean_filename)

In [11]:
# Set the new output directory in the shared drive folder
output_folder = "/layoutlmv3_hocr_output"
os.makedirs(output_folder, exist_ok=True)


In [12]:
pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'  # Update the path as needed

In [13]:
final_list = []
all_files = glob.glob(f'{current_dir}/img/*.jpg')

for i in tqdm(custom_dataset.iterrows(), total=custom_dataset.shape[0]):
    custom_label_text = {}
    word_list = []
    ner_tags_list  = []
    bboxes_list = []

    file_name = i[1]['file_name']
    for image in all_files:
        frame_file_name = os.path.basename(image)
        if frame_file_name == file_name:
            custom_label_text['id'] = i[0]
            image_basename = os.path.basename(image)
            custom_label_text['file_name'] = image_basename
            annotations = []
            label_coord_list = i[1]['labelled_bbox']
            for label_coord in label_coord_list:
                (x1,y1,x2,y2) = label_coord[1]
                box1 = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
                label = label_coord[0][0]
                base_name = os.path.join(output_folder, os.path.basename(image).split('.')[0])
                try:
                    pytesseract.run_tesseract(image, base_name, extension='box', lang=None, config="hocr")
                except TesseractError as e:
                    print(f"Error processing image: {image}, error: {e}")
                    # Skip processing if Tesseract fails
                    continue
                hocr_file = os.path.join(base_name+'.hocr')
                # Check if the HOCR file is empty
                if os.stat(hocr_file).st_size == 0:
                    print(f"Warning: Empty HOCR file for image: {image}")
                    continue  # Skip processing if HOCR file is empty
                hocr_df = hocr_to_dataframe(hocr_file)
                for word in hocr_df.iterrows():
                    coords = word[1]['coords']
                    (x1df,y1df,x2df,y2df) = coords
                    box2 = [[x1df, y1df], [x2df, y1df], [x2df, y2df], [x1df, y2df]]
                    words = word[1]['word']
                    overlap_perc = calculate_iou(box1,box2)
                    temp_dic = {}
                    if overlap_perc > 0.80:
                        if words != '-':
                            word_list.append(words)
                            bboxes_list.append(coords)
                            label_id = label2id[label]
                            ner_tags_list.append(label_id)

                        custom_label_text['tokens'] = word_list
                        custom_label_text['bboxes'] = bboxes_list
                        custom_label_text['ner_tags'] = ner_tags_list

    final_list.append(custom_label_text)

 56%|█████▋    | 71/126 [04:51<02:41,  2.94s/it]

Error processing image: c:\Users\TMesa\OneDrive\IE University\Y3\S2\Statistical Learning and Prediction\Project/img\invoice_75.jpg, error: (1, 'Error during processing.')
Error processing image: c:\Users\TMesa\OneDrive\IE University\Y3\S2\Statistical Learning and Prediction\Project/img\invoice_75.jpg, error: (1, 'Error during processing.')


 57%|█████▋    | 72/126 [04:51<01:59,  2.21s/it]

Error processing image: c:\Users\TMesa\OneDrive\IE University\Y3\S2\Statistical Learning and Prediction\Project/img\invoice_75.jpg, error: (1, 'Error during processing.')
Error processing image: c:\Users\TMesa\OneDrive\IE University\Y3\S2\Statistical Learning and Prediction\Project/img\invoice_75.jpg, error: (1, 'Error during processing.')


100%|██████████| 126/126 [06:40<00:00,  3.18s/it]


In [15]:
train, test = train_test_split(final_list, random_state=21, test_size=0.3)

# Define file paths
final_list_path = os.path.join(shared_folder, "final_list_text.txt")
train_path = os.path.join(shared_folder, "train.txt")
test_path = os.path.join(shared_folder, "test.txt")

# Save final_list
with open(final_list_path, 'w') as f:
    for detail in final_list:
        f.write(str(detail) + "\n")

# Save train split
with open(train_path, 'w') as f:
    for detail in train:
        f.write(str(detail) + "\n")

# Save test split
with open(test_path, 'w') as f:
    for detail in test:
        f.write(str(detail) + "\n")

## Prepartions for running the model

This code will output a file named layoutlmv3.py, a custom dataset script designed for use with Hugging Face's Datasets library.

It defines a custom dataset class that configures and loads invoice data for training, including methods for reading image files, processing OCR and annotation data, normalizing bounding boxes, and generating structured examples with tokens, bounding boxes, and labels for model training and testing.

In [16]:
import os

code_content = '''
import json
import os
import ast
from pathlib import Path
import datasets
import PIL
from PIL import Image
import pandas as pd

def load_image(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        w, h = image.size
        return image, (w, h)
    except (PIL.UnidentifiedImageError, OSError) as e:
        print(f"Warning: Skipping image {image_path} due to error: {e}")
        return None, None  # Return None to indicate skipping

def normalize_bbox(bbox, size):
    return [
        int(1000 * bbox[0] / size[0]),
        int(1000 * bbox[1] / size[1]),
        int(1000 * bbox[2] / size[0]),
        int(1000 * bbox[3] / size[1]),
    ]

_URLS = []

# Get the current directory
data_path = os.getcwd()

class DatasetConfig(datasets.BuilderConfig):
    """BuilderConfig for InvoiceExtraction Dataset"""
    def __init__(self, **kwargs):
        """BuilderConfig for InvoiceExtraction Dataset.
        Args:
          **kwargs: keyword arguments forwarded to super.
        """
        super(DatasetConfig, self).__init__(**kwargs)

class InvoiceExtraction(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIGS = [
        DatasetConfig(name="InvoiceExtraction", version=datasets.Version("1.0.0"), description="InvoiceExtraction dataset"),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names = ["Invoice number", "Invoice date", "Due date", "Issuer name", "Recipient name", "Total amount"]
                        )
                    ),
                    "image_path": datasets.Value("string"),
                    "image": datasets.features.Image()
                }
            ),
            supervised_keys=None,
            homepage="",
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        """Uses local files located with data_dir"""
        dest = data_path

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": os.path.join(dest, "train.txt"), "dest": dest}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"filepath": os.path.join(dest, "test.txt"), "dest": dest}
            ),
        ]

    def _generate_examples(self, filepath, dest):
        if not os.path.exists(filepath):
            print(f"Warning: {filepath} does not exist!")
            return

        with open(os.path.join(dest, "class_list.txt"), "r") as f:
            labels = f.read().strip().split(",")
            id2label = {i: label for i, label in enumerate(labels)}

        item_list = []
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                item_list.append(line.rstrip("\\n\\r"))

        for guid, line in enumerate(item_list):
            try:
                data = ast.literal_eval(line)
                image_path = os.path.join(dest, "img", data["file_name"])

                if not os.path.exists(image_path):
                    print(f"Warning: Image {image_path} does not exist!")
                    continue

                image, size = load_image(image_path)
                if image is None:
                    continue

                boxes = data["bboxes"]
                text = data["tokens"]
                label = data["ner_tags"]

                boxes = [normalize_bbox(box, size) for box in boxes]

                yield guid, {
                    "id": str(guid),
                    "tokens": text,
                    "bboxes": boxes,
                    "ner_tags": label,
                    "image_path": image_path,
                    "image": image
                }
            except Exception as e:
                print(f"Error processing item {guid}: {e}")
                continue
'''

# Save to current directory
with open('layoutlmv3.py', 'w') as f:
    f.write(code_content)

print(f"File saved successfully at: {os.path.join(os.getcwd(), 'layoutlmv3.py')}")

File saved successfully at: c:\Users\TMesa\OneDrive\IE University\Y3\S2\Statistical Learning and Prediction\Project\layoutlmv3.py


In [19]:
# Install the Hugging Face Transformers library from GitHub, plus datasets, seqeval, and accelerate
#!pip install -q git+https://github.com/huggingface/transformers.git
#!pip install -q datasets seqeval
#!pip install -q accelerate

In [19]:
from datasets import load_dataset
script_path = os.path.join(current_dir, "layoutlmv3.py")

# Load the dataset using the full path
dataset = load_dataset(script_path, trust_remote_code=True)
print(dataset)

  df = pd.read_csv(os.path.join(dest, 'class_list.txt'), delimiter='\s', header=None)


Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


["{'id': 67, 'file_name': 'invoice_71.jpg', 'tokens': ['BILLTO', 'John', 'Smith', 'INVOICE', 'DATE', '1110212019', 'DUE', 'DATE', '26/02/2019', 'TOTAL', '$154.06', 'INVOICE', '#', 'us-001', 'East', 'Repair', 'Inc.'], 'bboxes': [[55, 209, 98, 224], [55, 237, 81, 246], [86, 236, 117, 246], [463, 235, 509, 250], [513, 235, 543, 250], [634, 240, 696, 250], [463, 287, 485, 302], [490, 287, 520, 302], [633, 292, 696, 302], [481, 564, 522, 580], [628, 563, 685, 581], [463, 209, 509, 224], [513, 209, 521, 224], [654, 214, 694, 224], [55, 130, 81, 140], [85, 130, 124, 143], [128, 130, 148, 140]], 'ner_tags': [4, 4, 4, 1, 1, 1, 2, 2, 2, 5, 5, 0, 0, 0, 3, 3, 3]}", "{'id': 27, 'file_name': 'invoice_28.jpg', 'tokens': ['Invoice', 'no:', '89174655', 'Date', 'of', 'issue:', '0', '1/19/2016', 'Seller:', 'Hayden-Young', 'Client:', 'Guerrero', 'Group', 'Total', '$', '101,19', '$', '10,12', '$', '111,31'], 'bboxes': [[136, 77, 266, 103], [283, 83, 336, 103], [353, 77, 534, 103], [136, 146, 205, 168], [21



[[63, 658, 120, 671], [129, 657, 150, 670], [63, 682, 184, 706], [380, 657, 423, 671], [430, 657, 465, 671], [382, 682, 419, 706], [431, 683, 454, 705], [468, 682, 536, 706], [696, 657, 728, 671], [736, 657, 771, 671], [696, 682, 736, 706], [748, 683, 771, 705], [784, 682, 852, 706], [59, 386, 89, 401], [96, 387, 199, 405], [61, 437, 102, 452], [108, 437, 134, 452], [60, 467, 100, 482], [105, 468, 150, 482], [996, 657, 1036, 670], [1044, 657, 1073, 670], [1082, 657, 1132, 672], [997, 683, 1011, 705], [1024, 683, 1117, 708], [1129, 681, 1144, 708]]
[63, 658, 120, 671]
[129, 657, 150, 670]
[63, 682, 184, 706]
[380, 657, 423, 671]
[430, 657, 465, 671]
[382, 682, 419, 706]
[431, 683, 454, 705]
[468, 682, 536, 706]
[696, 657, 728, 671]
[736, 657, 771, 671]
[696, 682, 736, 706]
[748, 683, 771, 705]
[784, 682, 852, 706]
[59, 386, 89, 401]
[96, 387, 199, 405]
[61, 437, 102, 452]
[108, 437, 134, 452]
[60, 467, 100, 482]
[105, 468, 150, 482]
[996, 657, 1036, 670]
[1044, 657, 1073, 670]
[1082, 65

Generating test split: 0 examples [00:00, ? examples/s]

["{'id': 19, 'file_name': 'invoice_19.jpg', 'tokens': ['Invoice', 'no:', '61660848', 'Date', 'of', 'issue:', '05/30/2019', 'Seller:', 'Gardner,', 'Tucker', 'and', 'Martinez', 'Client:', 'Robbins', 'Group', 'Total', '$', '24,12', '$', '2,41', '$', '26,53'], 'bboxes': [[136, 77, 266, 103], [283, 83, 336, 103], [353, 77, 534, 103], [136, 146, 205, 168], [218, 145, 246, 168], [258, 145, 341, 168], [805, 145, 976, 171], [135, 447, 252, 472], [143, 510, 261, 534], [273, 510, 368, 531], [378, 510, 426, 531], [440, 510, 557, 531], [828, 447, 946, 472], [837, 510, 945, 531], [956, 510, 1038, 537], [503, 1287, 565, 1304], [963, 1287, 975, 1307], [986, 1287, 1053, 1307], [1164, 1287, 1176, 1307], [1187, 1287, 1238, 1307], [1412, 1287, 1424, 1307], [1435, 1287, 1502, 1307]], 'ner_tags': [0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5]}", "{'id': 42, 'file_name': 'invoice_43.jpg', 'tokens': ['Invoice', 'no:', '57338979', 'Date', 'of', 'issue:', '05/10/2021', 'Seller:', 'Estrada', 

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


{'id': 42, 'file_name': 'invoice_43.jpg', 'tokens': ['Invoice', 'no:', '57338979', 'Date', 'of', 'issue:', '05/10/2021', 'Seller:', 'Estrada', 'and', 'Sons', 'Client:', 'Santana-Sherman', 'Total', '$', '13', '717,81', '$1', '371,78', '$', '15', '089,59'], 'bboxes': [[136, 77, 266, 103], [283, 83, 336, 103], [354, 77, 534, 103], [136, 146, 205, 168], [218, 145, 246, 168], [258, 145, 341, 168], [805, 145, 976, 171], [135, 447, 252, 472], [144, 510, 245, 531], [258, 510, 306, 531], [319, 510, 383, 531], [828, 447, 946, 472], [836, 510, 1083, 531], [503, 1402, 565, 1419], [909, 1402, 921, 1422], [933, 1402, 960, 1419], [971, 1402, 1053, 1422], [1110, 1402, 1145, 1422], [1156, 1402, 1239, 1422], [1358, 1402, 1370, 1422], [1382, 1403, 1409, 1419], [1419, 1402, 1502, 1422]], 'ner_tags': [0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5]}
[[136, 77, 266, 103], [283, 83, 336, 103], [354, 77, 534, 103], [136, 146, 205, 168], [218, 145, 246, 168], [258, 145, 341, 168], [805, 145, 

# Run the model

## Model set up

In [20]:
from transformers import AutoProcessor

# we'll use the Auto API here - it will load LayoutLMv3Processor behind the scenes,
# based on the checkpoint we provide from the hub
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

Using TensorFlow backend.


preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Some helper functions:

In [21]:
return_entity_level_metrics = False

features = dataset["train"].features
column_names = dataset["train"].column_names
image_column_name = "image"
text_column_name = "tokens"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"

# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
# unique labels.
def get_label_list(labels):
    unique_labels = set()
    for label in labels:
        unique_labels = unique_labels | set(label)
    label_list = list(unique_labels)
    label_list.sort()
    return label_list


def unnormalize_box(bbox, width, height):
     return [
         width * (bbox[0] / 1000),
         height * (bbox[1] / 1000),
         width * (bbox[2] / 1000),
         height * (bbox[3] / 1000),
     ]


# Tokenize the inputs
def prepare_examples(examples):
    images = examples[image_column_name]
    words = examples[text_column_name]
    boxes = examples[boxes_column_name]
    word_labels = examples[label_column_name]

    encoding = processor(images, words, boxes=boxes, word_labels=word_labels,
                       truncation=True, padding="max_length")

    return encoding


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }


if isinstance(features[label_column_name].feature, ClassLabel):
    label_list = features[label_column_name].feature.names
    # No need to convert the labels since they are already ints.
    id2label = {k: v for k,v in enumerate(label_list)}
    label2id = {v: k for k,v in enumerate(label_list)}
else:
    label_list = get_label_list(dataset["train"][label_column_name])
    id2label = {k: v for k,v in enumerate(label_list)}
    label2id = {v: k for k,v in enumerate(label_list)}
num_labels = len(label_list)

Tokenize train and test datasets

In [22]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

# we need to define custom features for `set_format` (used later on) to work properly
features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(feature=Value(dtype='int64')),
})

train_dataset = dataset["train"].map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)
eval_dataset = dataset["test"].map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Load seqeval metric

In [23]:
from evaluate import load

metric = load("seqeval")


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [24]:
from transformers import LayoutLMv3ForTokenClassification, TrainingArguments, Trainer

# Load the pre-trained model with your label mappings
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    id2label=id2label,
    label2id=label2id
)




model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
import transformers
print("Transformers version:", transformers.__version__)
print("Transformers location:", transformers.__file__)


Transformers version: 4.52.0.dev0
Transformers location: c:\Users\TMesa\miniforge3\Lib\site-packages\transformers\__init__.py


In [32]:
from transformers import TrainingArguments, Trainer

train_limit = 1000
save_interval = train_limit/5

#come back to this
training_args = TrainingArguments(
    output_dir="/checkpoints", #output directory where the best model will be saved
    num_train_epochs=5,
    max_steps=train_limit,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    save_steps=save_interval,
    logging_steps=save_interval,
)


## Train the model

In [33]:
from transformers.data.data_collator import default_data_collator

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [31]:
trainer.train()

Step,Training Loss
100,0.1848
200,0.0285
300,0.0109
400,0.0066
500,0.0025
600,0.001
700,0.0005
800,0.0007
900,0.0004
1000,0.0004




TrainOutput(global_step=1000, training_loss=0.023626066129654646, metrics={'train_runtime': 3874.9677, 'train_samples_per_second': 1.032, 'train_steps_per_second': 0.258, 'total_flos': 1018825521039360.0, 'train_loss': 0.023626066129654646, 'epoch': 45.45454545454545})

Script automatically logs metrics to wandb

API key to run: e9f74b7d71ba2474bb183a627a10a5a4a511bd09

## Evaluate the model

In [34]:
trainer.evaluate()





{'eval_loss': 0.28674784302711487,
 'eval_model_preparation_time': 0.003,
 'eval_precision': 0.9314285714285714,
 'eval_recall': 0.9532163742690059,
 'eval_f1': 0.9421965317919074,
 'eval_accuracy': 0.9585185185185185,
 'eval_runtime': 6.6372,
 'eval_samples_per_second': 5.725,
 'eval_steps_per_second': 1.507}

# Use the model (inference)

Steps:

1) Load the Processor and Fine-Tuned Model.

2) Prepare the Input and Run the Model

3) Output the image's key features

NOTE: This pipeline can be easily adapted to incorporate the model classifier as well.

In [None]:
#!pip install --upgrade --force-reinstall transformers==4.29.2

In [None]:
#!rm -rf ~/.cache/huggingface/transformers

In [56]:
# Load an example invoice image
image_path = f"{images_path}/invoice_100.jpg"
image = Image.open(image_path).convert("RGB")

In [57]:
def find_latest_checkpoint():
    checkpoint_dir = "/checkpoints"
    if not os.path.exists(checkpoint_dir):
        print(f"No checkpoints directory found at {checkpoint_dir}")
        return None

    checkpoints = []
    for item in os.listdir(checkpoint_dir):
        # Look for directories matching the pattern "checkpoint-<number>"
        match = re.match(r"checkpoint-(\d+)", item)
        if match and os.path.isdir(os.path.join(checkpoint_dir, item)):
            number = int(match.group(1))
            checkpoints.append(number)

    if not checkpoints:
        print("No checkpoints found")
        return None

    latest_number = max(checkpoints)
    latest_checkpoint = f"checkpoint-{latest_number}"
    print(f"Latest checkpoint: {latest_checkpoint}")
    return os.path.join(checkpoint_dir, latest_checkpoint)

# Find and print the latest checkpoint
latest_checkpoint_path = find_latest_checkpoint()

Latest checkpoint: checkpoint-1000


In [58]:
from transformers import AutoModelForTokenClassification, LayoutLMv3Processor, LayoutLMv3Tokenizer
import torch

model = AutoModelForTokenClassification.from_pretrained(f"{latest_checkpoint_path}") #file with the trained weights

In [59]:
#Run tesseract to generate an HOCR file
output_folder = "/layoutlmv3_hocr_output"
os.makedirs(output_folder, exist_ok=True)

image_basename = os.path.basename(image_path).split('.')[0]
base_name = os.path.join(output_folder, image_basename)

# Run Tesseract to generate a HOCR file
pytesseract.run_tesseract(image_path, base_name, extension='box', lang=None, config="hocr")
hocr_file = base_name + '.hocr'

In [60]:
hocr_df = hocr_to_dataframe(hocr_file)
words = hocr_df['word'].tolist()   # List of tokens
boxes = hocr_df['coords'].tolist()   # List of bounding boxes


In [61]:
print(words)

['ie)', 'BRAND', 'NAME', 'Latika', 'Sudiati', '(062127', '85090008', 'WLLOS#', 'ADIOANI', 'C', '>', 'a', '-', '.', "'", 'sonra', '108', 'a', 'r', 'Lorem', 'ipsum.', '$150', 'o', '$180', 'o', 'seer;', 'oes', 'is', 'aga', 'N', 'TOTAL', '$990', '°', 'N', 'ul', 'Tipu', 'RAHMATNAIMUDIN', 'Payment', 'Method', 'Term', 'and', 'Condition', 'Contact', 'Account', 'US:', '01255250', '‘adipiscing', 'na', 'elit', 'sed', 'diam', 'nonummy', 'ANC', 'Name:', 'Lorem', 'Ipsum', 'brandname@maitcom']


In [62]:
print(boxes)

[[115, 48, 179, 112], [195, 73, 256, 84], [262, 73, 313, 84], [120, 200, 172, 213], [178, 200, 238, 213], [120, 226, 137, 233], [143, 226, 197, 233], [616, 223, 636, 338], [616, 50, 636, 185], [599, 2, 654, 35], [47, 457, 67, 478], [268, 468, 325, 475], [435, 469, 452, 476], [526, 468, 534, 474], [47, 483, 67, 522], [268, 501, 325, 508], [429, 501, 452, 509], [526, 500, 534, 507], [47, 527, 67, 553], [272, 534, 283, 538], [289, 533, 325, 540], [432, 532, 452, 539], [526, 532, 531, 538], [595, 532, 614, 539], [47, 566, 67, 583], [473, 564, 529, 571], [593, 563, 616, 572], [43, 566, 67, 607], [472, 590, 521, 598], [47, 626, 67, 642], [472, 622, 513, 631], [588, 617, 616, 638], [47, 646, 67, 664], [47, 668, 67, 684], [47, 688, 67, 704], [487, 708, 619, 748], [527, 754, 619, 761], [54, 835, 114, 847], [118, 834, 170, 844], [236, 835, 270, 844], [274, 834, 299, 844], [303, 833, 370, 844], [511, 835, 564, 844], [55, 885, 67, 891], [100, 888, 104, 891], [111, 885, 153, 891], [236, 880, 310, 8

In [63]:
dummy_word_labels = [0] * len(words)
print(dummy_word_labels)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [64]:
import torch
from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3Processor

# Label mapping (same as used in training)
id2label = {
    0: "Invoice number",
    1: "Invoice date",
    2: "Due date",
    3: "Issuer name",
    4: "Recipient name",
    5: "Total amount"
}

model_path = latest_checkpoint_path
model = LayoutLMv3ForTokenClassification.from_pretrained(model_path)
processor = LayoutLMv3Processor.from_pretrained(model_path)

width, height = image.size  # Get image width and height
normalized_boxes = [[int(1000 * (x / width)), int(1000 * (y / height)),
                     int(1000 * (x2 / width)), int(1000 * (y2 / height))]
                    for x, y, x2, y2 in boxes]  # Normalize boxes

# Prepare the inputs for the model
encoding = processor(image, words, boxes=normalized_boxes, return_tensors="pt")  # Use normalized_boxes

# Run inference
model.eval()
with torch.no_grad():
    outputs = model(**encoding)

# Get predicted label IDs for each token and map them to label names
logits = outputs.logits  # shape: (batch_size, sequence_length, num_labels)
predicted_ids = torch.argmax(logits, dim=-1).squeeze().tolist()

# Ensure predicted_ids is a list even for a single instance
if isinstance(predicted_ids, int):
    predicted_ids = [predicted_ids]

predicted_labels = [id2label.get(pred, "O") for pred in predicted_ids]

# Aggregate tokens by their predicted label
aggregated_fields = {}
for token, label in zip(words, predicted_labels):
    if label != "O":  # Skip tokens with no assigned field
        if label in aggregated_fields:
            aggregated_fields[label] += " " + token
        else:
            aggregated_fields[label] = token

# Output the aggregated result for each invoice field
for field in id2label.values():
    value = aggregated_fields.get(field, "")
    print(f"{field}: {value}")

Invoice number: a - . ' sonra 108 a r Lorem ipsum. $150 o
Invoice date: 
Due date: 
Issuer name: BRAND NAME Latika Sudiati (062127 $180 o seer; oes ° N Payment Method Term and Condition Contact elit Lorem
Recipient name: ie) 85090008 WLLOS# ADIOANI C > is aga $990 RAHMATNAIMUDIN
Total amount: N TOTAL ul Tipu Account US: 01255250 ‘adipiscing na sed diam nonummy ANC Name: Ipsum brandname@maitcom


In [65]:
LayoutLMv3Tokenizer.added_tokens_encoder = property(lambda self: {})

In [66]:
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", use_fast=False)

In [67]:
# Load the processor with OCR enabled
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", tokenizer=tokenizer, apply_ocr=False)

In [68]:
# Get image width and height
width, height = image.size

# Normalize the bounding boxes to the range 0-1000
normalized_boxes = [[int(1000 * (x / width)), int(1000 * (y / height)),
                     int(1000 * (x2 / width)), int(1000 * (y2 / height))]
                    for x, y, x2, y2 in boxes]

# Process image
#encoding = processor([image], [words], [boxes], [dummy_word_labels], return_tensors="pt", is_split_into_words=True)
encoding = processor(image, words, boxes=normalized_boxes, word_labels=dummy_word_labels, return_tensors="pt", is_split_into_words=True)


print(processor.tokenizer.convert_ids_to_tokens(encoding.input_ids[0]))
print(encoding.bbox[0])

# Run inference
with torch.no_grad():
    outputs = model(**encoding)

logits = outputs.logits
predictions = logits.argmax(-1).squeeze().tolist()

# Convert token IDs to readable tokens
tokens = processor.tokenizer.convert_ids_to_tokens(encoding.input_ids[0])

# Label mapping (same as used in training)
id2label = {
    0: "Invoice number",
    1: "Invoice date",
    2: "Due date",
    3: "Issuer name",
    4: "Recipient name",
    5: "Total amount"
}

labels = [id2label[pred] for pred in predictions]

# Group tokens by entity type
extracted_info = {}
for token, label in zip(tokens, labels):
    if label != "O":
        extracted_info.setdefault(label, []).append(token)

['<s>', 'Ġie', ')', 'ĠBR', 'AND', 'ĠNAME', 'ĠLat', 'ika', 'ĠSud', 'iat', 'i', 'Ġ(', '06', '2', '127', 'Ġ8', '509', '000', '8', 'ĠW', 'LL', 'OS', '#', 'ĠAD', 'IO', 'ANI', 'ĠC', 'Ġ>', 'Ġa', 'Ġ-', 'Ġ.', "Ġ'", 'Ġson', 'ra', 'Ġ108', 'Ġa', 'Ġr', 'ĠLore', 'm', 'Ġ', 'ips', 'um', '.', 'Ġ$', '150', 'Ġo', 'Ġ$', '180', 'Ġo', 'Ġse', 'er', ';', 'Ġo', 'es', 'Ġis', 'Ġag', 'a', 'ĠN', 'ĠTOTAL', 'Ġ$', '990', 'ĠÂ°', 'ĠN', 'Ġul', 'ĠTip', 'u', 'ĠRA', 'HM', 'AT', 'NA', 'IM', 'UD', 'IN', 'ĠPayment', 'ĠMethod', 'ĠTerm', 'Ġand', 'ĠCondition', 'ĠContact', 'ĠAccount', 'ĠUS', ':', 'Ġ01', '25', '52', '50', 'ĠâĢ', 'ĺ', 'ad', 'ip', 'is', 'cing', 'Ġna', 'Ġel', 'it', 'Ġsed', 'Ġdiam', 'Ġnon', 'ummy', 'ĠANC', 'ĠName', ':', 'ĠLore', 'm', 'ĠIps', 'um', 'Ġbrand', 'name', '@', 'ma', 'it', 'com', '</s>']
tensor([[  0,   0,   0,   0],
        [169,  50, 264, 118],
        [169,  50, 264, 118],
        [287,  77, 377,  88],
        [287,  77, 377,  88],
        [386,  77, 461,  88],
        [176, 210, 253, 224],
        [176, 2

In [69]:
# Display result
print("Structured Information:")
for key, token_list in extracted_info.items():
    clean_tokens = [token.replace("Ġ", "") for token in token_list if token not in ["<s>", "</s>"]]
    print(f"{key}: {' '.join(clean_tokens)}")

Structured Information:
Recipient name: Lat ika Sud iat i > a ' r Â° âĢ ĺ ad ip is cing na el it
Issuer name: ie ) BR AND NAME AD IO ANI C son ra Lore m  ips um . o is N N ul Term and Condition Contact sed diam non ummy ANC Name : Lore m Ips um brand name @ ma it com
Invoice number: ( 06 2 127 8 509 000 8 W LL OS # Payment Method Account US : 01 25 52 50
Total amount: - . 108 a $ 150 o $ 180 se er ; o es ag a TOTAL $ 990 Tip u RA HM AT NA IM UD IN


In [70]:
import re
import datetime
import pandas as pd

# Assuming 'extracted_info' contains the structured information from LayoutLMv3

# 1. Post-processing with Regular Expressions and Data Structuring
structured_data = {}
for key, token_list in extracted_info.items():
    clean_tokens = [token.replace("Ġ", "") for token in token_list if token not in ["<s>", "</s>"]]

    # Apply field-specific cleaning (e.g., for invoice number, date)
    if key == "Invoice number":
        structured_data[key] = re.sub(r"[^0-9]", "", " ".join(clean_tokens))
    elif key == "Invoice date":
        date_match = re.search(r"(\d{2})/(\d{2})/(\d{4})", " ".join(clean_tokens))
        if date_match:
            structured_data[key] = "/".join(date_match.groups())
    # ... Add similar logic for other fields
    else:
        structured_data[key] = " ".join(clean_tokens)


# 2. Data Validation and Type Conversion
try:
    structured_data["Invoice date"] = datetime.datetime.strptime(structured_data["Invoice date"], "%d/%m/%Y").date()
except (ValueError, KeyError):
    print("Invalid or missing Invoice Date format")

try:
    structured_data["Total amount"] = float(re.sub(r"[^0-9.]", "", structured_data["Total amount"]))
except (ValueError, KeyError):
    print("Invalid or missing Total Amount format")
# ... (Apply similar logic for other fields)


# 3. Create Pandas DataFrame
df = pd.DataFrame([structured_data])

# 4. Custom Logic and Domain Knowledge
# ... (Add custom logic as needed)

# Now you have 'df' containing the structured and cleaned data
display(df)


Invalid or missing Invoice Date format


Unnamed: 0,Recipient name,Issuer name,Invoice number,Total amount
0,Lat ika Sud iat i > a ' r Â° âĢ ĺ ad ip is cin...,ie ) BR AND NAME AD IO ANI C son ra Lore m ip...,0621278509000801255250,0.10815
