Background: Read this [article](https://medium.com/@shivarama/layoutlmv3-from-zero-to-hero-part-1-85d05818eec4) and the [HuggingFace documentation](https://huggingface.co/docs/transformers/en/model_doc/layoutlmv3) to learn about our Layoutlmv3 model architecture

Here is the link to the shared project folder: https://drive.google.com/drive/folders/12W7B73S5dwu9HEEsS0iJ6wu9maHXYLdg

Please add the link as a folder in /content/drive/MyDrive/

# Setup

## Imports

In [1]:
# Create requirements.txt file
with open('requirements.txt', 'w') as f:
    f.write('''pandas
numpy
matplotlib
scikit-learn
opencv-python
pytesseract
lxml
shapely
Pillow
pdf2image
tqdm
seaborn
jupyter
ipykernel
evaluate
git+https://github.com/huggingface/transformers.git
datasets
seqeval
accelerate
''')

# Install dependencies using pip
#%pip install -q -r requirements.txt

In [2]:
#!pip install -q pandas numpy tqdm shapely pillow pytesseract lxml scikit-learn datasets evaluate

In [3]:
#!sudo apt-get update
#!sudo apt-get install tesseract-ocr

In [4]:
import os
import json
import glob
import ast
import re
import pandas as pd
import numpy as np
import datasets
from tqdm import tqdm
from lxml import etree
from PIL import Image
from pathlib import Path
from pytesseract import pytesseract
from pytesseract import TesseractError
from shapely.geometry import Polygon
from datasets.features import ClassLabel
from sklearn.model_selection import train_test_split

In [5]:
pytesseract.tesseract_cmd = '/usr/bin/tesseract'

## Load in Data

In [6]:
#from google.colab import drive
#drive.mount('/content/drive')

In [7]:
#labeled_images_path = '/content/drive/MyDrive/SharedStatsProject/invoices_labeling.json'
#images_path = '/content/drive/MyDrive/SharedStatsProject/img'

current_dir = os.getcwd()
labeled_images_path = f'{current_dir}/invoices_labeling_50.json'
images_path = f'{current_dir}/img_identical'

print(f"Current directory: {current_dir}")


Current directory: c:\Users\TMesa\OneDrive\IE University\Y3\S2\Statistical Learning and Prediction\Project


In [8]:
f = open(labeled_images_path)
label_studio_data = json.load(f)

## Define Custom Functinos


In [9]:
def calculate_iou(box_1, box_2):
    poly_1 = Polygon(box_1)
    poly_2 = Polygon(box_2)
    iou = poly_1.intersection(poly_2).area
    min_area = min(poly_1.area,poly_2.area)
    return iou/min_area


def hocr_to_dataframe(fp):
    doc = etree.parse(fp)
    words = []
    wordConf = []
    coords_list = []
    for path in doc.xpath('//*'):
        if 'ocrx_word' in path.values():
            coord_text = path.values()[2].split(';')[0].split(' ')[1:]
            word_coord = list(map(int, coord_text)) #x1, y1, x2, y2
            conf = [x for x in path.values() if 'x_wconf' in x][0]
            wordConf.append(int(conf.split('x_wconf ')[1]))
            words.append(path.text)
            coords_list.append(word_coord)

    dfReturn = pd.DataFrame({'word' : words,
                             'coords': coords_list,
                             'confidence' : wordConf})
    return(dfReturn)

## Process JSON & create train and test files

In [10]:
document_data = dict()
document_data['file_name'] = []
document_data['labelled_bbox']= []

for i in range(len(label_studio_data)):
    row = label_studio_data[i]
    file_name = os.path.basename(row['data']['image'])
    label_list, labels, bboxes = [], [], []

    for label_ in row['annotations'][0]['result']:
        label_value = label_['value']
        x, y, w, h = label_value['x'], label_value['y'], label_value['width'], label_value['height']
        original_w , original_h = label_['original_width'], label_['original_height']

        x1 = int((x * original_w) / 100)
        y1 = int((y * original_h) / 100)
        x2 = x1 + int(original_w*w / 100)
        y2 = y1 + int(original_h*h / 100)

        label = label_value['rectanglelabels']
        label_list.append((label, (x1,y1,x2,y2), original_h, original_w))

    document_data['file_name'].append(file_name)
    document_data['labelled_bbox'].append(label_list)

custom_dataset = pd.DataFrame(document_data)
print(custom_dataset)

                  file_name                                      labelled_bbox
0    51640635-invoice_0.jpg  [([Invoice number], (125, 64, 541, 109), 2339,...
1    f27e6c2a-invoice_1.jpg  [([Invoice number], (112, 59, 553, 118), 2339,...
2    e0e5f116-invoice_2.jpg  [([Invoice number], (125, 62, 540, 114), 2339,...
3    ece38d84-invoice_3.jpg  [([Invoice number], (118, 65, 552, 127), 2339,...
4    6d553669-invoice_4.jpg  [([Invoice number], (112, 56, 556, 128), 2339,...
5    ef20bc56-invoice_5.jpg  [([Invoice number], (106, 61, 548, 118), 2339,...
6    d7447248-invoice_6.jpg  [([Invoice number], (106, 53, 561, 119), 2339,...
7    069900fa-invoice_7.jpg  [([Invoice number], (114, 61, 556, 109), 2339,...
8    6d5132bd-invoice_8.jpg  [([Invoice number], (116, 55, 551, 122), 2339,...
9    938b1af4-invoice_9.jpg  [([Invoice number], (111, 46, 555, 125), 2339,...
10  6784b3c1-invoice_10.jpg  [([Invoice number], (119, 64, 551, 113), 2339,...
11  40607ffe-invoice_11.jpg  [([Invoice number], (12

In [11]:
#defined label to ID pipeline
label2id = {"Invoice number": 0, "Invoice date": 1, "Due date": 2,
            "Issuer name": 3, "Recipient name": 4, "Total amount": 5}

In [12]:
#write our class_list.txt file for later use
shared_folder = current_dir
labels_sorted_by_id = sorted(label2id, key=label2id.get)
labels_line = ",".join(labels_sorted_by_id)
class_list_path = os.path.join(shared_folder, "class_list.txt")
with open(class_list_path, "w") as f:
    f.write(labels_line)

In [13]:
def clean_filename(file_name):
    #clean up, since label studio outputs a random 8 digit string before the image name
    return re.sub(r'^[0-9a-fA-F]{8}-', '', file_name)
custom_dataset['file_name'] = custom_dataset['file_name'].apply(clean_filename)

In [14]:
# Set the new output directory in the shared drive folder
output_folder = "/layoutlmv3_hocr_output"
os.makedirs(output_folder, exist_ok=True)


In [15]:
pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'  # Update the path as needed

In [17]:
final_list = []
all_files = glob.glob(f'{images_path}/*.jpg')

for i in tqdm(custom_dataset.iterrows(), total=custom_dataset.shape[0]):
    custom_label_text = {}
    word_list = []
    ner_tags_list  = []
    bboxes_list = []

    file_name = i[1]['file_name']
    for image in all_files:
        frame_file_name = os.path.basename(image)
        if frame_file_name == file_name:
            custom_label_text['id'] = i[0]
            image_basename = os.path.basename(image)
            custom_label_text['file_name'] = image_basename
            annotations = []
            label_coord_list = i[1]['labelled_bbox']
            for label_coord in label_coord_list:
                (x1,y1,x2,y2) = label_coord[1]
                box1 = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
                label = label_coord[0][0]
                base_name = os.path.join(output_folder, os.path.basename(image).split('.')[0])
                try:
                    pytesseract.run_tesseract(image, base_name, extension='box', lang=None, config="hocr")
                except TesseractError as e:
                    print(f"Error processing image: {image}, error: {e}")
                    # Skip processing if Tesseract fails
                    continue
                hocr_file = os.path.join(base_name+'.hocr')
                # Check if the HOCR file is empty
                if os.stat(hocr_file).st_size == 0:
                    print(f"Warning: Empty HOCR file for image: {image}")
                    continue  # Skip processing if HOCR file is empty
                hocr_df = hocr_to_dataframe(hocr_file)
                for word in hocr_df.iterrows():
                    coords = word[1]['coords']
                    (x1df,y1df,x2df,y2df) = coords
                    box2 = [[x1df, y1df], [x2df, y1df], [x2df, y2df], [x1df, y2df]]
                    words = word[1]['word']
                    overlap_perc = calculate_iou(box1,box2)
                    temp_dic = {}
                    if overlap_perc > 0.80:
                        if words != '-':
                            word_list.append(words)
                            bboxes_list.append(coords)
                            label_id = label2id[label]
                            ner_tags_list.append(label_id)

                        custom_label_text['tokens'] = word_list
                        custom_label_text['bboxes'] = bboxes_list
                        custom_label_text['ner_tags'] = ner_tags_list

    final_list.append(custom_label_text)

100%|██████████| 51/51 [03:42<00:00,  4.37s/it]


In [18]:
train, test = train_test_split(final_list, random_state=21, test_size=0.3)

# Define file paths
final_list_path = os.path.join(shared_folder, "final_list_text.txt")
train_path = os.path.join(shared_folder, "train.txt")
test_path = os.path.join(shared_folder, "test.txt")

# Save final_list
with open(final_list_path, 'w') as f:
    for detail in final_list:
        f.write(str(detail) + "\n")

# Save train split
with open(train_path, 'w') as f:
    for detail in train:
        f.write(str(detail) + "\n")

# Save test split
with open(test_path, 'w') as f:
    for detail in test:
        f.write(str(detail) + "\n")

## Prepartions for running the model

This code will output a file named layoutlmv3.py, a custom dataset script designed for use with Hugging Face's Datasets library.

It defines a custom dataset class that configures and loads invoice data for training, including methods for reading image files, processing OCR and annotation data, normalizing bounding boxes, and generating structured examples with tokens, bounding boxes, and labels for model training and testing.

In [19]:
import os

code_content = '''
import json
import os
import ast
from pathlib import Path
import datasets
import PIL
from PIL import Image
import pandas as pd

def load_image(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        w, h = image.size
        return image, (w, h)
    except (PIL.UnidentifiedImageError, OSError) as e:
        print(f"Warning: Skipping image {image_path} due to error: {e}")
        return None, None  # Return None to indicate skipping

def normalize_bbox(bbox, size):
    return [
        int(1000 * bbox[0] / size[0]),
        int(1000 * bbox[1] / size[1]),
        int(1000 * bbox[2] / size[0]),
        int(1000 * bbox[3] / size[1]),
    ]

_URLS = []

# Get the current directory
data_path = os.getcwd()

class DatasetConfig(datasets.BuilderConfig):
    """BuilderConfig for InvoiceExtraction Dataset"""
    def __init__(self, **kwargs):
        """BuilderConfig for InvoiceExtraction Dataset.
        Args:
          **kwargs: keyword arguments forwarded to super.
        """
        super(DatasetConfig, self).__init__(**kwargs)

class InvoiceExtraction(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIGS = [
        DatasetConfig(name="InvoiceExtraction", version=datasets.Version("1.0.0"), description="InvoiceExtraction dataset"),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names = ["Invoice number", "Invoice date", "Due date", "Issuer name", "Recipient name", "Total amount"]
                        )
                    ),
                    "image_path": datasets.Value("string"),
                    "image": datasets.features.Image()
                }
            ),
            supervised_keys=None,
            homepage="",
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        """Uses local files located with data_dir"""
        dest = data_path

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": os.path.join(dest, "train.txt"), "dest": dest}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"filepath": os.path.join(dest, "test.txt"), "dest": dest}
            ),
        ]

    def _generate_examples(self, filepath, dest):
        if not os.path.exists(filepath):
            print(f"Warning: {filepath} does not exist!")
            return

        with open(os.path.join(dest, "class_list.txt"), "r") as f:
            labels = f.read().strip().split(",")
            id2label = {i: label for i, label in enumerate(labels)}

        item_list = []
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                item_list.append(line.rstrip("\\n\\r"))

        for guid, line in enumerate(item_list):
            try:
                data = ast.literal_eval(line)
                image_path = os.path.join(dest, "img", data["file_name"])

                if not os.path.exists(image_path):
                    print(f"Warning: Image {image_path} does not exist!")
                    continue

                image, size = load_image(image_path)
                if image is None:
                    continue

                boxes = data["bboxes"]
                text = data["tokens"]
                label = data["ner_tags"]

                boxes = [normalize_bbox(box, size) for box in boxes]

                yield guid, {
                    "id": str(guid),
                    "tokens": text,
                    "bboxes": boxes,
                    "ner_tags": label,
                    "image_path": image_path,
                    "image": image
                }
            except Exception as e:
                print(f"Error processing item {guid}: {e}")
                continue
'''

# Save to current directory
#with open('layoutlmv3.py', 'w') as f:
#    f.write(code_content)
#
#print(f"File saved successfully at: {os.path.join(os.getcwd(), 'layoutlmv3.py')}")

File saved successfully at: c:\Users\TMesa\OneDrive\IE University\Y3\S2\Statistical Learning and Prediction\Project\layoutlmv3.py


In [20]:
# Install the Hugging Face Transformers library from GitHub, plus datasets, seqeval, and accelerate
#!pip install -q git+https://github.com/huggingface/transformers.git
#!pip install -q datasets seqeval
#!pip install -q accelerate

In [25]:
from datasets import load_dataset
script_path = os.path.join(current_dir, "layoutlmv3.py")

# Load the dataset using the full path
dataset = load_dataset(script_path, trust_remote_code=True)
print(dataset)

  df = pd.read_csv(os.path.join(dest, 'class_list.txt'), delimiter='\s', header=None)


Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


["{'id': 0, 'file_name': 'invoice_0.jpg', 'tokens': ['Invoice', 'no:', '40378170', 'Date', 'of', 'issue:', '10/15/2012', 'Client:', 'Jackson,', 'Odonnell', 'and', 'Jackson', 'Seller:', 'Patel,', 'Thompson', 'and', 'Montgomery', 'Total', '$', '7,50', '$', '0,75', '$', '8,25'], 'bboxes': [[136, 77, 266, 103], [283, 83, 336, 103], [352, 77, 535, 103], [136, 146, 205, 168], [218, 145, 246, 168], [258, 145, 341, 168], [806, 145, 975, 171], [828, 447, 946, 472], [833, 510, 946, 537], [959, 510, 1079, 531], [1093, 510, 1141, 531], [1151, 510, 1255, 537], [135, 447, 252, 472], [144, 510, 217, 534], [229, 510, 372, 537], [385, 510, 433, 531], [447, 511, 619, 537], [503, 1287, 565, 1304], [979, 1287, 991, 1307], [1002, 1287, 1054, 1307], [1164, 1287, 1176, 1307], [1186, 1287, 1238, 1307], [1427, 1287, 1440, 1307], [1450, 1287, 1502, 1307]], 'ner_tags': [0, 0, 0, 1, 1, 1, 1, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5]}", "{'id': 13, 'file_name': 'invoice_13.jpg', 'tokens': ['Invoice', 'no:

Generating test split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


["{'id': 7, 'file_name': 'invoice_7.jpg', 'tokens': ['Invoice', 'no:', '54212825', 'Date', 'of', 'issue:', '05/01/2015', 'Seller:', 'Hutchinson', 'PLC', 'Client:', 'Chapman-Long', 'Total', '$', '501,40', '$', '50,14', '$', '551,54'], 'bboxes': [[136, 77, 266, 103], [283, 83, 336, 103], [354, 77, 534, 103], [136, 146, 205, 168], [218, 145, 246, 168], [258, 145, 341, 168], [805, 145, 976, 171], [135, 447, 252, 472], [144, 510, 296, 531], [310, 510, 357, 531], [828, 447, 946, 472], [836, 510, 1043, 537], [503, 1976, 565, 1993], [948, 1976, 960, 1996], [971, 1976, 1054, 1996], [1148, 1976, 1160, 1996], [1171, 1976, 1239, 1996], [1396, 1976, 1409, 1996], [1420, 1977, 1503, 1996]], 'ner_tags': [0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5]}", "{'id': 45, 'file_name': 'invoice_46.jpg', 'tokens': ['Invoice', 'no:', '65809258', 'Date', 'of', 'issue:', '10/03/2014', 'Seller:', 'West', 'PLC', 'Client:', 'Lyons', 'and', 'Sons', 'Total', '$', '46', '937,53', '$', '4', '693,75', '$', '51'

# Run the model

## Model set up

In [26]:
from transformers import AutoProcessor

# we'll use the Auto API here - it will load LayoutLMv3Processor behind the scenes,
# based on the checkpoint we provide from the hub
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

Using TensorFlow backend.


Some helper functions:

In [27]:
return_entity_level_metrics = False

features = dataset["train"].features
column_names = dataset["train"].column_names
image_column_name = "image"
text_column_name = "tokens"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"

# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
# unique labels.
def get_label_list(labels):
    unique_labels = set()
    for label in labels:
        unique_labels = unique_labels | set(label)
    label_list = list(unique_labels)
    label_list.sort()
    return label_list


def unnormalize_box(bbox, width, height):
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]


# Tokenize the inputs
def prepare_examples(examples):
    images = examples[image_column_name]
    words = examples[text_column_name]
    boxes = examples[boxes_column_name]
    word_labels = examples[label_column_name]

    encoding = processor(images, words, boxes=boxes, word_labels=word_labels,
                       truncation=True, padding="max_length")

    return encoding


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }


if isinstance(features[label_column_name].feature, ClassLabel):
    label_list = features[label_column_name].feature.names
    # No need to convert the labels since they are already ints.
    id2label = {k: v for k,v in enumerate(label_list)}
    label2id = {v: k for k,v in enumerate(label_list)}
else:
    label_list = get_label_list(dataset["train"][label_column_name])
    id2label = {k: v for k,v in enumerate(label_list)}
    label2id = {v: k for k,v in enumerate(label_list)}
num_labels = len(label_list)

Tokenize train and test datasets

In [28]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

# we need to define custom features for `set_format` (used later on) to work properly
features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(feature=Value(dtype='int64')),
})

train_dataset = dataset["train"].map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)
eval_dataset = dataset["test"].map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Load seqeval metric

In [29]:
from evaluate import load

metric = load("seqeval")


In [30]:
from transformers import LayoutLMv3ForTokenClassification, TrainingArguments, Trainer

# Load the pre-trained model with your label mappings
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    id2label=id2label,
    label2id=label2id
)




Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
import transformers
print("Transformers version:", transformers.__version__)
print("Transformers location:", transformers.__file__)


Transformers version: 4.52.0.dev0
Transformers location: c:\Users\TMesa\miniforge3\Lib\site-packages\transformers\__init__.py


In [32]:
from transformers import TrainingArguments, Trainer

train_limit = 1000
save_interval = train_limit/5

#come back to this
training_args = TrainingArguments(
    output_dir=f"{current_dir}/checkpoints-IdenticalSubset", #output directory where the best model will be saved
    num_train_epochs=5,
    max_steps=train_limit,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    save_steps=save_interval,
    logging_steps=save_interval,
)


## Train the model

In [33]:
from transformers.data.data_collator import default_data_collator

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [34]:
trainer.train()



Step,Training Loss
200,0.1262
400,0.0008
600,0.0005
800,0.0004
1000,0.0003




TrainOutput(global_step=1000, training_loss=0.025656472101807593, metrics={'train_runtime': 3344.8778, 'train_samples_per_second': 1.196, 'train_steps_per_second': 0.299, 'total_flos': 1025151992580096.0, 'train_loss': 0.025656472101807593, 'epoch': 111.11111111111111})

Script automatically logs metrics to wandb

API key to run: e9f74b7d71ba2474bb183a627a10a5a4a511bd09

## Evaluate the model

In [35]:
trainer.evaluate()



  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.15323524177074432,
 'eval_precision': 0.9875,
 'eval_recall': 0.9875,
 'eval_f1': 0.9875,
 'eval_accuracy': 0.9854651162790697,
 'eval_runtime': 1.9897,
 'eval_samples_per_second': 8.042,
 'eval_steps_per_second': 2.01,
 'epoch': 111.11111111111111}

# Use the model (inference)

Steps:

1) Load the Processor and Fine-Tuned Model.

2) Prepare the Input and Run the Model

3) Output the image's key features

NOTE: This pipeline can be easily adapted to incorporate the model classifier as well.

In [36]:
#!pip install --upgrade --force-reinstall transformers==4.29.2

In [37]:
#!rm -rf ~/.cache/huggingface/transformers

In [38]:
# Load an example invoice image
image_path = f"{images_path}/invoice_25.jpg"
image = Image.open(image_path).convert("RGB")

In [39]:
def find_latest_checkpoint():
    checkpoint_dir = f"{current_dir}/checkpoints-IdenticalSubset"
    if not os.path.exists(checkpoint_dir):
        print(f"No checkpoints directory found at {checkpoint_dir}")
        return None

    checkpoints = []
    for item in os.listdir(checkpoint_dir):
        # Look for directories matching the pattern "checkpoint-<number>"
        match = re.match(r"checkpoint-(\d+)", item)
        if match and os.path.isdir(os.path.join(checkpoint_dir, item)):
            number = int(match.group(1))
            checkpoints.append(number)

    if not checkpoints:
        print("No checkpoints found")
        return None

    latest_number = max(checkpoints)
    latest_checkpoint = f"checkpoint-{latest_number}"
    print(f"Latest checkpoint: {latest_checkpoint}")
    return os.path.join(checkpoint_dir, latest_checkpoint)

# Find and print the latest checkpoint
latest_checkpoint_path = find_latest_checkpoint()

Latest checkpoint: checkpoint-1000


In [53]:
from transformers import AutoModelForTokenClassification, LayoutLMv3Processor, LayoutLMv3Tokenizer
import torch

model = AutoModelForTokenClassification.from_pretrained(latest_checkpoint_path) #file with the trained weights

In [41]:
#Run tesseract to generate an HOCR file
output_folder = "/layoutlmv3_hocr_output"
os.makedirs(output_folder, exist_ok=True)

image_basename = os.path.basename(image_path).split('.')[0]
base_name = os.path.join(output_folder, image_basename)

# Run Tesseract to generate a HOCR file
pytesseract.run_tesseract(image_path, base_name, extension='box', lang=None, config="hocr")
hocr_file = base_name + '.hocr'

In [42]:
hocr_df = hocr_to_dataframe(hocr_file)
words = hocr_df['word'].tolist()   # List of tokens
boxes = hocr_df['coords'].tolist()   # List of bounding boxes


In [43]:
print(words)

['Invoice', 'no:', '10372826', 'Date', 'of', 'issue:', '05/15/2019', 'Seller:', 'Client:', 'Graham,', 'Briggs', 'and', 'Anderson', 'Moore', 'Group', '2056', 'Martin', 'Extensions', '89000', 'Christopher', 'Brook', 'Port', 'Christopherton,', 'VA', '87978', 'Port', 'Richardmouth,', 'CA', '87376', 'Tax', 'Id:', '975-98-4861', 'Tax', 'Id:', '978-80-2701', 'IBAN:', 'GB56SDLA43798626107179', 'ITEMS', 'No.', 'Description', 'Qty', 'UM', 'Net', 'price', 'Net', 'worth', 'VAT', '[%]', 'Gross', 'worth', '1.', 'NEW', 'TOMMY', 'HILFIGER', 'RED', '5,00', 'each', '34,50', '172,50', '10%', '189,75', 'WHITE', 'BLUE', '#5', 'COLOR', 'BLOCK', 'BEACH', 'SANDALS', 'BOYS', 'KIDS', '8/9', '2.', '10"', 'Cuphead', 'Game', 'Soft', 'Plush', '1,00', 'each', '26,99', '26,99', '10%', '29,69', 'Doll', 'Cuphead', 'Mugman', 'Mecup', 'Brocup', 'kids', 'Gift', 'US', 'Stock', 'Sh', '2', 'Pair-Wonder', 'Nation', 'Childrens', '4,00', 'each', '12,99', '51,96', '10%', '57,16', 'Tucker', 'Boots', 'NEW', 'Size', '10', 'Black', 

In [44]:
print(boxes)

[[136, 77, 266, 103], [283, 83, 336, 103], [355, 77, 534, 103], [136, 146, 205, 168], [218, 145, 246, 168], [258, 145, 341, 168], [805, 145, 976, 171], [135, 443, 252, 484], [828, 447, 946, 472], [143, 510, 259, 534], [274, 510, 358, 537], [369, 510, 417, 531], [429, 510, 558, 531], [837, 511, 920, 531], [931, 510, 1013, 537], [143, 546, 211, 567], [224, 546, 308, 567], [321, 546, 468, 567], [836, 546, 921, 567], [933, 546, 1094, 573], [1105, 546, 1183, 567], [144, 583, 197, 603], [208, 582, 420, 609], [432, 583, 470, 603], [481, 582, 565, 603], [837, 583, 890, 603], [902, 582, 1102, 606], [1115, 582, 1151, 603], [1162, 582, 1247, 603], [141, 654, 191, 674], [203, 653, 233, 674], [247, 653, 421, 674], [834, 654, 884, 674], [896, 653, 926, 674], [939, 653, 1114, 674], [144, 689, 215, 709], [228, 688, 622, 709], [136, 763, 246, 789], [162, 845, 200, 873], [242, 845, 382, 875], [681, 845, 724, 875], [781, 850, 817, 866], [903, 850, 945, 866], [955, 849, 1015, 871], [1062, 850, 1103, 866],

In [45]:
dummy_word_labels = [0] * len(words)
print(dummy_word_labels)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [46]:
import torch
from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3Processor

# Label mapping (same as used in training)
id2label = {
    0: "Invoice number",
    1: "Invoice date",
    2: "Due date",
    3: "Issuer name",
    4: "Recipient name",
    5: "Total amount"
}

model_path = latest_checkpoint_path
model = LayoutLMv3ForTokenClassification.from_pretrained(model_path)
processor = LayoutLMv3Processor.from_pretrained(model_path)

width, height = image.size  # Get image width and height
normalized_boxes = [[int(1000 * (x / width)), int(1000 * (y / height)),
                     int(1000 * (x2 / width)), int(1000 * (y2 / height))]
                    for x, y, x2, y2 in boxes]  # Normalize boxes

# Prepare the inputs for the model
encoding = processor(image, words, boxes=normalized_boxes, return_tensors="pt")  # Use normalized_boxes

# Run inference
model.eval()
with torch.no_grad():
    outputs = model(**encoding)

# Get predicted label IDs for each token and map them to label names
logits = outputs.logits  # shape: (batch_size, sequence_length, num_labels)
predicted_ids = torch.argmax(logits, dim=-1).squeeze().tolist()

# Ensure predicted_ids is a list even for a single instance
if isinstance(predicted_ids, int):
    predicted_ids = [predicted_ids]

predicted_labels = [id2label.get(pred, "O") for pred in predicted_ids]

# Aggregate tokens by their predicted label
aggregated_fields = {}
for token, label in zip(words, predicted_labels):
    if label != "O":  # Skip tokens with no assigned field
        if label in aggregated_fields:
            aggregated_fields[label] += " " + token
        else:
            aggregated_fields[label] = token

# Output the aggregated result for each invoice field
for field in id2label.values():
    value = aggregated_fields.get(field, "")
    print(f"{field}: {value}")

Invoice number: no: 10372826 Date of issue: 05/15/2019 Seller: Client:
Invoice date: Graham, Briggs and Anderson Moore Group 2056 Martin Extensions
Due date: 
Issuer name: Invoice 89000 Christopher Christopherton, VA 87978 Port Richardmouth, Tax Id: 975-98-4861 Tax ITEMS No. Description Qty UM Net price Net 2. 10" Cuphead Game Soft Plush 1,00 each 26,99 26,99 10% 29,69 Doll Cuphead Mugman Brocup 51,96 10% 57,16 Tucker Boots NEW Size 10 Black Easy 251,45 $ 25,15 $ 276,60
Recipient name: Brook Port CA 87376 Id: 978-80-2701 IBAN: GB56SDLA43798626107179 worth VAT [%] Gross worth 1. NEW 172,50 BLUE #5 COLOR BLOCK BEACH SANDALS BOYS KIDS 8/9
Total amount: TOMMY HILFIGER RED 5,00 each 34,50 10% 189,75 WHITE Mecup kids Gift US Stock Sh 2 Pair-Wonder Nation Childrens 4,00 each 12,99 on/off SUMMARY VAT [%] Net worth VAT Gross worth 10% 251,45 25,15 276,60 Total $


In [47]:
LayoutLMv3Tokenizer.added_tokens_encoder = property(lambda self: {})

In [48]:
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base", use_fast=False)

In [49]:
# Load the processor with OCR enabled
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", tokenizer=tokenizer, apply_ocr=False)

In [50]:
# Get image width and height
width, height = image.size

# Normalize the bounding boxes to the range 0-1000
normalized_boxes = [[int(1000 * (x / width)), int(1000 * (y / height)),
                     int(1000 * (x2 / width)), int(1000 * (y2 / height))]
                    for x, y, x2, y2 in boxes]

# Process image
#encoding = processor([image], [words], [boxes], [dummy_word_labels], return_tensors="pt", is_split_into_words=True)
encoding = processor(image, words, boxes=normalized_boxes, word_labels=dummy_word_labels, return_tensors="pt", is_split_into_words=True)


print(processor.tokenizer.convert_ids_to_tokens(encoding.input_ids[0]))
print(encoding.bbox[0])

# Run inference
with torch.no_grad():
    outputs = model(**encoding)

logits = outputs.logits
predictions = logits.argmax(-1).squeeze().tolist()

# Convert token IDs to readable tokens
tokens = processor.tokenizer.convert_ids_to_tokens(encoding.input_ids[0])

# Label mapping (same as used in training)
id2label = {
    0: "Invoice number",
    1: "Invoice date",
    2: "Due date",
    3: "Issuer name",
    4: "Recipient name",
    5: "Total amount"
}

labels = [id2label[pred] for pred in predictions]

# Group tokens by entity type
extracted_info = {}
for token, label in zip(tokens, labels):
    if label != "O":
        extracted_info.setdefault(label, []).append(token)

['<s>', 'ĠInv', 'oice', 'Ġno', ':', 'Ġ10', '37', '28', '26', 'ĠDate', 'Ġof', 'Ġissue', ':', 'Ġ05', '/', '15', '/', '2019', 'ĠSeller', ':', 'ĠClient', ':', 'ĠGraham', ',', 'ĠBriggs', 'Ġand', 'ĠAnderson', 'ĠMoore', 'ĠGroup', 'Ġ20', '56', 'ĠMartin', 'ĠExtensions', 'Ġ89', '000', 'ĠChristopher', 'ĠBrook', 'ĠPort', 'ĠChristopher', 'ton', ',', 'ĠVA', 'Ġ8', '79', '78', 'ĠPort', 'ĠRichard', 'mouth', ',', 'ĠCA', 'Ġ87', '376', 'ĠTax', 'ĠId', ':', 'Ġ9', '75', '-', '98', '-', '48', '61', 'ĠTax', 'ĠId', ':', 'Ġ978', '-', '80', '-', '27', '01', 'ĠIB', 'AN', ':', 'ĠGB', '56', 'SD', 'LA', '437', '98', '626', '107', '179', 'ĠIT', 'EMS', 'ĠNo', '.', 'ĠDescription', 'ĠQ', 'ty', 'ĠUM', 'ĠNet', 'Ġprice', 'ĠNet', 'Ġworth', 'ĠVAT', 'Ġ[', '%]', 'ĠGross', 'Ġworth', 'Ġ1', '.', 'ĠNEW', 'ĠTOM', 'MY', 'ĠH', 'IL', 'FIG', 'ER', 'ĠRED', 'Ġ5', ',', '00', 'Ġeach', 'Ġ34', ',', '50', 'Ġ172', ',', '50', 'Ġ10', '%', 'Ġ189', ',', '75', 'ĠWHITE', 'ĠBL', 'UE', 'Ġ#', '5', 'ĠCOL', 'OR', 'ĠBL', 'OCK', 'ĠBE', 'ACH', 'ĠS', 'AND', '

In [68]:
# Display raw result
print("Structured Raw Information:")
for key, token_list in extracted_info.items():
    clean_tokens = [token.replace("Ġ", "") for token in token_list if token not in ["<s>", "</s>"]]
    print(f"{key}:{" " * (18 - len(key))}{' '.join(clean_tokens)}")

Structured Raw Information:
Issuer name:       Seller : Graham , Briggs and Anderson 20 56 Martin Extensions Port Christopher ton , VA 8 79 78 IB AN : GB 56 SD LA 437 98 626 107 179 IT EMS No Description 1 . NEW TOM MY H IL FIG ER RED WHITE BL UE # 5 COL OR BL OCK BE ACH S AND ALS BO YS K IDS 8 / 9 10 " Cup head Game Soft Pl ush Doll Cup head Mug man M ec up Bro cup kids Gift US Stock Pair - Wonder Nation Children s Tucker Boots NEW Size 10 Black Easy on / off SUM M ARY
Invoice number:    Inv oice no : 10 37 28 26
Invoice date:      Date of issue : 05 / 15 / 2019
Recipient name:    Client : Moore Group 89 000 Christopher Brook Port Richard mouth , CA 87 376 98 Tax Id : 978 - 80 - 27 01
Total amount:      Tax Id : 9 75 - - 48 61 . Q ty UM Net price Net worth VAT [ %] Gross worth 5 , 00 each 34 , 50 172 , 50 10 % 189 , 75 2 . 1 , 00 each 26 , 99 26 , 99 10 % 29 , 69 Sh 2 4 , 00 each 12 , 99 51 , 96 10 % 57 , 16 VAT [ %] Net worth VAT Gross worth 10 % 251 , 45 25 , 15 276 , 60 Total $ 251

In [62]:
# Create a dictionary to store the structured data
structured_data = {
    "Invoice number": [],
    "Invoice date": [],
    "Due date": [],
    "Issuer name": [],
    "Recipient name": [],
    "Total amount": []
}

# Process each invoice in final_list
for invoice in final_list:
    invoice_data = {}

    # Get tokens, bboxes, and ner_tags
    tokens = invoice.get('tokens', [])
    ner_tags = invoice.get('ner_tags', [])

    # Group tokens by their NER tags
    current_label = None
    current_text = []

    for token, tag in zip(tokens, ner_tags):
        label = id2label[tag]
        if label != current_label:
            if current_label and current_text:
                invoice_data[current_label] = ' '.join(current_text)
            current_label = label
            current_text = [token]
        else:
            current_text.append(token)

    # Don't forget to add the last group
    if current_label and current_text:
        invoice_data[current_label] = ' '.join(current_text)

    # Clean and process each field
    for field in structured_data.keys():
        value = invoice_data.get(field, '')

        # Field-specific cleaning
        if field == "Invoice number":
            value = re.sub(r'[^0-9]', '', value)
        elif field == "Invoice date" or field == "Due date":
            # Remove all spaces and non-numeric characters except /
            value = re.sub(r'[^0-9/]', '', value)
            # Keep only the last 10 characters (MM/DD/YYYY format)
            value = value[-10:] if len(value) >= 10 else value
            try:
                value = datetime.datetime.strptime(value, '%m/%d/%Y').date()
            except ValueError:
                value = None
        elif field == "Total amount":
            # Find the last currency symbol ($ or £ or €) and keep it plus what comes after it
            last_currency_pos = max(value.rfind('$'), value.rfind('£'), value.rfind('€'))
            if last_currency_pos != -1:
                currency_symbol = value[last_currency_pos]  # Keep the currency symbol
                amount_str = value[last_currency_pos + 1:]  # Get the amount part
                # Clean up the amount string
                amount_str = re.sub(r'[^0-9.,]', '', amount_str)
                # Replace comma with dot if comma is used as decimal separator
                if ',' in amount_str and '.' not in amount_str:
                    amount_str = amount_str.replace(',', '.')
                # Remove any remaining commas (thousand separators)
                amount_str = amount_str.replace(',', '')
                try:
                    amount = float(amount_str)
                    value = f"{currency_symbol}{amount}"  # Combine symbol and amount
                except ValueError:
                    value = None
            else:
                value = None
        elif field in ["Issuer name", "Recipient name"]:
            # Split into words and remove the first word (Seller: or Client:)
            words = value.split()
            if len(words) > 1:
                value = ' '.join(words[1:])

        structured_data[field].append(value)

# Create the final DataFrame
df = pd.DataFrame(structured_data)

# Display the DataFrame
display(df.head())

Unnamed: 0,Invoice number,Invoice date,Due date,Issuer name,Recipient name,Total amount
0,40378170,2012-10-15,,"Patel, Thompson and Montgomery","Jackson, Odonnell and Jackson",$8.25
1,61356291,2012-09-06,,"Chapman, Kim and Green",Rodriguez-Stevens,$212.09
2,16662010,2016-08-28,,Smith-Cook,Snyder-Johnson,$2259.1
3,19471831,2014-04-09,,Palmer Ltd,"Rios, Oneill and Rowe",$44745.59
4,22083742,2014-03-01,,"Smith, Schaefer and Gonzalez",Smith-Petersen,$7920.0
