In [1]:
!pip install -U transformers
!pip install -U datasets evaluate seqeval pillow

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hDownloading huggingface_hub-0.35.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K  

In [2]:
import os, json
from pathlib import Path
from PIL import Image

In [3]:
def parse_boxes(file_path):
    words, boxes = [], []
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            parts = line.strip().split(",", 8)
            if len(parts) < 9:
                continue
            x0,y0,x1,y1,x2,y2,x3,y3 = map(int, parts[:8])
            text = parts[8].strip()
            words.append(text)
            boxes.append([min(x0,x2), min(y0,y2), max(x1,x3), max(y1,y3)])
    return words, boxes

In [4]:
def normalize_boxes(boxes, width, height):
    norm = []
    for x0,y0,x1,y1 in boxes:
        norm.append([
            int(1000 * x0 / width),
            int(1000 * y0 / height),
            int(1000 * x1 / width),
            int(1000 * y1 / height)
        ])
    return norm

In [5]:
def make_jsonl(split_dir, out_file):
    img_dir = Path(split_dir) / "img"
    box_dir = Path(split_dir) / "box"
    ent_dir = Path(split_dir) / "entities"
    out = []
    for img_path in img_dir.iterdir():
        base = img_path.stem
        box_path = box_dir / f"{base}.txt"
        ent_path = ent_dir / f"{base}.txt"
        if not box_path.exists() or not ent_path.exists():
            continue

        words, boxes = parse_boxes(box_path)
        width, height = Image.open(img_path).size
        boxes = normalize_boxes(boxes, width, height)
        labels = ["O"] * len(words)

        entities = json.load(open(ent_path))
        for key, value in entities.items():
            entity_tokens = value.replace(",", " ").split()
            entity_tokens = [t for t in entity_tokens if t.strip()]
            if not entity_tokens:
                continue
                
            for i, w in enumerate(words):
                w_tokens = w.replace(",", " ").split()
                w_tokens = [t for t in w_tokens if t.strip()]
                if not w_tokens:
                    continue
                    
                overlap = len(set(w_tokens) & set(entity_tokens))
                if overlap >= max(1, len(entity_tokens) // 2):
                    labels[i] = f"B-{key.upper()}"

        first_found_company = False
        first_found_address = False
        
        for i in labels:
            if i == "B-COMPANY" and first_found_company:
                labels[i] = "I-COMPANY"
            if i == "B-ADDRESS" and first_found_address:
                labels[i] = "I-ADDRESS"

        out.append({
            "id": base,
            "image": str(img_path),
            "words": words,
            "boxes": boxes,
            "labels": labels
        })

    with open(out_file, "w") as f:
        for item in out:
            f.write(json.dumps(item) + "\n")

In [6]:
make_jsonl("/kaggle/input/recipts-data/dataset/train", "/kaggle/working/train.jsonl")
make_jsonl("/kaggle/input/recipts-data/dataset/test", "/kaggle/working/test.jsonl")

In [7]:
from datasets import load_dataset

dataset = load_dataset('json', data_files={
    'train': '/kaggle/working/train.jsonl',
    'test': '/kaggle/working/test.jsonl'
})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [8]:
dataset['train'][0]

{'id': 'X51007231344',
 'image': '/kaggle/input/recipts-data/dataset/train/img/X51007231344.jpg',
 'words': ['UNIHAKKA INTERNATIONAL SDN BHD',
  '05 MAY 2018 18:21',
  '(867388-U)',
  '12, JALAN TAMPOI 7/4,KAWASAN PERINDUSTRIAN',
  'TAMPOI,81200 JOHOR BAHRU,JOHOR',
  'TAXINVOICE',
  'INVOICE # : OR18050502160248',
  'ITEM',
  'QTY',
  'TOTAL',
  'SR I00100000035- 1 MEAT + 3 VEGE',
  '$7.10',
  '1',
  '$7.10',
  'SR I00100000170- IMPORTED VEGGIES',
  '$1.60',
  '1',
  '$1.60',
  'SR I00100000099- COKE',
  '$2.50',
  '1',
  '$2.50',
  'TOTAL AMOUNT: $11.20',
  'GST @6%: $0.63',
  'NETT TOTAL: $11.20',
  'PAYMENT MODE',
  'AMOUNT',
  'CASH',
  '$11.20',
  'CHANGE',
  '$0.00',
  'GST SUMMARY',
  'AMOUNT($)',
  'TAX($)',
  'SR = GST @6%',
  '10.57',
  '0.63',
  'GST REG #000656195584',
  'BAR WANG RICE@PERMAS JAYA',
  '(PRICE INCLUSIVE OF GST)',
  'THANK YOU & COME AGAIN!',
  'LIKE AND FOLLOW US ON FACEBOOK!',
  'FACEBOOK.COM/BARWANGRICE'],
 'boxes': [[336, 195, 606, 204],
  [420, 206, 521,

In [9]:
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from transformers import TrainingArguments, Trainer
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

2025-10-14 18:15:00.197398: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760465700.639140      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760465700.773299      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

labels = sorted({l for ex in dataset['train'] for l in ex['labels']})
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

In [11]:
id2label

{0: 'B-ADDRESS', 1: 'B-COMPANY', 2: 'B-DATE', 3: 'B-TOTAL', 4: 'O'}

In [12]:
label2id

{'B-ADDRESS': 0, 'B-COMPANY': 1, 'B-DATE': 2, 'B-TOTAL': 3, 'O': 4}

In [13]:
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def preprocess(example):
    image = Image.open(example['image']).convert("RGB")

    encoding = processor(
        image,
        boxes=example['boxes'],
        text=example['words'],
        word_labels=[label2id[l] for l in example['labels']],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

    return {k: v.squeeze() for k, v in encoding.items()}

encoded_dataset = dataset.map(preprocess, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/626 [00:00<?, ? examples/s]

Map:   0%|          | 0/347 [00:00<?, ? examples/s]

In [15]:
import numpy as np

In [16]:
from transformers import TrainingArguments, Trainer
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "accuracy": accuracy_score(true_labels, true_predictions),
    }

In [19]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
training_args = TrainingArguments(
    output_dir="/kaggle/working/layoutlmv3-receipts",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=5,
    fp16=True,
    disable_tqdm=False,
    report_to="none",
    push_to_hub=True,
    hub_model_id="Sameed1/smdk-layoutlmv3-receipts",
    hub_strategy="end",
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=processor.tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
5,1.5645
10,0.7071
15,0.4896
20,0.408
25,0.3771
30,0.3407
35,0.2979
40,0.2442
45,0.2445
50,0.2222




TrainOutput(global_step=471, training_loss=0.12025927238941446, metrics={'train_runtime': 508.0948, 'train_samples_per_second': 3.696, 'train_steps_per_second': 0.927, 'total_flos': 495041961535488.0, 'train_loss': 0.12025927238941446, 'epoch': 3.0})

In [23]:
eval_results = trainer.evaluate()
eval_results



{'eval_loss': 0.059942491352558136,
 'eval_precision': 0.8875767595654228,
 'eval_recall': 0.9446958270487682,
 'eval_f1': 0.9152459814905016,
 'eval_accuracy': 0.9817569013481703,
 'eval_runtime': 60.6166,
 'eval_samples_per_second': 5.725,
 'eval_steps_per_second': 1.435,
 'epoch': 3.0}

In [26]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
trained_model = LayoutLMv3ForTokenClassification.from_pretrained("/kaggle/working/layoutlmv3-receipts/checkpoint-471")
trained_model

LayoutLMv3ForTokenClassification(
  (layoutlmv3): LayoutLMv3Model(
    (embeddings): LayoutLMv3TextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (x_position_embeddings): Embedding(1024, 128)
      (y_position_embeddings): Embedding(1024, 128)
      (h_position_embeddings): Embedding(1024, 128)
      (w_position_embeddings): Embedding(1024, 128)
    )
    (patch_embed): LayoutLMv3PatchEmbeddings(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (encoder): LayoutLMv3Encoder

In [28]:
trained_model.push_to_hub("smdk-layoutlmv3-receipts")
processor.push_to_hub("smdk-layoutlmv3-receipts")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Sameed1/smdk-layoutlmv3-receipts/commit/00c94defe5f7ab52aab476263ff8a7b5593d2d6c', commit_message='Upload processor', commit_description='', oid='00c94defe5f7ab52aab476263ff8a7b5593d2d6c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Sameed1/smdk-layoutlmv3-receipts', endpoint='https://huggingface.co', repo_type='model', repo_id='Sameed1/smdk-layoutlmv3-receipts'), pr_revision=None, pr_num=None)