In [None]:
# Install/upgrade the notebook's dependencies in the running environment.
# NOTE(review): versions are unpinned (`-U`), so re-running later may pull
# different releases; consider pinning, and `%pip` instead of `!pip` so the
# install is guaranteed to target the kernel's environment.
!pip install -U transformers
!pip install -U datasets evaluate seqeval pillow

In [None]:
import os, json
from pathlib import Path
from PIL import Image

In [None]:
def parse_boxes(file_path):
    """Read an OCR annotation file and return its texts and bounding boxes.

    Each usable line has the form ``x0,y0,x1,y1,x2,y2,x3,y3,text``: the four
    corner points of a quadrilateral followed by the transcription. The line
    is split on at most eight commas, so commas inside the transcription are
    preserved. Lines with fewer than nine fields are skipped.

    Args:
        file_path: Path of the annotation file. It is read as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        A ``(words, boxes)`` tuple of equal-length lists. ``words[i]`` is the
        stripped transcription and ``boxes[i]`` is
        ``[left, top, right, bottom]``, the axis-aligned rectangle enclosing
        all four corners.

    Raises:
        ValueError: If one of the eight coordinate fields is not an integer.
    """
    words, boxes = [], []
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            parts = line.strip().split(",", 8)
            if len(parts) < 9:
                continue
            coords = [int(p) for p in parts[:8]]
            # Even positions are x values, odd positions are y values.
            xs, ys = coords[0::2], coords[1::2]
            words.append(parts[8].strip())
            # Use every corner so slightly rotated/skewed quads are still
            # fully enclosed, not just two hand-picked corners per side.
            boxes.append([min(xs), min(ys), max(xs), max(ys)])
    return words, boxes

In [None]:
def normalize_boxes(boxes, width, height):
    """Rescale pixel boxes to integer coordinates on a 0-1000 grid.

    Args:
        boxes: Iterable of ``[x0, y0, x1, y1]`` boxes in pixels.
        width: Image width in pixels (divisor for x values).
        height: Image height in pixels (divisor for y values).

    Returns:
        A list of ``[x0, y0, x1, y1]`` integer boxes. Every value is clamped
        to ``[0, 1000]`` so annotations that spill outside the image cannot
        produce coordinates outside the range the LayoutLMv3 processor used
        later in this notebook expects.

    Raises:
        ZeroDivisionError: If ``width`` or ``height`` is zero and ``boxes``
            is non-empty.
    """
    def _scale(value, extent):
        # Truncate like the original int() conversion, then clamp.
        return max(0, min(1000, int(1000 * value / extent)))

    return [
        [_scale(x0, width), _scale(y0, height), _scale(x1, width), _scale(y1, height)]
        for x0, y0, x1, y1 in boxes
    ]

In [None]:
def _tag_words(words, entities):
    """Return one BIO label per OCR line for a receipt's key -> value entities.

    A line is tagged ``B-<KEY>`` when the number of distinct tokens it shares
    with the entity value is at least ``max(1, len(entity_tokens) // 2)``.
    Commas are treated as token separators. Entities are applied in dict
    order, so a later entity overrides an earlier tag on the same line.
    Afterwards, every ``B-COMPANY`` / ``B-ADDRESS`` except the first one of
    its kind is rewritten to the matching ``I-`` tag.
    """
    labels = ["O"] * len(words)
    # Tokenize each line once instead of once per entity.
    word_token_sets = [set(w.replace(",", " ").split()) for w in words]

    for key, value in entities.items():
        entity_tokens = value.replace(",", " ").split()
        if not entity_tokens:
            continue
        entity_token_set = set(entity_tokens)
        threshold = max(1, len(entity_tokens) // 2)
        tag = f"B-{key.upper()}"
        for i, tokens in enumerate(word_token_sets):
            # An empty line has zero overlap and threshold >= 1, so it is
            # never tagged.
            if len(tokens & entity_token_set) >= threshold:
                labels[i] = tag

    # Multi-line entities: keep the first hit as B- and turn the rest into I-.
    seen = set()
    for i, label in enumerate(labels):
        if label in ("B-COMPANY", "B-ADDRESS"):
            if label in seen:
                labels[i] = "I-" + label[2:]
            else:
                seen.add(label)
    return labels


def make_jsonl(split_dir, out_file):
    """Convert one dataset split into a JSON-lines file.

    ``split_dir`` must contain ``img/``, ``box/`` and ``entities/``
    sub-directories. For every file in ``img/`` that has a same-stem ``.txt``
    file in both other directories, one JSON object is written with the keys
    ``id``, ``image``, ``words``, ``boxes`` (normalized with
    ``normalize_boxes``) and ``labels``. Images lacking either companion file
    are skipped. Images are processed in sorted path order so the output is
    reproducible.

    Args:
        split_dir: Directory of the split (str or Path).
        out_file: Destination path of the JSONL file; overwritten if present.
    """
    split_dir = Path(split_dir)
    img_dir = split_dir / "img"
    box_dir = split_dir / "box"
    ent_dir = split_dir / "entities"

    records = []
    for img_path in sorted(img_dir.iterdir()):
        base = img_path.stem
        box_path = box_dir / f"{base}.txt"
        ent_path = ent_dir / f"{base}.txt"
        if not box_path.exists() or not ent_path.exists():
            continue

        words, boxes = parse_boxes(box_path)
        # Only the size is needed; close the image file right away.
        with Image.open(img_path) as img:
            width, height = img.size
        boxes = normalize_boxes(boxes, width, height)

        with open(ent_path, "r", encoding="utf-8") as f:
            entities = json.load(f)

        records.append({
            "id": base,
            "image": str(img_path),
            "words": words,
            "boxes": boxes,
            "labels": _tag_words(words, entities),
        })

    with open(out_file, "w", encoding="utf-8") as f:
        for item in records:
            f.write(json.dumps(item) + "\n")

In [None]:
# Convert the train and test splits into JSONL files in the working directory.
for src_dir, jsonl_path in (
    ("/kaggle/input/recipts-data/dataset/train", "/kaggle/working/train.jsonl"),
    ("/kaggle/input/recipts-data/dataset/test", "/kaggle/working/test.jsonl"),
):
    make_jsonl(src_dir, jsonl_path)

In [None]:
from datasets import load_dataset

# Load the generated JSONL files, keyed by split name.
jsonl_files = {
    'train': '/kaggle/working/train.jsonl',
    'test': '/kaggle/working/test.jsonl',
}
dataset = load_dataset('json', data_files=jsonl_files)

In [None]:
# Show the first training record as a sanity check of the generated data.
dataset['train'][0]

In [None]:
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from transformers import TrainingArguments, Trainer
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
# apply_ocr=False: words and boxes are passed in explicitly by preprocess().
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

# Build the label vocabulary from every split, not only "train": preprocess()
# looks each tag up in label2id, so a tag that occurs only in the test data
# would otherwise raise KeyError. Sorting keeps the ids deterministic.
labels = sorted({l for split in dataset.values() for ex in split for l in ex['labels']})
id2label = dict(enumerate(labels))
label2id = {label: i for i, label in id2label.items()}

In [None]:
# Display the integer id -> label string mapping.
id2label

In [None]:
# Display the label string -> integer id mapping.
label2id

In [None]:
# Load the pretrained LayoutLMv3 base checkpoint with a token-classification
# head configured for this notebook's label set.
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
)

In [None]:
def preprocess(example):
    """Encode one dataset record into LayoutLMv3 model inputs.

    Args:
        example: Mapping with the keys ``image`` (path of the image file),
            ``words``, ``boxes`` and ``labels`` (label strings that must all
            be present in ``label2id``).

    Returns:
        A dict with one entry per field produced by ``processor``, each with
        the leading batch axis removed. Sequences are truncated/padded to 512
        tokens.

    Raises:
        KeyError: If a label string is missing from ``label2id``.
    """
    # convert() returns an independent in-memory copy, so the file can be
    # closed immediately instead of leaking one open handle per example.
    with Image.open(example['image']) as img:
        image = img.convert("RGB")

    encoding = processor(
        image,
        boxes=example['boxes'],
        text=example['words'],
        word_labels=[label2id[l] for l in example['labels']],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

    # Remove only the batch axis; a bare squeeze() would also drop any other
    # axis that happens to have size 1.
    return {k: v.squeeze(0) for k, v in encoding.items()}

# Encode every split and drop the raw input columns from the result.
raw_columns = dataset['train'].column_names
encoded_dataset = dataset.map(preprocess, remove_columns=raw_columns)

In [None]:
import numpy as np

In [None]:
from transformers import TrainingArguments, Trainer
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(pred):
    """Compute seqeval precision, recall, F1 and accuracy for an evaluation run.

    Args:
        pred: A ``(predictions, label_ids)`` pair; predictions are per-token
            class scores, arg-maxed over axis 2.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
        Positions whose label id is -100 are excluded from both sequences.
    """
    scores, label_ids = pred
    pred_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, label_row in zip(pred_ids, label_ids):
        # Keep only the positions that carry a real label.
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        true_predictions.append([id2label[p] for p, _ in kept])
        true_labels.append([id2label[l] for _, l in kept])

    scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
        "accuracy": accuracy_score,
    }
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers.items()}

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; training below is configured with
# push_to_hub=True.
login()

In [None]:
# Fine-tuning configuration, grouped by purpose.
training_args = TrainingArguments(
    # Checkpointing: where checkpoints go and how many are kept.
    output_dir="/kaggle/working/layoutlmv3-receipts",
    save_strategy="epoch",
    save_total_limit=1,
    # Optimisation.
    num_train_epochs=3,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    fp16=True,
    # Logging / progress reporting.
    logging_steps=5,
    disable_tqdm=False,
    report_to="none",
    # Hugging Face Hub upload.
    push_to_hub=True,
    hub_model_id="Sameed1/smdk-layoutlmv3-receipts",
    hub_strategy="end",
)

In [None]:
import inspect

# Recent transformers releases take the tokenizer through `processing_class`
# and deprecate (later drop) the `tokenizer` keyword; older releases only know
# `tokenizer`. transformers is installed unpinned in this notebook, so use
# whichever keyword the installed Trainer actually accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: processor.tokenizer},
)

trainer.train()

In [None]:
# Evaluate on the eval dataset given to the Trainer (the "test" split) and
# display the resulting metrics dict.
eval_results = trainer.evaluate()
eval_results

In [None]:
from huggingface_hub import login
# Second Hub login before the manual upload below.
# NOTE(review): login() was already called earlier in this notebook; this
# looks redundant within a single session - confirm and remove if so.
login()

In [None]:
from transformers.trainer_utils import get_last_checkpoint

# Resolve the newest checkpoint instead of hard-coding "checkpoint-471": the
# step number in the directory name depends on dataset size, batch size and
# epoch count, so a fixed name breaks as soon as any of those change.
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint-* directory found in {training_args.output_dir}"
    )
trained_model = LayoutLMv3ForTokenClassification.from_pretrained(last_checkpoint)
trained_model

In [None]:
# Upload the reloaded model and the processor to the Hub repository
# "smdk-layoutlmv3-receipts".
# NOTE(review): training_args already sets push_to_hub=True with
# hub_model_id="Sameed1/smdk-layoutlmv3-receipts"; confirm this manual push
# targets the same repo and is still needed.
trained_model.push_to_hub("smdk-layoutlmv3-receipts")
processor.push_to_hub("smdk-layoutlmv3-receipts")