In [16]:
import wandb
# Authenticate with Weights & Biases up front, before any training cell runs.
wandb.login()

In [17]:
import os
import json
from PIL import Image, ImageDraw, ImageFont
from datasets import load_dataset, Features, Sequence, ClassLabel, Value, Array2D, Array3D
from transformers import LayoutLMv3FeatureExtractor, LayoutLMv3TokenizerFast, LayoutLMv3Processor

# Restrict CUDA to the device with index 1 on this machine.
# NOTE(review): this is set before any model/tensor is created in the notebook;
# presumably it must stay ahead of the first CUDA initialisation to take effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [18]:
# Load the passports dataset: the first 5% of the train split and the whole test split,
# stored under the keys 'train' and 'test'.
# Earlier experiment, kept for reference:
#   load_dataset('datasets/invoices_limited_25/funsd.py',  split=['train', 'test'])
dataset = dict(zip(
    ('train', 'test'),
    load_dataset('datasets/passports/passports_all_doctr.py',  split=['train[:5%]', 'test']),
))

In [None]:
# NOTE: this cell was never executed (it has no execution count) and cannot run as
# written: `LayoutLMv3ImageProcessor` and `ErnieLayoutProcessor` are not imported in
# the visible notebook, and `tokenizer` is only created in the cell that follows.
# It is kept commented out for reference so that "Restart Kernel -> Run All" does not
# stop here with a NameError.
# feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=True, )
# processor = ErnieLayoutProcessor(image_processor=feature_extractor, tokenizer=tokenizer)

In [19]:
from tokenizers import Tokenizer
# Build the LayoutLMv3 processor from an image feature extractor and a fast tokenizer.
# OCR is switched off here because words and boxes are passed in explicitly from the
# dataset when the processor is called (see `prepare_examples`).
# NOTE(review): `ocr_lang="rus"` presumably has no effect while `apply_ocr=False` — confirm.
feature_extractor = LayoutLMv3FeatureExtractor(ocr_lang="rus", apply_ocr=False)  # apply_ocr is set to True by default
# Tokenizer weights/vocabulary come from the same base checkpoint the model is loaded from later.
tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
processor = LayoutLMv3Processor(feature_extractor, tokenizer)

In [20]:

# Hand-written label set for the passports dataset.
labels = [
    'issued', 'issued_date', 'code', 'number', 'lastname', 'firstname',
    'middlename', 'gender', 'bday', 'place', 'other',
]
# Alternative label set used for the invoices dataset (kept for reference):
# labels = ['Банк получателя', 'Получатель', 'БИК банка получателя', 'Счет банка', 'Счет получателя', 'Всего к оплате', 'Поставщик', 'Покупатель', 'Остальное', 'Номер счета', 'Дата', 'ИНН получателя', 'КПП получателя']

# Integer id -> label name, and the inverse mapping.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [21]:

from datasets.features import ClassLabel

# Schema and column names of the raw training split; `column_names` is later passed
# to `.map(remove_columns=...)` so only the processor outputs remain.
features = dataset["train"].features
column_names = dataset["train"].column_names
# Names of the dataset columns consumed during preprocessing.
# NOTE(review): `image_column_name` is not referenced in the visible notebook —
# `prepare_examples` reads the 'image_path' column instead.
image_column_name = "image"
text_column_name = "words"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"

# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
# unique labels.
def get_label_list(labels):
    """Return the sorted list of distinct labels found across all label sequences.

    Args:
        labels: an iterable of iterables of labels (one inner iterable per example).

    Returns:
        A new list holding every distinct label exactly once, in ascending order.
    """
    distinct = {tag for sequence in labels for tag in sequence}
    return sorted(distinct)

# Resolve the label vocabulary from the dataset itself.
if isinstance(features[label_column_name].feature, ClassLabel):
    # Labels are already stored as ints; the names come straight from the feature.
    label_list = features[label_column_name].feature.names
else:
    # Otherwise collect the distinct label values present in the training split.
    label_list = get_label_list(dataset["train"][label_column_name])

# Both cases build the same two mappings. Note that these assignments replace the
# `id2label` / `label2id` created from the hand-written `labels` list in the earlier cell.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in enumerate(label_list)}
num_labels = len(label_list)


def prepare_examples(examples):
    """Encode a batch of raw examples with the LayoutLMv3 processor.

    Args:
        examples: a batch (column name -> list of values) providing the
            'image_path' column plus the words, boxes and label columns named by
            `text_column_name`, `boxes_column_name` and `label_column_name`.

    Returns:
        Whatever `processor(...)` returns for the batch, truncated and padded
        to the maximum length.
    """
    # Each image is opened from its path and converted to RGB before encoding.
    rgb_images = []
    for path in examples['image_path']:
        rgb_images.append(Image.open(path).convert('RGB'))

    return processor(
        rgb_images,
        examples[text_column_name],
        boxes=examples[boxes_column_name],
        word_labels=examples[label_column_name],
        truncation=True,
        padding="max_length",
    )


In [22]:

from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

# we need to define custom features for `set_format` (used later on) to work properly
# Explicit schema of the encoded examples produced by `prepare_examples`.
features = Features({
    'pixel_values': Array3D(shape=(3, 224, 224), dtype="float32"),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(shape=(512, 4), dtype="int64"),
    'labels': Sequence(feature=Value(dtype='int64')),
})

# Encode both splits the same way: run `prepare_examples` in batches, drop the raw
# columns (the train split's column names are used for both) and apply the schema above.
train_dataset, test_dataset = (
    dataset[split_name].map(
        prepare_examples,
        batched=True,
        remove_columns=column_names,
        features=features,
    )
    for split_name in ("train", "test")
)

In [23]:
# Display the raw (un-encoded) splits; the encoded versions live in
# `train_dataset` / `test_dataset`.
dataset

In [24]:
from torch.utils.data import DataLoader

# Switch both encoded datasets to the "torch" output format (in place), then wrap
# them in DataLoaders with batch size 4; only the training loader shuffles.
# NOTE(review): in the visible notebook these loaders are only returned by
# `RusFunsdTrainer`, while the trainer that is actually built is a plain `Trainer`
# fed with the datasets directly — confirm whether the loaders are still needed.
train_dataset.set_format(type="torch", device="cpu")
test_dataset.set_format(type="torch", device="cpu")
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [25]:
from datasets import load_metric
import numpy as np
from transformers import TrainingArguments, LayoutLMv3ForTokenClassification, Trainer

# Size the classification head from the dataset-derived label list (`num_labels`,
# computed together with `id2label` / `label2id`) rather than from the hand-written
# `labels` list, so the number of output classes always agrees with the mappings
# assigned below and with the label ids stored in the dataset.
model = LayoutLMv3ForTokenClassification.from_pretrained('microsoft/layoutlmv3-base', num_labels=num_labels)

# Set id2label and label2id
model.config.id2label = id2label
model.config.label2id = label2id

# Metrics
# The "seqeval" metric object is used by `compute_metrics` below.
# NOTE(review): `datasets.load_metric` is, as far as I know, deprecated in favour of
# the separate `evaluate` package — confirm against the installed `datasets` version.
metric = load_metric("seqeval")
# When True, `compute_metrics` returns every (flattened) entry of the metric result;
# when False it returns only overall precision / recall / f1 / accuracy.
return_entity_level_metrics = True


def compute_metrics(p):
    """Turn raw model predictions into seqeval metrics.

    Args:
        p: a pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds the reference label ids, with
            ``-100`` marking positions to ignore.
            (presumably ``predictions`` is (batch, sequence, num_labels) — TODO confirm)

    Returns:
        A flat dict of metrics. With ``return_entity_level_metrics`` enabled, every
        entry of the metric result is returned and nested dicts are flattened to
        ``"<key>_<subkey>"``; otherwise only the overall precision, recall, f1 and
        accuracy are returned.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions whose reference label is not the ignored index (-100),
    # mapping the surviving ids on both sides to label names.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([id2label[pred] for pred, _ in kept])
        true_labels.append([id2label[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Unpack nested dictionaries into "<key>_<subkey>" entries.
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                final_results[f"{key}_{sub_key}"] = sub_value
        else:
            final_results[key] = value
    return final_results


class RusFunsdTrainer(Trainer):
    """Trainer variant that serves the notebook's pre-built DataLoaders.

    Both overrides ignore the datasets held by the trainer and return the
    module-level ``train_dataloader`` / ``test_dataloader`` instead.
    """

    def get_train_dataloader(self):
        """Return the module-level ``train_dataloader``."""
        return train_dataloader

    def get_eval_dataloader(self, test_dataset=None):
        """Return the module-level ``test_dataloader``.

        Args:
            test_dataset: ignored. It is optional so the override stays compatible
                with the base method, whose dataset argument is optional as well.
        """
        return test_dataloader


# Training configuration: 100 epochs, cosine schedule with 10% warmup, and
# evaluation / checkpointing / logging once per epoch.
args = TrainingArguments(
    output_dir="run",  # name of directory to store the checkpoints
    num_train_epochs=100,
    warmup_ratio=0.1,
    # warmup_steps=0,
    learning_rate=3e-5,
    lr_scheduler_type ='cosine',

    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy='epoch',
    logging_strategy='epoch',
    load_best_model_at_end=True,

    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,

    # 'overall_f1' is one of the keys `compute_metrics` reads from the seqeval result.
    metric_for_best_model='overall_f1',

    seed=88
)

# Initialize our Trainer
# NOTE(review): this builds a plain `Trainer` from the encoded datasets; the
# `RusFunsdTrainer` subclass defined above is not instantiated anywhere in the
# visible notebook — confirm which one is intended.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [26]:
# Estimate the model's in-memory size: element count times element size, summed
# over all parameters and over all buffers.
mem_params = sum(param.nelement() * param.element_size() for param in model.parameters())
mem_bufs = sum(buf.nelement() * buf.element_size() for buf in model.buffers())
mem = mem_bufs + mem_params  # in bytes
mem / 1024 ** 3  # in GB

In [27]:
# Start a Weights & Biases run in the "passports" project with an empty config.
wandb.init(project="passports", config={})

In [28]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [30]:
# Save the trainer's model to disk.
# NOTE(review): with `load_best_model_at_end=True` this is presumably the best
# checkpoint by 'overall_f1' rather than the last epoch — confirm.
trainer.save_model("weights/last_run_best")

In [15]:
# Count the trainable parameters: keep only tensors that require gradients and
# sum the product of each tensor's dimensions.
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum(np.prod(p.size()) for p in model_parameters)
params

125926027