In [2]:
import os
from matplotlib import pyplot as plt
import cv2
import numpy as np
from PIL import Image
from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2TokenizerFast, LayoutLMv2Processor, \
    LayoutLMv2ForTokenClassification

labels = ['Банк получателя', 'Получатель', 'БИК банка получателя', 'Счет банка', 'Счет получателя', 'Всего к оплате',
          'Поставщик', 'Покупатель', 'Остальное', 'Номер счета', 'Дата', 'ИНН получателя', 'КПП получателя']
id2label = {v: k for v, k in enumerate(labels)}
label2id = {k: v for v, k in enumerate(labels)}

feature_extractor = LayoutLMv2FeatureExtractor(ocr_lang="rus", prepare_pic=None, 
                                               tesseract_config='-l rus --psm 6 -c tessedit_char_blacklist="_-<>|/\„”>=[]{}%*^®()»›`"')
tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")

processor = LayoutLMv2Processor(feature_extractor, tokenizer)
model = LayoutLMv2ForTokenClassification.from_pretrained('weights/distinctive-shadow-74', num_labels=len(labels))


def unnormalize_box(bbox, width, height):
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]


def visualize_predict(path, model, processor, tokenizer):
    # image = prepare_pic(path, 'preprocessing_templates').convert('RGB')
    image = Image.open(path).convert('RGB')
    encoded_inputs = processor(image, padding="max_length", truncation=True, return_tensors="pt")

    model.eval()
    outputs = model(**encoded_inputs)
    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    token_boxes = encoded_inputs.bbox.squeeze().tolist()
    width, height = image.size
    true_predictions = [id2label[prediction] for prediction in predictions]
    true_boxes = [unnormalize_box(box, width, height) for box in token_boxes]

    result = {}
    image = np.array(image)
    for prediction, box, token_id in zip(true_predictions, true_boxes, encoded_inputs['input_ids'][0].tolist()):
        text = tokenizer.decode(token_id)
        x1, y1, x2, y2 = [int(b) for b in box]
        predicted_label = prediction.lower()
        image = cv2.rectangle(image, (x1, y1), (x2, y2), (36, 255, 12), 1)
        cv2.putText(image, predicted_label, (x1, y1 - 2), cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 33, 33), 1)

        if predicted_label != 'остальное':
            if predicted_label in result:
                if text[:2] == '##':
                    result[predicted_label] += f'{text[2:]}'
                else:
                    result[predicted_label] += f' {text}'
            else:
                result[predicted_label] = text

    return image, result




In [7]:
imgs = []
ds_dir = 'datasets/invoices_limited_35'
for img_name in os.listdir(f'{ds_dir}/testing_data/images/'):
    imgs.append(visualize_predict(f'{ds_dir}/testing_data/images/' + img_name, model, processor, tokenizer))

## LMv3

In [2]:
from transformers import LayoutLMv3FeatureExtractor, LayoutLMv3TokenizerFast, LayoutLMv3Processor, \
    LayoutLMv3ForTokenClassification

feature_extractor = LayoutLMv3FeatureExtractor(ocr_lang="rus", apply_ocr=True)
tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
processor = LayoutLMv3Processor(feature_extractor, tokenizer)
model = LayoutLMv3ForTokenClassification.from_pretrained('weights/eager-durian-5-7988-lmv3-25-9', num_labels=len(labels))




In [3]:
imgs = []
ds_dir = 'datasets/invoices_limited_35'
for img_name in os.listdir(f'{ds_dir}/testing_data/images/'):
    imgs.append(visualize_predict(f'{ds_dir}/testing_data/images/' + img_name, model, processor, tokenizer))



torch.Size([1, 512, 13])




torch.Size([1, 512, 13])




torch.Size([1, 512, 13])




torch.Size([1, 512, 13])




torch.Size([1, 512, 13])




torch.Size([1, 512, 13])




torch.Size([1, 512, 13])




torch.Size([1, 512, 13])




torch.Size([1, 512, 13])


## XLM

In [12]:
from transformers import LayoutXLMTokenizer, LayoutXLMProcessor

feature_extractor = LayoutLMv2FeatureExtractor(ocr_lang="rus", apply_ocr=True, do_resize=True, size=224)  # apply_ocr is set to True by default
tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
processor =  LayoutXLMProcessor(feature_extractor, tokenizer)
model = LayoutLMv2ForTokenClassification.from_pretrained("weights/twinkling-goat-1-8245-xlm-35-9", num_labels=len(labels))



In [15]:
imgs = []
ds_dir = 'datasets/invoices_limited_35'
for img_name in os.listdir(f'{ds_dir}/testing_data/images/'):
    imgs.append(visualize_predict(f'{ds_dir}/testing_data/images/' + img_name, model, processor, tokenizer))

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


torch.Size([1, 234, 13])
torch.Size([1, 379, 13])


RuntimeError: The expanded size of the tensor (725) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 725].  Tensor sizes: [1, 514]

## ERNIE-Layout

In [17]:
from ernie.networks import ErnieLayoutConfig, ErnieLayoutForQuestionAnswering, \
    ErnieLayoutProcessor, ErnieLayoutTokenizerFast, ErnieLayoutForTokenClassification
from transformers.models.layoutlmv3 import LayoutLMv3ImageProcessor, LayoutLMv3FeatureExtractor

pretrain_torch_model_or_path = "Norm/ERNIE-Layout-Pytorch"
tokenizer = ErnieLayoutTokenizerFast.from_pretrained(pretrained_model_name_or_path=pretrain_torch_model_or_path)
feature_extractor = LayoutLMv3FeatureExtractor(ocr_lang="rus", prepare_pic=None, 
                                               tesseract_config='-l rus --psm 6 -c tessedit_char_blacklist="_-<>|/\„”>=[]{}%*^®()»›`"')
processor = ErnieLayoutProcessor(image_processor=feature_extractor, tokenizer=tokenizer)

config = ErnieLayoutConfig.from_pretrained(pretrained_model_name_or_path=pretrain_torch_model_or_path)
model = ErnieLayoutForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=pretrain_torch_model_or_path,
    config=config,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'ErnieLayoutTokenizerFast'.
You are using a model of type xlnet to instantiate a model of type ernie_layout. This is not supported for all configurations of models and can yield errors.
Some weights of ErnieLayoutForTokenClassification were not initialized from the model checkpoint at Norm/ERNIE-Layout-Pytorch and are newly initialized: ['ernie_layout.visual.backbone.batch_norm1.num_batches_tracked', 'classifier.bias', 'ernie_layout.visual.backbone.resnet.layer0.1.batch_norm2.num_batches_tracked', 'ernie_layout.visual.backbone.resnet.layer0.0.batch_norm1.num_batches_tracked', 'ernie_layout.visual.backbone.resnet.layer0.2.batch_norm3.num_batches_tracked', 'ernie_layout.visual.backbone.resnet.layer0.1.batch_norm1.num_batc

In [14]:
imgs = []
ds_dir = 'datasets/invoices_limited_35'
for img_name in os.listdir(f'{ds_dir}/testing_data/images/'):
    imgs.append(visualize_predict(f'{ds_dir}/testing_data/images/' + img_name, model, processor, tokenizer))

In [18]:

tokenizer.tokenize("департамент московоской области замосковерцкого района долговязово")

TypeError: PreTokenizedEncodeInput must be Union[PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]]

## LayoutLMv2 FUNSD

In [6]:
import os
from matplotlib import pyplot as plt
import cv2
import numpy as np
from PIL import Image
from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2TokenizerFast, LayoutLMv2Processor, \
    LayoutLMv2ForTokenClassification

labels = ["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"]
id2label = {v: k for v, k in enumerate(labels)}
label2id = {k: v for v, k in enumerate(labels)}

feature_extractor = LayoutLMv2FeatureExtractor(ocr_lang="eng", prepare_pic=None, 
                                               tesseract_config='-l eng --psm 6 -c tessedit_char_blacklist="_-<>|/\„”>=[]{}%*^®()»›`"')
tokenizer = LayoutLMv2TokenizerFast.from_pretrained("nielsr/layoutlmv2-finetuned-funsd")

processor = LayoutLMv2Processor(feature_extractor, tokenizer)
model = LayoutLMv2ForTokenClassification.from_pretrained('nielsr/layoutlmv2-finetuned-funsd')

imgs = []
ds_dir = 'raw_data/eng_report_for_test'
for img_name in os.listdir(f'{ds_dir}'):
    imgs.append(visualize_predict(f'{ds_dir}/' + img_name, model, processor, tokenizer))



## Visualize

In [8]:
from ipywidgets import interact, interact_manual
@interact
def vis(index=(0, len(imgs)-1)):
    for key in imgs[index][1]:
        print(f'{key.upper()}: {imgs[index][1][key]}')
    plt.figure(figsize = (10,12))
    plt.imshow(imgs[index][0], aspect='auto')
    plt.show()

interactive(children=(IntSlider(value=0, description='index', max=1), Output()), _dom_classes=('widget-interac…

In [8]:
import ipywidgets as widgets


In [74]:
import json

with open('results-glad-gorge-6-8451-lmv2-35-9.json', 'r') as f:
    data = json.load(f)

In [80]:
for k in data:
    if k[-2:] == 'f1':
        print(f'{k}: {data[k]}')

eval/сего к оплате_f1: 0
eval/ИК банка получателя_f1: 1
eval/оставщик_f1: 0.9411764705882352
eval/стальное_f1: 0.8263473053892216
eval/окупатель_f1: 0.9333333333333332
eval/чет банка_f1: 0.823529411764706
eval/анк получателя_f1: 0.75
eval/омер счета_f1: 0.9333333333333332
eval/олучатель_f1: 0.5714285714285713
eval/НН получателя_f1: 1
eval/ата_f1: 0.8750000000000001
eval/overall_f1: 0.8394366197183099
eval/ПП получателя_f1: 1
eval/чет получателя_f1: 0.8888888888888888


In [None]:
"""
eval/Всего к оплате_f1: 0
eval/БИК банка получателя_f1: 1
eval/Поставщик_f1: 0.9411764705882352
eval/Остальное_f1: 0.8263473053892216
eval/Покупатель_f1: 0.9333333333333332
eval/Счет банка_f1: 0.823529411764706
eval/Банк получателя_f1: 0.75
eval/Номер счета_f1: 0.9333333333333332
eval/Получатель_f1: 0.5714285714285713
eval/ИНН получателя_f1: 1
eval/Дата_f1: 0.8750000000000001
eval/overall_f1: 0.8394366197183099
eval/КПП получателя_f1: 1
eval/Счет получателя_f1: 0.8888888888888888
"""