Check the outputs of the TensorFlow 2 LayoutLMv2 implementation against the Hugging Face (PyTorch) reference model.

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Python dependencies installed for the output-check section.
!pip install tensorflow-addons
!pip install transformers
!pip install pytesseract
!pip install tesseract
!pip install pyyaml==5.1
# Pre-built detectron2 wheel; the index URL targets CUDA 10.2 / torch 1.9.
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html
# System Tesseract packages (presumably required by pytesseract for OCR -- TODO confirm).
!sudo apt update
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev
# TF2 re-implementation under test and the PyTorch checkpoint it loads.
!git clone https://github.com/RookieZB/bert_implementation_by_tf2.git
!wget https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/pytorch_model.bin

In [None]:
import os
import time
import numpy as np
import tensorflow as tf
from PIL import Image
from transformers import LayoutLMv2Processor, LayoutLMv2Model
from bert_implementation_by_tf2.modules import layoutlm

In [None]:
# Reference run: Hugging Face PyTorch LayoutLMv2 on one sample page.
doc_image = Image.open('./drive/My Drive/Python/Research/tasks/datasets/doc/test.PNG').convert('RGB')
layout_torch = LayoutLMv2Model.from_pretrained('microsoft/layoutlmv2-base-uncased')
feature_processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')

# Encode the page as PyTorch tensors.
input_feature = feature_processor(doc_image, max_length=512, return_tensors='pt')

# Time one forward pass; printing the output stays inside the timed window.
check_point = time.time()
torch_output = layout_torch(**input_feature)
print(torch_output)
print(time.time()-check_point)

In [None]:
# Same page through the TensorFlow model, with weights read from the PyTorch checkpoint file.
layout_model = layoutlm.LayoutLM('layoutlmv2-base')
layout_model.loading('pytorch_model.bin')

# Encode the page again, this time as NumPy arrays.
input_feature = feature_processor(doc_image, max_length=512, return_tensors='np')

# Time one forward pass; input conversion and printing stay inside the timed window.
check_point = time.time()
tf_output = layout_model.propagating(
  # Axis 1 is moved to the end (presumably channels-first -> channels-last -- TODO confirm).
  tf.cast(np.transpose(input_feature['image'], [0, 2, 3, 1]), tf.float32),
  input_feature['input_ids'],
  input_feature['bbox'],
  input_feature['token_type_ids'],
  # Inverted attention mask: 1 where the processor's mask is 0.
  1.-input_feature['attention_mask'])
print(tf_output[1])
print(time.time()-check_point)

Finetune LayoutLMv2 on FUNSD.
*   Need to download the dataset from https://guillaumejaume.github.io/FUNSD.
*   The data processing follows https://github.com/microsoft/unilm/tree/master/layoutlmft.
*   Samples longer than max length are truncated for simplicity.

In [None]:
# Python dependencies installed for the FUNSD section.
!pip install tensorflow-addons
!pip install transformers
!pip install seqeval
# TF2 re-implementation, the FUNSD archive, and the pretrained checkpoint + vocabulary.
!git clone https://github.com/RookieZB/bert_implementation_by_tf2.git
!wget https://guillaumejaume.github.io/FUNSD/dataset.zip
!wget https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/pytorch_model.bin
!wget https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/vocab.txt
# Unpack the archive; later cells read from dataset/training_data and dataset/testing_data.
!unzip dataset.zip

In [None]:
import os
import time
import json
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from PIL import Image
from transformers import LayoutLMv2FeatureExtractor
from seqeval import metrics
from bert_implementation_by_tf2 import mymodels
from bert_implementation_by_tf2.modules import layoutlm, adamw

In [None]:
# Settings for the FUNSD fine-tuning run.
MODEL = 'layoutlmv2-base'  # model name handed to layoutlm.LayoutLM
CKPT = 'pytorch_model.bin'  # checkpoint file passed to the model's loading()
VOCAB = 'vocab.txt'  # vocabulary file loaded by the tokenizer
USECRF = False  # True adds a CRF layer and switches training to the CRF loss
MAXLEN = 512  # token sequences are truncated / zero-padded to this length
DROP = 0.1  # dropout rate of the classification head
LRATE = 5e-5  # second argument given to adamw.AdamW
BATCH = 6  # batch size of the tf.data pipelines
EPOCH = 10  # number of passes over the training set

In [None]:
def bbox_normalizing(bbox, size):
  """Rescale a four-number box to integers on a 0-1000 grid.

  Entries 0 and 2 of `bbox` are divided by `size[1]`, entries 1 and 3 by
  `size[0]`; each quotient is scaled by 1000 and truncated with `int`.
  No clamping is applied, so values outside 0-1000 are possible when the
  box lies outside the given size.

  Args:
    bbox: sequence whose first four entries are the box coordinates.
    size: pair `(size[0], size[1])` used as the divisors described above.

  Returns:
    A list of four ints.
  """
  first, second = size[0], size[1]
  # Divisor for each coordinate, in coordinate order.
  divisors = (second, first, second, first)
  return [int(1000*bbox[k]/divisors[k]) for k in range(4)]


def image_loading(path):
  """Read an image file as an RGB array.

  Args:
    path: location of the image file.

  Returns:
    (pixels, size): the RGB pixel array and a tuple holding the first two
    entries of its shape.
  """
  pixels = np.array(Image.open(path).convert('RGB'))
  dim0, dim1 = pixels.shape[0], pixels.shape[1]
  return pixels, (dim0, dim1)


def data_loading(path, training=False):
  """Build a batched tf.data.Dataset from a FUNSD-style folder.

  Every `<path>/annotations/<name>.json` file is paired with
  `<path>/images/<name>.png`. Each annotated word is tokenized with the
  module-level `bert_tokenizer`; every sub-token receives the tag and the
  normalized box of the word it came from. Entities labelled 'other' are
  tagged 'O'; for any other label the first sub-token of the entity gets
  'B-<LABEL>' and all remaining ones 'I-<LABEL>'. Sequences are wrapped with
  the ids 101 / 102, truncated to MAXLEN and zero-padded.

  Relies on module-level names: bert_tokenizer, feature_extractor, label_map,
  MAXLEN and BATCH.

  Args:
    path: folder containing the `annotations` and `images` sub-folders.
    training: when True the dataset is shuffled before batching.

  Returns:
    (ids, tags, dataset) where ids are the annotation file names without the
    `.json` suffix, tags are the per-sample tag strings without the two
    wrapper positions (at most MAXLEN-2 each), and dataset yields batches of
    (image, token ids, boxes, mask, one-hot tags). The mask is 0 on real
    positions and 1 on padding.

  Raises:
    ValueError: if no annotation file is found under `path`.
  """
  idx2, image2, tok2, box2, tag2, tag3, mask2 = [], [], [], [], [], [], []

  # os.listdir order is arbitrary; sorting makes the sample order reproducible.
  for file1 in sorted(os.listdir(path+'/annotations')):
    # Ignore stray non-annotation files instead of failing inside json.load.
    if not file1.endswith('.json'):
      continue

    # The context manager closes the file as soon as it has been parsed.
    with open(path+'/annotations/'+file1, 'rb') as f1:
      ann1 = json.load(f1)

    # Only the extension is swapped; str.replace would also rewrite a 'json' inside the stem.
    idx1, tok1, tag1, box1 = file1[:-5], [], [], []
    image1, size1 = image_loading(path+'/images/'+idx1+'.png')

    for item1 in ann1['form']:
      label1 = item1['label']
      w1 = [i1 for i1 in item1['words'] if i1['text'].strip() != '']
      first1 = True

      for i1 in w1:
        t1 = bert_tokenizer.encoding(i1['text'], bos=False, sep=False, pad=False)[0]

        # A word without sub-tokens must not emit a tag, or tags and tokens drift apart.
        if len(t1) == 0:
          continue

        n1 = bbox_normalizing(i1['box'], size1)

        # Diagnostic for boxes that fall outside the 0-1000 grid.
        if np.max(n1) > 1000:
          print(size1)
          print(i1['box'])
          print('***')

        if label1 == 'other':
          b1 = ['O']*len(t1)
        else:
          # 'B-' only on the first sub-token of the entity, 'I-' everywhere else.
          b1 = [('B-' if first1 else 'I-')+label1.upper()]+['I-'+label1.upper()]*(len(t1)-1)

        first1 = False
        tok1.extend(t1)
        tag1.extend(b1)
        box1.extend([n1]*len(t1))

    # Wrap with the two special ids, truncate to MAXLEN, then pad with zeros.
    tok1 = [101]+tok1[:MAXLEN-2]+[102]
    len1 = len(tok1)
    pad1 = MAXLEN-len1
    tok1 = tok1+[0]*pad1
    box1 = [[0,0,0,0]]+box1[:MAXLEN-2]+[[0,0,0,0]]*(pad1+1)
    tag1 = ['O']+tag1[:MAXLEN-2]+['O']
    mask1 = [0]*len1+[1]*pad1

    idx2.append(idx1)
    image2.append(feature_extractor(image1)['pixel_values'])
    tok2.append(tok1)
    box2.append(box1)
    mask2.append(mask1)
    tag2.append(tag1[1:-1])
    tag3.append([label_map[a1] for a1 in tag1+['O']*pad1])

  if len(idx2) == 0:
    raise ValueError('No annotation files found in '+path+'/annotations')

  # Axis 1 of the extracted pixel values is moved to the end before casting to float32.
  image2 = tf.cast(np.transpose(np.concatenate(image2, 0), [0, 2, 3, 1]), tf.float32)
  tok2 = tf.cast(np.array(tok2), tf.int32)
  box2 = tf.cast(np.array(box2), tf.int32)
  mask2 = tf.cast(np.array(mask2), tf.int32)
  # Indexing the identity matrix turns the label ids into one-hot rows.
  tag3 = tf.cast(np.eye(len(label_map.keys()))[tag3], tf.float32)
  data1 = tf.data.Dataset.from_tensor_slices((image2, tok2, box2, mask2, tag3))
  data1 = data1.shuffle(tok2.shape[0]) if training else data1
  return idx2, tag2, data1.batch(BATCH)


# Tag string -> class id; the insertion order also defines the id -> tag lookup used at prediction time.
label_map = {
  'O': 0,
  'B-HEADER': 1,
  'I-HEADER': 2,
  'B-QUESTION': 3,
  'I-QUESTION': 4,
  'B-ANSWER': 5,
  'I-ANSWER': 6}
# OCR is switched off: words and boxes come from the dataset annotations.
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
bert_tokenizer = mymodels.Tokenizer(True, True, False)
bert_tokenizer.loading(VOCAB)
# training=True shuffles the training batches; the returned ids and labels keep file order.
train_idx, train_label, train_set = data_loading('dataset/training_data', True)
# The evaluation split stays unshuffled so predictions line up with dev_label.
dev_idx, dev_label, dev_set = data_loading('dataset/testing_data', False)

In [None]:
class LayoutSeq(keras.Model):
  """LayoutLM encoder followed by dropout and a dense per-token classifier.

  With `crf=True` the dense layer has no activation and a `mymodels.CRF`
  layer is created alongside it; otherwise the dense layer applies softmax
  and `self.crf` is None.
  """

  def __init__(self, model, ckpt, drop, category, crf=False, maxlen=512):
    """
    Args:
      model: model name given to layoutlm.LayoutLM.
      ckpt: checkpoint path given to the encoder's loading().
      drop: dropout rate in front of the classifier.
      category: number of output classes.
      crf: whether to create a CRF layer.
      maxlen: number of leading sequence positions kept for classification.
    """
    super().__init__()
    self.maxlen = maxlen
    self.layout = layoutlm.LayoutLM(model)
    self.layout.loading(ckpt)
    self.drop = keras.layers.Dropout(drop)
    head_activation = None if crf else 'softmax'
    self.dense = keras.layers.Dense(category, head_activation)
    self.crf = mymodels.CRF(category) if crf else None

  def propagating(self, image, text, bbox, segment=None, mask=None, training=False):
    """Return per-token class scores for the first `maxlen` sequence positions."""
    encoded = self.layout.propagating(image, text, bbox, segment, mask, training)
    # Element 1 of the encoder result is the per-position output; only the leading positions are kept.
    kept = encoded[1][:, :self.maxlen, :]
    kept = self.drop(kept, training=training)
    return self.dense(kept)


class ModelSeq(object):
  """Turns the scores of a sequence-labelling model into lists of tag strings."""

  def __init__(self, model, label):
    """
    Args:
      model: object exposing `propagating(...)` and a `crf` attribute.
      label: mapping whose key order gives the tag string of each class id.
    """
    self.model = model
    self.crf = self.model.crf is not None
    self.map = list(label.keys())

  def predicting(self, image, text, bbox, mask):
    """Predict tag strings for one batch.

    Args:
      image, text, bbox: forwarded unchanged to the model.
      mask: per-sample mask; the first 1 in a row marks where the row is cut.

    Returns:
      One list of tag strings per sample, with the first and last kept
      position dropped.
    """
    scores = self.model.propagating(image, text, bbox, mask=mask, training=False)

    if self.crf:
      return [
        [self.map[int(tag)] for tag in path][1:-1]
        for path in self.model.crf.decoding(scores, mask)]

    decoded = []

    for row, ids in enumerate(np.argmax(scores, 2).tolist()):
      # Index of the first 1 in the mask; the appended 1 covers rows that contain none.
      stop = np.argmax(np.array(mask[row]).tolist()+[1])
      decoded.append([self.map[k] for k in ids[:stop]][1:-1])

    return decoded


@tf.function
def step_training(image, text, box, mask, y):
  """Run one optimisation step on a batch and record its loss in `train_loss`.

  Uses the module-level layout_model, loss_function, adam_optimizer,
  train_loss and USECRF. Returns nothing.
  """
  with tf.GradientTape() as tape:
    scores = layout_model.propagating(image, text, box, mask=mask, training=True)

    if USECRF:
      batch_loss = tf.reduce_mean(layout_model.crf.calculating(y, scores, mask))
    else:
      # mask is 1 on padding, so (1-mask) selects the positions that count.
      keep = tf.cast((1-mask), tf.float32)
      per_token = loss_function(y, scores)
      batch_loss = tf.reduce_sum(per_token*keep)/tf.reduce_sum(keep)

  weights = layout_model.trainable_variables
  # Gradients are clipped to a global norm of 1.0 before the update.
  clipped, _ = tf.clip_by_global_norm(tape.gradient(batch_loss, weights), 1.0)
  adam_optimizer.apply_gradients(zip(clipped, layout_model.trainable_variables))
  train_loss(batch_loss)


def step_evaluating(data, label):
  """Predict every batch of `data` with `ner_model` and score against `label`.

  Args:
    data: iterable of (image, text, bbox, mask, tags) batches; the tags are ignored.
    label: gold tag sequences in the same order as the samples in `data`.

  Returns:
    (f1, report) as computed by seqeval's f1_score and classification_report.
  """
  predicted = []

  for images, tokens, boxes, masks, _ in data:
    predicted.extend(ner_model.predicting(images, tokens, boxes, masks))

  f1 = metrics.f1_score(label, predicted)
  report = metrics.classification_report(label, predicted)
  return f1, report


# Model, tag decoder, optimiser, loss and running-loss metric used by the training cells.
layout_model = LayoutSeq(MODEL, CKPT, DROP, len(label_map.keys()), USECRF, MAXLEN)
ner_model = ModelSeq(layout_model, label_map)
# Ceiling division gives the exact number of batches per epoch; `n//BATCH+1` counts one
# batch too many whenever the sample count is a multiple of BATCH.
# NOTE(review): assumes AdamW's first argument is the total number of steps -- confirm.
adam_optimizer = adamw.AdamW(EPOCH*(-(-len(train_idx)//BATCH)), LRATE)
# Reduction NONE keeps one loss value per token so padding can be masked out later.
loss_function = keras.losses.CategoricalCrossentropy(reduction=keras.losses.Reduction.NONE)
train_loss = tf.keras.metrics.Mean(name='training_loss')

In [None]:
temp_a = 'Training loss is {:.4f}.'
temp_b = 'Dev F1 score is {:.4f}.'
batch_count = 0

for e_1 in range(EPOCH):
  print('Epoch {} running.'.format(e_1+1))
  # Clear the running mean so the printed loss reflects this epoch only,
  # not an average over every batch since training started.
  train_loss.reset_state()

  for x_image, x_text, x_box, x_mask, y_tag in train_set:
    batch_count = batch_count+1
    step_training(x_image, x_text, x_box, x_mask, y_tag)

    # batch_count keeps counting across epochs, so the loss is printed every fifth batch overall.
    if batch_count % 5 == 0:
      print(temp_a.format(float(train_loss.result())))

  # Evaluate on the dev split after each epoch.
  f_score, _ = step_evaluating(dev_set, dev_label)
  print(temp_b.format(f_score))
  print('**********')

In [None]:
# Final pass over the dev split; show the per-class report.
f_score, c_metrics = step_evaluating(data=dev_set, label=dev_label)
print(c_metrics)

Fine-tune LayoutLMv2 on the DocVQA dataset (work in progress, not finished yet).
*   Need to download the dataset from https://rrc.cvc.uab.es/.
*   The data processing follows https://github.com/anisha2102/docvqa.

In [None]:
# Python dependencies installed for the DocVQA section.
!pip install tensorflow-addons
!pip install transformers
# TF2 re-implementation, the pretrained checkpoint, and the third-party dataset-creation script.
!git clone https://github.com/RookieZB/bert_implementation_by_tf2.git
!wget https://huggingface.co/microsoft/layoutlmv2-base-uncased/resolve/main/pytorch_model.bin
!wget https://raw.githubusercontent.com/anisha2102/docvqa/master/create_dataset.py
# Copy the dataset archives from Drive (assumes Drive is already mounted under ./drive) and unpack them.
!cp -r drive/MyDrive/Python/Research/tasks/data/doc_vqa ./
!tar -xvf doc_vqa/train.tar.gz
!tar -xvf doc_vqa/val.tar.gz
!tar -xvf doc_vqa/test.tar.gz

In [None]:
import os
import time
import json
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from transformers import LayoutLMv2Processor
from bert_implementation_by_tf2.modules import layoutlm, adamw

In [None]:
# Settings for the DocVQA run. Only MODEL, CKPT and BEAM are referenced by the cells
# visible in this section so far.
MODEL = 'layoutlmv2-base'  # model name handed to layoutlm.LayoutLM
CKPT = 'pytorch_model.bin'  # checkpoint file passed to the model's loading()
MAXLEN = 512  # presumably the maximum token sequence length -- TODO confirm once used
ANSLEN = 30  # presumably the maximum answer span length -- TODO confirm once used
LRATE = 5e-5  # presumably the learning rate -- TODO confirm once used
EPOCH = 2  # presumably the number of training epochs -- TODO confirm once used
BATCH = 64  # presumably the batch size -- TODO confirm once used
BEAM = 20  # number of start / end candidates kept by LayoutQA outside training

In [None]:
class LayoutQA(keras.Model):
  """LayoutLM encoder with start / end span heads for extractive question answering.

  The start head scores every position. The end head scores every position
  conditioned on a start position: the gold one during training, the `beam`
  best predicted ones otherwise.
  """

  def __init__(self, model, ckpt, beam):
    """
    Args:
      model: model name given to layoutlm.LayoutLM.
      ckpt: checkpoint path given to the encoder's loading().
      beam: number of start candidates (and end candidates per start) kept outside training.
    """
    super(LayoutQA, self).__init__()
    self.layout = layoutlm.LayoutLM(model)
    self.layout.loading(ckpt)
    self.beam, self.dim = beam, self.layout.param['hidden_size']
    self.startdense = keras.layers.Dense(1)
    self.mergedense = keras.layers.Dense(512, activation=layoutlm.gelu_activating)
    self.enddense = keras.layers.Dense(1)
    # Zero-initialised vector of size hidden_size registered on the embedding layer as `ve`.
    self.layout.emb.ve = self.layout.emb.add_weight(
      'layoutlmv2.visual_segment_embedding', [self.dim], None, keras.initializers.Zeros())

  def propagating(self, image, text, bbox, segment, mask, training=True, start=None):
    """Score answer spans.

    Args:
      image, text, bbox, segment: encoder inputs.
      mask: 1 on real tokens, 0 on padding (it is inverted before reaching the encoder).
      training: True returns the training outputs, False the beam-search outputs.
      start: gold start positions; required when `training` is True.

    Returns:
      training=True: (start log-probs, end log-probs), each (batch, length).
      training=False: (start log-probs, end log-probs per start candidate,
      top start log-probs, top start indices, top end log-probs, top end indices).

    Raises:
      ValueError: if `training` is True and `start` is None.
    """
    if training and start is None:
      raise ValueError('start positions are required when training is True')

    length1 = mask.shape[1]
    # Element 1 of the encoder result is the per-position output, as in the other call
    # sites of this notebook; it is cut to the text length so it lines up with `mask`.
    seq1 = self.layout.propagating(image, text, bbox, segment, 1.-mask, training)[1][:, :length1, :]
    # Allowed answer positions: unpadded tokens with a non-zero segment id, plus position 0.
    mask1 = tf.cast(mask*segment, tf.float32)+tf.one_hot(0, length1)
    # Disallowed positions get -1000 added to their score before the softmax.
    start1 = self.startdense(seq1)[:, :, 0]+1000.0*(mask1-1)
    start2 = tf.nn.log_softmax(start1)

    if training:
      end0 = seq1
      # Pick the encoder vector at the gold start and repeat it along the sequence.
      index1 = tf.one_hot(start, depth=length1, axis=-1, dtype=tf.float32)
      feat1 = tf.reduce_sum(tf.expand_dims(index1, -1)*seq1, axis=1)
      feat1 = tf.tile(tf.expand_dims(feat1, 1), [1, length1, 1])
      end1 = tf.concat([feat1, end0], -1)
      end1 = self.enddense(self.mergedense(end1))[:, :, 0]
      end1 = end1+1000.0*(mask1-1)
      end2 = tf.nn.log_softmax(end1)
      return start2, end2
    else:
      # Keep the `beam` best starts and score every end position for each of them.
      prob0, index0 = tf.nn.top_k(start2, k=self.beam)
      end0 = tf.tile(tf.expand_dims(seq1, 1), [1, self.beam, 1, 1])
      index1 = tf.one_hot(index0, depth=length1, axis=-1, dtype=tf.float32)
      feat1 = tf.reduce_sum(tf.expand_dims(seq1, 1)*tf.expand_dims(index1, -1), axis=-2)
      feat1 = tf.tile(tf.expand_dims(feat1, 2), [1, 1, length1, 1])
      end1 = tf.concat([feat1, end0], -1)
      end1 = self.enddense(self.mergedense(end1))[:, :, :, 0]
      end1 = end1+tf.expand_dims(1000.0*(mask1-1), 1)
      end2 = tf.nn.log_softmax(end1)
      prob1, index1 = tf.nn.top_k(end2, k=self.beam)
      return start2, end2, prob0, index0, prob1, index1


def loss_computing(predstart, predend, start, end):
  """Average of the start and end losses for a batch.

  Each loss is the negated batch mean of the score found at the gold
  position, so the inputs are treated as per-position log-probabilities.

  Args:
    predstart, predend: (batch, length) score tensors.
    start, end: gold position indices.

  Returns:
    Scalar tensor: (start loss + end loss) / 2.
  """
  positions = predstart.shape[1]

  def picked_nll(scores, gold):
    # One-hot selection of the gold position, then negated mean over the batch.
    selector = tf.one_hot(gold, depth=positions, dtype=tf.float32)
    return -tf.reduce_mean(tf.reduce_sum(selector*scores, axis=-1))

  start_loss = picked_nll(predstart, start)
  end_loss = picked_nll(predend, end)
  return (start_loss+end_loss)/2.0


@tf.function
def step_training(image, text, bbox, segment, mask, start, end):
  """Run one optimisation step of the QA model and record its loss in `train_loss`.

  Uses the module-level layout_model, adam_optimizer and train_loss.
  Returns nothing.
  """
  with tf.GradientTape() as tape:
    start_scores, end_scores = layout_model.propagating(image, text, bbox, segment, mask, True, start)
    batch_loss = loss_computing(start_scores, end_scores, start, end)

  weights = layout_model.trainable_variables
  # Gradients are clipped to a global norm of 1.0 before the update.
  clipped, _ = tf.clip_by_global_norm(tape.gradient(batch_loss, weights), 1.0)
  adam_optimizer.apply_gradients(zip(clipped, layout_model.trainable_variables))
  train_loss(batch_loss)


# Build the QA model from the configured model name, checkpoint file and beam width.
layout_model = LayoutQA(model=MODEL, ckpt=CKPT, beam=BEAM)