In [1]:
import os
import pickle
from tqdm import tqdm
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import layoutparser as lp
import cv2

import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array

from tensorflow import keras
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, GlobalAvgPool2D, GlobalAvgPool1D, BatchNormalization, MultiHeadAttention, Layer, LayerNormalization, Concatenate, MaxPool1D
from tensorflow.keras.applications import InceptionResNetV2
from tensorflow.keras.applications.inception_resnet_v2 import preprocess_input
from tensorflow.keras.regularizers import l2

from tensorflow.keras.optimizers import Adam

from sklearn.metrics import precision_recall_fscore_support as score

### SET THE PATHS

Set the following paths:
- `MODEL_PATH`: The directory which contains the trained model weights.
- `DATA_PATH`: The directory path which contains the images.
- `PREDICTED_LABELS_PATH`: The directory where the predicted labels are saved (as `predicted_label.csv`).

In [2]:
# Directory containing the trained model weights (test() joins it with the checkpoint name).
MODEL_PATH = './Pretrained-Model'
# Directory containing the images to classify; test() treats every entry of this directory as an image.
DATA_PATH = './Data/content/data/noise-added'
# Directory in which test() writes 'predicted_label.csv'.
PREDICTED_LABELS_PATH = './Data/content'

### SET A FEW MORE PARAMETERS

Set the parameters:
- `BATCH_SIZE`: Must divide the number of images in the `DATA_PATH` directory.
- `CHECKPOINT_EACH`: Must be a multiple of `BATCH_SIZE` and must divide the number of images in the `DATA_PATH` directory.

**Example**:
Suppose the `DATA_PATH` folder contains $140$ images. Then `BATCH_SIZE` can be $70$, because $70$ divides $140$. `CHECKPOINT_EACH` can then be $70$ or $140$: it must be a multiple of `BATCH_SIZE` that also divides $140$, so $210$ or anything larger is ruled out ($210 > 140$). Ideally, `CHECKPOINT_EACH` should be greater than `BATCH_SIZE`.

Warning:
Setting `BATCH_SIZE` too high can cause out-of-memory errors.

In [3]:
# Number of images processed per batch; test() iterates over len(images) // BATCH_SIZE batches.
BATCH_SIZE = 110
# Number of images whose ROI features go into one temporary .npy file;
# must be a multiple of BATCH_SIZE (asserted in the next cell).
CHECKPOINT_EACH = 110

Once done, run all cells.

#### NOTHING TO CHANGE BELOW, SIMPLY RUN

In [4]:
# (height, width) every ROI crop is resized/padded to; also the input shape of model_roi.
INPUT_SHAPE_ROI = (400, 300)
# Fixed number of ROI slots per document; per_document() pads shorter documents up to this length.
MAX_ROIS = 100
# (height, width) the full page is resized to in split_4_pieces().
INPUT_SIZE = (800, 600)
# Margin, in pixels, added on every side of an ROI box before cropping (clamped to the image bounds).
ROI_PAD = 5
# Length of one pooled feature vector; the zero rows used to pad missing ROIs are built with this width.
FEATURE_DIM = 1536
# Input (height, width) of the full-page branch of model_4P; same value as INPUT_SIZE.
INPUT_SHAPE = (800, 600)
# Input shapes of the top and bottom 250-row strips fed to model_4P.
INPUT_HDR_SHAPE = INPUT_FTR_SHAPE = (250, 600)
# Input shapes of the left and right 300-column pieces of the middle band fed to model_4P.
INPUT_BDL_SHAPE = INPUT_BDR_SHAPE = (300, 300)
# Base of the frequency denominator used in pos_enc().
POS_ENC_ANGLE_DENO = 10000
# Number of attention heads and feed-forward width passed to TransformerBlock in ftmodel().
NUM_HEADS = 4
FF_DIM = FEATURE_DIM // 4
# Sanity checks on the user-chosen parameters from the cell above.
assert CHECKPOINT_EACH % BATCH_SIZE == 0
assert CHECKPOINT_EACH >= BATCH_SIZE
# Number of batches whose ROI features are written into one temporary checkpoint file.
checkpnt_i = CHECKPOINT_EACH // BATCH_SIZE

In [5]:
# Region-type name -> integer id; the inverse of the label_map handed to the layout
# detector further down. NOTE(review): not referenced by any code visible in this notebook.
roi_type_to_num = {"TextRegion":1, "ImageRegion":2, "TableRegion":3, "MathsRegion":4, "SeparatorRegion":5, "OtherRegion":6}

def per_document(doc_data, MAX_ROIS=MAX_ROIS):
    """Turn one document's ROI boxes into a fixed-size table with a validity column.

    Args:
        doc_data: sequence whose first element is the list of ROI boxes of the
            document, each box being 4 coordinates.
        MAX_ROIS: number of rows in the returned table. Documents with more
            boxes are truncated, documents with fewer are zero-padded.

    Returns:
        Array of shape (1, MAX_ROIS, 5). Columns 0-3 are the box coordinates,
        column 4 is 1 for a real ROI and 0 for a padding row.
    """
    boxes = doc_data[0]
    if len(boxes) == 0:
        # No ROI at all: every row is padding (all zeros, including the mask column).
        return np.zeros((MAX_ROIS, 5), dtype=int)[np.newaxis, :]

    x = np.array(boxes)
    # NOTE(review): the original code called this ordering "top_down", but column 0 is
    # unpacked as the x coordinate elsewhere in this file, so this sorts left-to-right.
    # Left unchanged because the trained weights presumably saw this ordering - confirm.
    x = x[np.argsort(x[:, 0])]

    # Use the MAX_ROIS argument everywhere (the limit used to be a hard-coded 100,
    # which made any other MAX_ROIS value fail in np.pad or produce a wrong shape).
    num_valid = min(x.shape[0], MAX_ROIS)
    if x.shape[0] > MAX_ROIS:
        x = x[:MAX_ROIS]
    else:
        x = np.pad(x, pad_width=((0, MAX_ROIS - x.shape[0]), (0, 0)), constant_values=0)
    pad_mask = np.array([[1] * num_valid + [0] * (MAX_ROIS - num_valid)])
    doc = np.concatenate([x, pad_mask.T], axis=-1)
    return doc[np.newaxis, :]

def ROI_Extractor(image_path):
    """Detect layout regions in one image and return them in per-document format.

    The image is read with OpenCV, passed to the module-level
    ``model_roi_extracter`` detector, and the ``coordinates`` of every detected
    block are handed to ``per_document``.
    """
    page = cv2.imread(image_path)
    detected_blocks = model_roi_extracter.detect(page)
    boxes = [region.coordinates for region in detected_blocks]
    return per_document([boxes])

def split_4_pieces(image_path):
    """Load a page, resize it to INPUT_SIZE and cut it into the 4+1 model inputs.

    Returns a list of five preprocessed tensors, each with a leading batch axis
    of size 1, in this order: top 250 rows, left 300 columns of the middle band,
    right 300 columns of the middle band, bottom 250 rows, full page.
    """
    # any additional pre-processing / data augmentation would be applied to `page` here
    page = tf.image.resize(img_to_array(load_img(image_path)), INPUT_SIZE)
    crops = (
        page[:250],
        page[250:-250, :300],
        page[250:-250, -300:],
        page[-250:],
        page,
    )
    return [preprocess_input(crop)[tf.newaxis, :] for crop in crops]


def resize_and_pad_with_doc_max_col(img, f_target_height=INPUT_SHAPE_ROI[0], f_target_width=INPUT_SHAPE_ROI[1]):
    """Fit `img` into a (f_target_height, f_target_width) canvas and fill the padding with the image maximum.

    The image is resized with `tf.image.resize_with_pad`, then the bands that
    correspond to the padding are overwritten with the largest pixel value found
    in the resized image (called `white_color` below).

    Args:
        img: image array indexed as (height, width, ...); only shape[0] and shape[1] are read here.
        f_target_height: height of the returned canvas.
        f_target_width: width of the returned canvas.

    Returns:
        NumPy array produced by `tf.image.resize_with_pad(...).numpy()` with the padding bands recoloured.
    """
    f_width = img.shape[1]
    f_height = img.shape[0]
    # Scale factor that makes the image fit inside the target in both dimensions.
    ratio = np.max([f_width / f_target_width, f_height / f_target_height])
    resized_height_float = f_height / ratio
    resized_width_float = f_width / ratio
    # NOTE(review): resized_height / resized_width are computed but never used below.
    resized_height = tf.cast(
        np.floor(resized_height_float), dtype=tf.int32)
    resized_width = tf.cast(
        np.floor(resized_width_float), dtype=tf.int32)

    # Expected padding on each side, derived from the ORIGINAL image size.
    padding_height = (f_target_height - resized_height_float) / 2
    padding_width = (f_target_width - resized_width_float) / 2
    f_padding_height = np.floor(padding_height)
    f_padding_width = np.floor(padding_width)
    p_height = np.max([0, tf.cast(f_padding_height, dtype=tf.int32)])
    p_width = np.max([0, tf.cast(f_padding_width, dtype=tf.int32)])
    # Inputs with a side shorter than 10 px are first padded up to at least 10 px per side;
    # the p_height / p_width values above are not recomputed after this step.
    if f_height < 10 or f_width < 10:
      img = tf.image.resize_with_pad(img, max(10, f_height), max(10, f_width))
    resized_padded_image = tf.image.resize_with_pad(img, f_target_height, f_target_width).numpy()
    white_color = np.max(resized_padded_image)
    # Recolour the top/left bands, then the bottom/right bands. The bottom/right slices start
    # one index earlier (the "-1"), so they cover one extra row/column - and always at least
    # the last row and column, even when the computed padding is 0.
    resized_padded_image[:p_height, :] = white_color
    resized_padded_image[(f_target_height-p_height-1):, :] = white_color
    resized_padded_image[:, :p_width] = white_color
    resized_padded_image[:, (f_target_width-p_width-1):] = white_color
    return resized_padded_image

def generate_roi_info(image_path):
    """Return the ROI table of one image.

    Args:
        image_path: path of the image to analyse.

    Returns:
        The first (and only) document entry produced by ``ROI_Extractor``:
        one row per ROI slot, with the validity flag in the last column.
    """
    # The image used to be decoded here into a local that was never read;
    # ROI_Extractor reads the file itself, so that extra decode was pure overhead.
    return ROI_Extractor(image_path)[0]

def split_roi_pieces(image_path):
    """Crop, resize and preprocess every real ROI of one image.

    Each ROI box is enlarged by ROI_PAD pixels on every side (clamped to the
    image bounds), cut out of the page, passed through
    ``resize_and_pad_with_doc_max_col`` and ``preprocess_input``. Rows whose
    validity flag is not 1 are skipped.

    Returns:
        ``np.array`` built from the list of processed crops.
    """
    page = img_to_array(load_img(image_path))
    page_rows, page_cols = page.shape[0], page.shape[1]

    crops = []
    for x_top, y_top, x_bottom, y_bottom, pad_mask in ROI_Extractor(image_path)[0]:
        if pad_mask != 1:
            continue
        row_start = max(0, int(y_top) - ROI_PAD)
        row_stop = min(page_rows, int(y_bottom) + ROI_PAD)
        col_start = max(0, int(x_top) - ROI_PAD)
        col_stop = min(page_cols, int(x_bottom) + ROI_PAD)
        region = page[row_start:row_stop, col_start:col_stop]
        crops.append(preprocess_input(resize_and_pad_with_doc_max_col(region)))
    return np.array(crops)

def rois_to_feature_vecs(roi_data):
    """Encode ROI crops with ``model_roi`` and zero-pad the result to MAX_ROIS rows.

    Args:
        roi_data: the preprocessed ROI crops of one document (as returned by
            ``split_roi_pieces``); may be empty when the page has no ROI.

    Returns:
        Array of shape (1, MAX_ROIS, FEATURE_DIM); rows beyond the number of
        crops are zeros.
    """
    if len(roi_data) == 0:
        # A page without any ROI used to crash inside predict(); return the all-padding
        # result instead, which is what the batch path in test() produces for such pages.
        return np.zeros((1, MAX_ROIS, FEATURE_DIM))

    roi_feature_vecs = model_roi.predict(tf.cast(roi_data, dtype=tf.float32))
    # Release Keras/TF state between calls to keep memory bounded over many images.
    gc.collect()
    tf.keras.backend.clear_session()
    padding = np.zeros((MAX_ROIS - roi_feature_vecs.shape[0], FEATURE_DIM))
    return np.concatenate([roi_feature_vecs, padding])[np.newaxis, :]

def piece4_to_feature_vecs(pieces):
    """Encode the 4+1 page pieces with ``model_4P``.

    The pieces are cast to float32, passed to ``model_4P.predict`` and the
    per-piece outputs are combined into one array whose first two axes are
    swapped, so the sample axis comes first and the piece axis second.
    """
    float_pieces = list(map(lambda piece: tf.cast(piece, dtype=tf.float32), pieces))
    per_piece_features = model_4P.predict(float_pieces)
    # Release Keras/TF state between calls.
    gc.collect()
    tf.keras.backend.clear_session()
    combined = np.concatenate([per_piece_features])
    return np.transpose(combined, (1, 0, 2))

def pos_enc(max_len=MAX_ROIS, d_model=FEATURE_DIM):
    """Build a sinusoidal positional-encoding table.

    Even feature indices hold the sine and odd feature indices the cosine of
    ``position / POS_ENC_ANGLE_DENO ** (2 * (index // 2) / d_model)``.

    Returns:
        float32 tensor of shape (1, max_len, d_model).
    """
    positions = np.arange(max_len)[:, np.newaxis]
    feature_idx = np.arange(d_model)[np.newaxis, :]
    exponents = 2*(feature_idx//2/np.float32(d_model))
    angles = positions / np.power(POS_ENC_ANGLE_DENO, exponents)

    table = np.zeros((max_len, d_model))
    table[:, 0::2] = np.sin(angles[:, 0::2])
    table[:, 1::2] = np.cos(angles[:, 1::2])
    return tf.cast(table[np.newaxis, :], dtype=tf.float32)

class TransformerBlock(Layer):
    """Self-attention block: multi-head attention and a two-layer feed-forward net,
    each followed by dropout, a residual connection and layer normalisation.

    NOTE(review): the attribute names below are probably part of the saved checkpoint
    structure loaded in test(); rename them only after confirming the weights still load.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """Create the sub-layers.

        Args:
            embed_dim: size of the last axis of the block input; also used as `key_dim`
                of the attention layer and as the output width of the feed-forward net.
            num_heads: number of attention heads.
            ff_dim: width of the hidden (ReLU) layer of the feed-forward net.
            rate: dropout rate used after attention and after the feed-forward net.
        """
        super().__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim, kernel_regularizer=l2(5e-5))
        self.ffn = keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, mask, training):
        """Apply the block to `inputs`.

        Args:
            inputs: tensor the block attends over (query, value and key are all `inputs`).
            mask: attention mask forwarded to the attention layer.
            training: forwarded to both dropout layers.
        """
        # Positional arguments are (query, value, key, attention_mask) in the Keras
        # MultiHeadAttention call signature, i.e. plain self-attention with `mask`.
        attn_output = self.att(inputs, inputs, inputs, mask)
        attn_output = self.dropout1(attn_output, training=training)
        # First residual connection + normalisation.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # Second residual connection + normalisation.
        return self.layernorm2(out1 + ffn_output)
    
def ftmodel():
    """Build the classification head that fuses page-piece features with ROI features.

    Inputs of the returned model, in order:
      1. flattened features of the 4+1 page pieces, length 5*FEATURE_DIM;
      2. per-ROI feature vectors, shape (100, FEATURE_DIM);
      3. ROI attention mask, shape (100, 100).
    Output: softmax over 16 classes.

    NOTE(review): the layer structure must stay compatible with the checkpoint loaded in
    test(); do not change the topology without re-checking that the weights still load
    and that predictions are unchanged.
    """
    # Branch 1: page-piece features.
    inputs = Input(shape=(5*FEATURE_DIM))
    x = Flatten()(inputs)
    x = BatchNormalization()(x)
    
    # Branch 2: ROI features. The literal 100 equals MAX_ROIS as currently configured.
    inputs_roi = Input(shape=(100, FEATURE_DIM))
    inputs_roi_mask = Input(shape=(100, 100))
    # NOTE(review): the sum computed on the next line is discarded, because the line after
    # it reassigns `y` from `inputs_roi` again - the positional encoding never reaches the
    # model. Confirm how the checkpoint was trained before "fixing" this.
    y = inputs_roi + pos_enc()
    # With data_format='channels_first' the pooling runs along the last axis, so each ROI
    # vector is shortened (stride 4) rather than the ROI sequence - see Keras MaxPool1D docs.
    y = MaxPool1D(strides=4, data_format='channels_first')(inputs_roi)
    y = TransformerBlock(y.shape[-1], NUM_HEADS, FF_DIM)(y, inputs_roi_mask)
    y = GlobalAvgPool1D()(y)
    
    # Fuse both branches and classify.
    x = Concatenate()([x, y])
    
    top_dropout_rate = 0.3
    x = Dropout(top_dropout_rate, name='top_dropout_2')(x)
    x = Dense(512, name='top_dense_2', kernel_initializer="he_normal", kernel_regularizer=l2(5e-5), activation='relu')(x)
    x = BatchNormalization()(x)
    outputs = Dense(16, activation='softmax', name='pred', kernel_regularizer=l2(5e-5))(x)

    model1 = tf.keras.Model([inputs, inputs_roi, inputs_roi_mask], outputs, name='Inception-ResNet-4Piece-Vision-Transformer')
    return model1

def generate_mask(roi_data):
    """Build a (1, MAX_ROIS, MAX_ROIS) mask for a single document.

    The top-left square whose side is ``roi_data.shape[0]`` is set to 1,
    everything else stays 0.
    """
    num_rois = roi_data.shape[0]
    attention_mask = np.zeros((1, MAX_ROIS, MAX_ROIS))
    attention_mask[0, :num_rois, :num_rois] = 1
    return attention_mask

class ImageDataGenerator:
    """Indexable batch source that yields the preprocessed ROI crops of each image.

    ``gen[i]`` returns ``(X, y)`` for the i-th batch of rows of ``df``: ``X`` is
    a list with one array of ROI crops per image and ``y`` the labels cast to
    float32. ROI boxes are looked up in ``roi_info`` by row position, tracked in
    ``self.idx``.
    """

    def __init__(self, df, X_col, roi_info, y_col, batch_size, roi_pad=5, input_size=(800, 600), shuffle=True, base=0):
        # Work on a private copy so the caller's frame is never modified.
        self.df = df.copy()
        self.n = len(self.df)
        # Column names of the image paths and of the labels.
        self.X_col, self.y_col = X_col, y_col
        # ROI table, indexed by row position.
        self.roi_info = roi_info
        self.batch_size = batch_size
        self.roi_pad = roi_pad
        # Stored for reference only; not read by any method of this class.
        self.input_size = input_size
        self.shuffle = shuffle
        # Position of the next row to read from roi_info.
        self.idx = base

    def __get_input(self, path):
        """Return the list of processed ROI crops for the image at `path`."""
        page = img_to_array(load_img(path))
        boxes = self.roi_info[self.idx]
        self.idx += 1

        margin = self.roi_pad
        page_rows, page_cols = page.shape[0], page.shape[1]
        crops = []
        for x_top, y_top, x_bottom, y_bottom, pad_mask in boxes:
            if pad_mask != 1:
                continue
            row_start = max(0, int(y_top) - margin)
            row_stop = min(page_rows, int(y_bottom) + margin)
            col_start = max(0, int(x_top) - margin)
            col_stop = min(page_cols, int(x_bottom) + margin)
            region = page[row_start:row_stop, col_start:col_stop]
            crops.append(preprocess_input(resize_and_pad_with_doc_max_col(region)))
        return crops

    def __get_data(self, batches):
        """Turn a slice of the frame into (list of crop arrays, float32 labels)."""
        paths = batches[self.X_col]
        labels = batches[self.y_col]
        crops_per_image = [np.array(self.__get_input(path)) for path in paths]
        return crops_per_image, tf.cast(labels, dtype=tf.float32)

    def __getitem__(self, index):
        start = index * self.batch_size
        stop = (index + 1) * self.batch_size
        # Re-anchor the ROI cursor at the first row of the requested batch.
        self.idx = start
        X, y = self.__get_data(self.df[start:stop])
        return X, y

    def __len__(self):
        # Only complete batches are counted.
        return self.n // self.batch_size
    
class ImageDataGenerator_4P(tf.keras.utils.Sequence):
    """Keras Sequence that yields the 4+1 page pieces of each image, batch by batch.

    Every item is a one-element tuple holding a 5-tuple of float32 tensors:
    top strip, left middle piece, right middle piece, bottom strip, full page.
    """

    def __init__(self, df, X_col, batch_size, input_size=(800, 600), shuffle=True):
        # Private copy: on_epoch_end may reshuffle it.
        self.df = df.copy()
        self.n = len(self.df)
        self.X_col = X_col
        self.batch_size = batch_size
        self.input_size = input_size
        self.shuffle = shuffle

    def on_epoch_end(self):
        if not self.shuffle:
            return
        self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __get_input(self, path):
        """Load one page, resize it to `input_size` and return its five preprocessed pieces."""
        page = tf.image.resize(img_to_array(load_img(path)), self.input_size)
        pieces = (
            page[:250],
            page[250:-250, :300],
            page[250:-250, -300:],
            page[-250:],
            page,
        )
        return [preprocess_input(piece) for piece in pieces]

    def __get_data(self, batches):
        """Stack the pieces of every image in the slice, one float32 tensor per piece kind."""
        per_image = [self.__get_input(path) for path in batches[self.X_col]]
        stacked = tuple(
            tf.cast([pieces[k] for pieces in per_image], dtype=tf.float32)
            for k in range(5)
        )
        return (stacked, )

    def __getitem__(self, index):
        start = index * self.batch_size
        stop = (index + 1) * self.batch_size
        return self.__get_data(self.df[start:stop])

    def __len__(self):
        # Only complete batches are counted.
        return self.n // self.batch_size

def generate_roi_infos(image_paths):
    """Run ``generate_roi_info`` on every path (with a progress bar) and concatenate the results.

    Each per-image table gets a leading axis of size 1 before concatenation, so
    the result has one entry per image along its first axis.
    """
    per_image_tables = [
        generate_roi_info(path)[np.newaxis, :] for path in tqdm(image_paths)
    ]
    return np.concatenate(per_image_tables)

def generate_mask_all(roi_data, roi_count):
    """Build one square attention mask per document.

    Args:
        roi_data: array-like whose first two axes are (documents, ROI slots).
        roi_count: number of real ROIs of each document, in document order.

    Returns:
        Array of shape (documents, slots, slots) in which, for document k, the
        top-left ``roi_count[k]`` x ``roi_count[k]`` square is 1 and the rest 0.
    """
    num_docs, num_slots = roi_data.shape[:2]
    mask = np.zeros((num_docs, num_slots, num_slots))
    for doc_idx, num_roi in enumerate(roi_count):
        mask[doc_idx, :num_roi, :num_roi] = 1
    return mask

def final_model(image_path):
    """Predict the class label of a single image end to end.

    The image is split into ROI crops and into the 4+1 page pieces, both are
    encoded into feature vectors, and ``model_4P_ViT`` predicts the class from
    them.

    Args:
        image_path: path of the image to classify.

    Returns:
        Index of the most probable class.
    """
    rois = split_roi_pieces(image_path)
    pieces = split_4_pieces(image_path)
    roi_features = rois_to_feature_vecs(rois)
    pieces_features = np.reshape(piece4_to_feature_vecs(pieces), (1, -1))
    # Build the mask from the un-padded crops: `roi_features` already carries a leading
    # batch axis of size 1, so using it here marked exactly one ROI as valid regardless of
    # how many were detected. test() builds its masks from the real ROI count as well.
    mask = generate_mask(rois)
    out_probs = model_4P_ViT.predict([
        tf.cast(pieces_features, dtype=tf.float32),
        tf.cast(roi_features, dtype=tf.float32),
        tf.cast(mask, dtype=tf.float32),
    ])
    # Release Keras/TF state between calls.
    gc.collect()
    tf.keras.backend.clear_session()
    pred_label = np.argmax(out_probs, axis=-1)
    return pred_label[0]

In [6]:
# Layout detector used by ROI_Extractor(): PrimaLayout Mask R-CNN config, keeping detections
# with a score of at least 0.5 (MODEL.ROI_HEADS.SCORE_THRESH_TEST).
model_roi_extracter = lp.Detectron2LayoutModel('lp://PrimaLayout/mask_rcnn_R_50_FPN_3x/config',
                                 extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
                                 label_map={1:"TextRegion", 2:"ImageRegion", 3:"TableRegion", 4:"MathsRegion", 5:"SeparatorRegion", 6:"OtherRegion"})

# Shared, frozen ImageNet backbone (no classification top).
model_inception_resnet = InceptionResNetV2(include_top=False, weights='imagenet')
model_inception_resnet.trainable = False
# model_roi: one ROI crop of shape INPUT_SHAPE_ROI -> globally average-pooled feature vector.
inp = Input(shape=(*INPUT_SHAPE_ROI, 3))
out = model_inception_resnet(inp, training=False)
output = GlobalAvgPool2D()(out)
model_roi = tf.keras.Model(inp, output, name='Inception-ResNet-kPiece')
# Only predict() is called on model_roi in the visible code; the loss/optimizer are not exercised there.
model_roi.compile(loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'], optimizer = Adam(learning_rate = 0.0003))

# model_4P: five inputs (top strip, left piece, right piece, bottom strip, full page), each sent
# through the same backbone and pooled. NOTE(review): unlike model_roi above, the backbone is
# called here without training=False.
inputs = [Input(shape=(*INPUT_HDR_SHAPE, 3)), Input(shape=(*INPUT_BDL_SHAPE, 3)), Input(shape=(*INPUT_BDR_SHAPE, 3)), Input(shape=(*INPUT_FTR_SHAPE, 3)), Input(shape=(*INPUT_SHAPE, 3))]
outputs = [model_inception_resnet(inp) for inp in inputs]
outputs = [GlobalAvgPool2D()(out) for out in outputs]
model_4P = tf.keras.Model(inputs, outputs, name='Inception-ResNet-4Piece')

# Classification head; its trained weights are loaded later, inside test().
model_4P_ViT = ftmodel()
model_4P_ViT.compile(loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'], optimizer = Adam(learning_rate = 0.0003))

Metal device set to: Apple M1 Max


2022-11-17 23:29:54.476867: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-17 23:29:54.476988: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [7]:
def test(model_path, data_path, pred_path):
    """Classify every image in ``data_path`` and write the predicted labels to a CSV file.

    Steps:
      1. detect the ROIs of every image;
      2. encode the ROI crops with ``model_roi``, checkpointing the features to
         temporary ``<k>.npy`` files in the current directory;
      3. encode the 4+1 page pieces with ``model_4P``;
      4. load the trained weights into ``model_4P_ViT`` and predict.

    Args:
        model_path: directory holding the 'auto-Inception-ResNet-FT-model-weight' checkpoint.
        data_path: directory whose entries are all treated as input images.
        pred_path: directory that receives 'predicted_label.csv' (created if missing).

    Raises:
        ValueError: if ``data_path`` is empty or its image count is not a multiple of
            ``BATCH_SIZE * checkpnt_i``. Such inputs previously failed only after the
            long feature-extraction phase.
    """
    # List the directory once so that paths and ids are guaranteed to line up.
    names = os.listdir(data_path)
    data = pd.DataFrame({
        'images': [os.path.join(data_path, name) for name in names],
        'id': [name.split('.')[0] for name in names],
    })
    data = data.sort_values(by=['id'])
    data['label'] = -1
    image_paths = list(data['images'])
    print('Images found:', len(image_paths))

    # Fail fast: the loops below only handle complete batches and complete checkpoints.
    images_per_checkpoint = BATCH_SIZE * checkpnt_i
    if len(image_paths) == 0 or len(image_paths) % images_per_checkpoint != 0:
        raise ValueError(
            f'Found {len(image_paths)} images in {data_path!r}; the image count must be a '
            f'positive multiple of CHECKPOINT_EACH ({images_per_checkpoint}).'
        )
    # Make sure the result can be written before spending time on inference.
    if pred_path:
        os.makedirs(pred_path, exist_ok=True)

    print('Extracting ROIs from each image...')
    roi_infos = generate_roi_infos(image_paths)
    # Batch index to resume from; 0 processes everything.
    prev_crash_i = 0
    print('Converting ROIs to feature vectors, some temporary files will be created in current directory...')
    precompute_output = list()
    roi_generator = ImageDataGenerator(df=data, X_col='images', roi_info=roi_infos, y_col='label', batch_size=BATCH_SIZE, input_size=INPUT_SHAPE_ROI, shuffle=False)
    for i in tqdm(range(prev_crash_i, len(image_paths)//BATCH_SIZE)):
        # Images without any ROI yield a 1-D empty array; keep only real (k, H, W, C) crop stacks.
        crops_per_doc = [x for x in roi_generator[i][0] if len(x.shape) == 4]
        if crops_per_doc:
            batch = tf.concat(crops_per_doc, axis=0)
            precompute_output.append(model_roi.predict(batch))
            # Drop the large crop tensor now, so it is not kept alive while the next batch is built.
            del batch
        else:
            # No image of this batch has an ROI: contribute zero feature rows instead of crashing.
            precompute_output.append(np.zeros((0, FEATURE_DIM), dtype=np.float32))
        del crops_per_doc
        tf.keras.backend.clear_session()
        _ = gc.collect()
        if (i+1) % checkpnt_i == 0:
            np.save(f'{(i+1)//checkpnt_i}.npy', tf.concat(precompute_output, axis=0).numpy())
            precompute_output = list()

    # Number of real ROIs per document (last column is the validity flag).
    roi_count = (roi_infos[:, :, -1] == 1).sum(axis=1).tolist()
    roi_data = list()
    for i in range(len(image_paths)//images_per_checkpoint):
        k = 0
        batch_data = np.load(f'{i+1}.npy')
        batch_roi = roi_count[(i*images_per_checkpoint):((i+1)*images_per_checkpoint)]
        for j in range(images_per_checkpoint):
            # Slice this document's rows out of the flat feature array and zero-pad to MAX_ROIS.
            roi_data.append(np.concatenate([batch_data[k:(k+batch_roi[j])], np.zeros((MAX_ROIS-batch_roi[j], FEATURE_DIM))])[np.newaxis,:])
            k += batch_roi[j]
    roi_data = tf.cast(np.concatenate(roi_data), dtype=tf.float32)
    for i in range(1, len(image_paths)//images_per_checkpoint+1):
        os.remove(f'{i}.npy')
    print("Done.. Extracted ROIs and Generating Feature Vectors...")

    print('Extracting 4+1 pieces from each image and converting to feature vectors...')
    pieces_generator = ImageDataGenerator_4P(df=data, X_col='images', batch_size=BATCH_SIZE, shuffle=False)
    pieces_data = model_4P.predict(pieces_generator)
    # (pieces, images, features) -> (images, pieces, features) -> (images, pieces*features)
    pieces_data = np.concatenate([dat[np.newaxis,:,:] for dat in pieces_data])
    pieces_data = np.reshape(np.transpose(pieces_data, (1,0,2)), (len(image_paths), -1))
    pieces_data = tf.cast(pieces_data, dtype=tf.float32)

    print('Loading Trained RoI Vision Transformer Network weights...')
    model_4P_ViT.load_weights(os.path.join(model_path, 'auto-Inception-ResNet-FT-model-weight'))
    mask = tf.cast(generate_mask_all(roi_data, roi_count), dtype=tf.float32)
    print('Predicting Labels...')
    out_probs = model_4P_ViT.predict([pieces_data, roi_data, mask])
    pred_labels = np.argmax(out_probs, axis=-1)
    results = pd.DataFrame({'id':list(data['id']), 'label':pred_labels})
    results.to_csv(os.path.join(pred_path, 'predicted_label.csv'), index=False)

In [8]:
# Run the whole pipeline with the paths configured at the top of the notebook.
test(MODEL_PATH, DATA_PATH, PREDICTED_LABELS_PATH)

Images found: 880
Extracting ROIs from each image...


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
100%|█████████████████████████████████████████| 880/880 [12:11<00:00,  1.20it/s]


Converting ROIs to feature vectors, some temporary files will be created in current directory...


  0%|                                                     | 0/8 [00:00<?, ?it/s]2022-11-17 23:42:36.762095: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-11-17 23:42:37.577546: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




 12%|█████▋                                       | 1/8 [00:40<04:45, 40.85s/it]2022-11-17 23:43:22.531441: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




 25%|███████████▎                                 | 2/8 [01:30<04:34, 45.83s/it]2022-11-17 23:44:12.103847: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




 38%|████████████████▉                            | 3/8 [02:17<03:53, 46.66s/it]2022-11-17 23:44:52.918839: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




 50%|██████████████████████▌                      | 4/8 [02:54<02:50, 42.73s/it]2022-11-17 23:45:29.101100: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




 62%|████████████████████████████▏                | 5/8 [03:28<01:58, 39.60s/it]2022-11-17 23:46:12.472953: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




 75%|█████████████████████████████████▊           | 6/8 [04:16<01:24, 42.34s/it]2022-11-17 23:47:04.942920: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




 88%|███████████████████████████████████████▍     | 7/8 [05:11<00:46, 46.45s/it]2022-11-17 23:48:06.619106: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




100%|█████████████████████████████████████████████| 8/8 [06:14<00:00, 46.87s/it]


Done.. Extracted ROIs and Generating Feature Vectors...
Extracting 4+1 pieces from each image and converting to feature vectors...


2022-11-17 23:48:38.382787: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Loading Trained RoI Vision Transformer Network weights...
Predicting Labels...


2022-11-17 23:50:41.901571: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


