### Importing Necessary Libraries

In [1]:
import os
import pickle
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array

from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, GlobalAvgPool2D, BatchNormalization
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.applications.resnet_v2 import preprocess_input
from tensorflow.keras.regularizers import l2

from tensorflow.keras.optimizers import Adam

from sklearn.metrics import precision_recall_fscore_support as score

## Data

A detailed discussion and visualization of the data can be found [here](Data-Overview.ipynb).

In [2]:
# Read the training labels and attach the path of the TIFF image that belongs to each id.
labels = pd.read_csv("../input/datathonindoml-2022/train_labels.csv")
images = []
for name in labels['id']:
    images.append('../input/datathonindoml-2022/train/train/' + str(name) + '.tif')
# Add the path column and keep the columns in the order id, images, label.
labels = labels.assign(images=images)[['id', 'images', 'label']]

In [3]:
# Preview the label table (columns: id, images, label).
labels

Unnamed: 0,id,images,label
0,0,../input/datathonindoml-2022/train/train/0.tif,1
1,1,../input/datathonindoml-2022/train/train/1.tif,13
2,2,../input/datathonindoml-2022/train/train/2.tif,13
3,3,../input/datathonindoml-2022/train/train/3.tif,14
4,4,../input/datathonindoml-2022/train/train/4.tif,6
...,...,...,...
15995,15995,../input/datathonindoml-2022/train/train/15995...,2
15996,15996,../input/datathonindoml-2022/train/train/15996...,15
15997,15997,../input/datathonindoml-2022/train/train/15997...,3
15998,15998,../input/datathonindoml-2022/train/train/15998...,9


In [4]:
# test_labels = labels.sample(800).sort_values(by='id')
# labels = labels[~labels['id'].isin(list(test_labels['id']))]

In [5]:
# test_labels

In [6]:
# labels

In [7]:
# labels.to_csv('./Data/train_labels_1.csv', index=False)
# test_labels.to_csv('./Data/test_labels_1.csv', index=False)

In [8]:
# Distinct label ids in ascending order. A bare set has no guaranteed iteration
# order, and the ids are paired positionally with `class_names` below, so the
# order is made explicit with `sorted`.
class_labels = sorted(set(labels['label']))
# NOTE(review): assumes the names are listed in ascending label-id order — confirm
# against the dataset description.
class_names = [
    'letter', 'form', 'email', 'handwritten', 'advertisement', 'scientific report', 'scientific publication',
    'specification', 'file folder', 'news article', 'budget', 'invoice', 'presentation', 'questionnaire', 'resume',
    'memo'
]
# Fail early with a readable message if the data does not contain exactly one id per name.
assert len(class_labels) == len(class_names), (
    f"Expected {len(class_names)} distinct labels, found {len(class_labels)}"
)
# Lookup table: label id -> human-readable class name.
label_names = pd.DataFrame({
    'labels': class_labels,
    'names': class_names
})

## Data Preparation

Since there is a lot of image data, we use a data generator that loads the images batch by batch, instead of holding the whole dataset in memory as a single tensor, to prevent memory issues.

In [9]:
# Load the first four files returned by os.listdir for the validation folder
# and report their pixel dimensions.
sample_names = os.listdir('../input/datathonindoml-2022/validation/validation')[:4]
imgs = []
for name in sample_names:
    pixels = img_to_array(load_img('../input/datathonindoml-2022/validation/validation/' + name))
    imgs.append(tf.cast(pixels, dtype=tf.uint8))

# Axis 0 of each image tensor is its height, axis 1 its width.
heights = []
widths = []
for img in imgs:
    heights.append(img.shape[0])
    widths.append(img.shape[1])

print("")
print("Mean Image Height:", np.mean(heights))
print("Mean Image Width:", np.mean(widths))
print("Min Image Height:", np.min(heights))
print("Min Image Width:", np.min(widths))

2022-10-08 06:01:26.222542: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-08 06:01:26.316765: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-08 06:01:26.317582: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-08 06:01:26.324987: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil


Mean Image Height: 1000.0
Mean Image Width: 765.0
Min Image Height: 1000
Min Image Width: 754


From the mean image heights and widths above, setting the image height to $1000$ and the image width to $750$ is reasonable for training the model. (The configuration in the next cell uses a smaller $800 \times 600$ input with the same $4:3$ aspect ratio.)

In [10]:
# Number of images per generator batch.
BATCH_SIZE = 40

# All shapes are (height, width) in pixels.
INPUT_SHAPE = (800, 600)  # Full Image

# Header and footer strips share one shape.
INPUT_HDR_SHAPE = (250, 600)
INPUT_FTR_SHAPE = INPUT_HDR_SHAPE

# Left and right body crops share one shape.
INPUT_BDL_SHAPE = (300, 300)
INPUT_BDR_SHAPE = INPUT_BDL_SHAPE

Since Keras' own data generator does not support TIFF images, we write our own custom data generator. It can also be leveraged to add custom image pre-processing, or multiple inputs or outputs, to the data processing pipeline.

In [11]:
class ImageDataGenerator(tf.keras.utils.Sequence):
    """Batch generator that reads images from disk and yields five views of each one.

    Every image is loaded with `load_img`, resized to `input_size`, and cut into a
    header strip, a left body crop, a right body crop and a footer strip. Those
    four crops plus the full resized image are each passed through the ResNetV2
    `preprocess_input`.

    Each batch is a pair `(X, y)` where `X` is a tuple of five float32 tensors
    (header, body-left, body-right, footer, full image) and `y` is a float32
    tensor of labels.

    Args:
        df: DataFrame holding the image paths and the labels (copied on construction).
        X_col: Name of the column with the image paths.
        y_col: Name of the column with the labels.
        batch_size: Number of rows per batch.
        input_size: (height, width) every image is resized to before cropping.
        shuffle: If True, the rows are reshuffled in `on_epoch_end`.
    """

    # Crop geometry, in pixels of the resized image.
    STRIP_HEIGHT = 250     # rows taken from the top (header) and from the bottom (footer)
    BODY_CROP_WIDTH = 300  # columns taken from the left and from the right of the middle band
    NUM_VIEWS = 5          # header, body-left, body-right, footer, full image

    def __init__(self, df, X_col, y_col, batch_size, input_size=(800, 600), shuffle=True):
        # Harmless on Keras versions where Sequence defines no __init__.
        super().__init__()
        self.df = df.copy() # DataFrame consisting image paths of inputs and the labels for the outputs
        self.X_col = X_col # Input column, specifying image path, in the DataFrame
        self.y_col = y_col # Output column, specifying corresponding label, in the DataFrame
        self.batch_size = batch_size # Batch Size
        self.input_size = input_size # Input Image size
        self.shuffle = shuffle # Shuffle Data after each epoch
        self.n = len(self.df) # length of the entire data
    
    def on_epoch_end(self):
        """Reshuffle the rows after an epoch when `shuffle` is enabled."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)
    
    def __get_input(self, path):
        """Load one image and return its five preprocessed views as a list."""
        img_arr = img_to_array(load_img(path))
        # some other pre-processing / data-augmentation goes here
        img_arr = tf.image.resize(img_arr, self.input_size)
        strip, crop = self.STRIP_HEIGHT, self.BODY_CROP_WIDTH
        img_hdr = img_arr[:strip]               # top rows
        img_bdl = img_arr[strip:-strip, :crop]  # middle band, left columns
        img_bdr = img_arr[strip:-strip, -crop:] # middle band, right columns
        img_ftr = img_arr[-strip:]              # bottom rows
        return [preprocess_input(view) for view in (img_hdr, img_bdl, img_bdr, img_ftr, img_arr)]
    
    def __get_data(self, batches):
        """Turn a slice of the DataFrame into `(X_batch, y_batch)` tensors."""
        batch_paths = batches[self.X_col]
        batch_labels = batches[self.y_col]
        views_per_image = [self.__get_input(path) for path in batch_paths]
        # Regroup from "five views per image" to "one batched tensor per view".
        X_batch = tuple(
            tf.cast([views[i] for views in views_per_image], dtype=tf.float32)
            for i in range(self.NUM_VIEWS)
        )
        y_batch = tf.cast(batch_labels, dtype=tf.float32)
        return X_batch, y_batch
    
    def __getitem__(self, index):
        """Return batch number `index`; the last batch may be smaller than `batch_size`."""
        start = index * self.batch_size
        batches = self.df.iloc[start:start + self.batch_size]
        X, y = self.__get_data(batches)
        return X, y
    
    def __len__(self):
        """Number of batches per epoch, counting a trailing partial batch.

        Ceiling division is used so that rows beyond the last full batch are
        not silently dropped.
        """
        return (self.n + self.batch_size - 1) // self.batch_size

In [12]:
# Generator over the whole label table; shuffle=False keeps the rows in
# their original order.
train_generator = ImageDataGenerator(
    df=labels,
    X_col='images',
    y_col='label',
    batch_size=BATCH_SIZE,
    input_size=INPUT_SHAPE,
    shuffle=False
)

Below shows one batch from the ImageDataGenerator.

In [13]:
# Pull the first batch and report the shape of every input view and of the labels.
batch = next(iter(train_generator))
batch_inputs, batch_targets = batch[0], batch[1]

# Captions in the order the generator emits the views.
view_captions = [
    'Input Image Header Tensor Shape:',
    'Input Image Body(L) Tensor Shape:',
    'Input Image Body(R) Tensor Shape:',
    'Input Image Footer Tensor Shape:',
    'Input Image Full Tensor Shape:',
]

print('*** Batch Overview ***')
print('Number of Inputs:', len(batch_inputs))
for position, caption in enumerate(view_captions):
    print(caption, batch_inputs[position].shape)
print('Output Label Tensor Shape:', batch_targets.shape)

*** Batch Overview ***
Number of Inputs: 5
Input Image Header Tensor Shape: (40, 250, 600, 3)
Input Image Body(L) Tensor Shape: (40, 300, 300, 3)
Input Image Body(R) Tensor Shape: (40, 300, 300, 3)
Input Image Footer Tensor Shape: (40, 250, 600, 3)
Input Image Full Tensor Shape: (40, 800, 600, 3)
Output Label Tensor Shape: (40,)


Each row below shows the $4$ parts of the same image followed by the full image: the first is the header, the second the body (left), the third the body (right), the fourth the footer, and the last the full image.

In [14]:
# fig, ax = plt.subplots(BATCH_SIZE//2, 5, figsize=(12, 60))
# imgs = batch[0]
# labs = list(batch[1].numpy())
# for j in range(BATCH_SIZE//2):
#     for k in range(5):
#         ax[j, k].imshow((imgs[k][j]*255).numpy().astype(np.uint8))
#         ax[j, k].axis('off')
#         ax[j, k].set_title(labs[j])

## Model Building

Now, a basic CNN-based model will be put in place. It will utilize only the visual features to classify the documents. Later we will build much better models that consider other structures, the multi-modality of the images, and distinctive features.

- ResNet50V2

ResNet50V2 is a large model and, since we are not re-training the entire model, we will just fine-tune it with two extra layers. So, to speed up training, we precompute the output of the ResNet50V2 model and use it to train the added Dense layers for fine-tuning.

In [15]:
# ImageNet-pretrained ResNet50V2 without its classification head, used as a
# frozen backbone (its weights are excluded from training).
model_resnet = ResNet50V2(
    weights='imagenet',
    include_top=False,
)
model_resnet.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50v2_weights_tf_dim_ordering_tf_kernels_notop.h5


In [16]:
# One input per view, in the order the generator emits them:
# header, body-left, body-right, footer, full image.
view_shapes = (INPUT_HDR_SHAPE, INPUT_BDL_SHAPE, INPUT_BDR_SHAPE, INPUT_FTR_SHAPE, INPUT_SHAPE)
inputs = [Input(shape=(*view_shape, 3)) for view_shape in view_shapes]

# Run the same backbone on every view, then average each feature map over its
# spatial dimensions.
outputs = [model_resnet(view_input) for view_input in inputs]
outputs = [GlobalAvgPool2D()(feature_map) for feature_map in outputs]

model = tf.keras.Model(inputs, outputs, name='ResNet-4Piece')

In [17]:
# Print the layer-by-layer summary of the five-input feature extractor.
model.summary()

Model: "ResNet-4Piece"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 250, 600, 3) 0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 300, 300, 3) 0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 300, 300, 3) 0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 250, 600, 3) 0                                            
______________________________________________________________________________________

In [18]:
# Precompute the backbone features for the training images in chunks and write
# each chunk to its own .npy file.
PRECOMPUTE_CHUNK_ROWS = 1600  # rows of `labels` handled per saved file

# Ceiling division: a trailing chunk shorter than PRECOMPUTE_CHUNK_ROWS is
# processed too instead of being silently left out.
num_chunks = -(-len(labels) // PRECOMPUTE_CHUNK_ROWS)

for idx in range(num_chunks):
    print(f"Batch-{idx+1}")
    chunk = labels.iloc[idx * PRECOMPUTE_CHUNK_ROWS:(idx + 1) * PRECOMPUTE_CHUNK_ROWS]
    train_generator_precompute = ImageDataGenerator(
        df=chunk,
        X_col='images',
        y_col='label',
        batch_size=BATCH_SIZE,
        input_size=INPUT_SHAPE,
        shuffle=False
    )
    if len(train_generator_precompute) == 0:
        # The generator reports no batches for this chunk, so there is nothing to predict on.
        print(f"Skipping chunk {idx}: {len(chunk)} rows produce no batches")
        continue
    # `model` has five outputs, so predict is expected to return one array per view.
    precompute_ResNet_output = model.predict(train_generator_precompute)
    np.save(f'./train_data_precomp-{idx}.npy', precompute_ResNet_output)
    # Release Keras/Python state between chunks to limit memory growth.
    tf.keras.backend.clear_session()
    _ = gc.collect()

Batch-1


2022-10-08 06:01:34.096479: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-10-08 06:01:38.757166: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Batch-2
Batch-3
Batch-4
Batch-5
Batch-6
Batch-7
Batch-8
Batch-9
Batch-10


In [19]:
# fine_tune_data = np.concatenate([np.transpose(np.load(f'./Data/EffNet-4P-train-precompute/train_data_precomp-{idx}.npy'), (1,0,2)) for idx in range(10)])
# np.save(f'./Data/EffNet-4P-train-precompute/whole_train_data_precomp.npy', fine_tune_data)
# fine_tune_data.shape

ResNet50V2 gives a 3D output per image with $2048$ channels, which are collapsed using GlobalAveragePooling2D, so we get a $2048$-dimensional representation vector for each of the 4 parts of the image and for the full image ($4+1=5$); these will be used to train the later extra layers of our model.

#### On Validation Data

Generating Predictions for the Validation Set

In [20]:
class ImageDataGenerator(tf.keras.utils.Sequence):
    """Batch generator that reads images from disk and yields five views of each one.

    This definition replaces the `ImageDataGenerator` declared earlier in the
    notebook. Labels are optional here: without `y_col` each batch is `(X,)`
    (inference), with `y_col` each batch is `(X, y)`, so cells written for the
    earlier, label-requiring definition keep working after this cell has run.

    Every image is loaded with `load_img`, resized to `input_size`, and cut into a
    header strip, a left body crop, a right body crop and a footer strip. Those
    four crops plus the full resized image are each passed through the ResNetV2
    `preprocess_input`. `X` is a tuple of five float32 tensors in that order.

    Args:
        df: DataFrame holding the image paths (copied on construction).
        X_col: Name of the column with the image paths.
        batch_size: Number of rows per batch.
        input_size: (height, width) every image is resized to before cropping.
        shuffle: If True, the rows are reshuffled in `on_epoch_end`.
        y_col: Optional name of the label column; None means no labels are returned.
    """

    # Crop geometry, in pixels of the resized image.
    STRIP_HEIGHT = 250     # rows taken from the top (header) and from the bottom (footer)
    BODY_CROP_WIDTH = 300  # columns taken from the left and from the right of the middle band
    NUM_VIEWS = 5          # header, body-left, body-right, footer, full image

    def __init__(self, df, X_col, batch_size, input_size=(800, 600), shuffle=True, y_col=None):
        # Harmless on Keras versions where Sequence defines no __init__.
        super().__init__()
        self.df = df.copy() # DataFrame consisting image paths of inputs and the labels for the outputs
        self.X_col = X_col # Input column, specifying image path, in the DataFrame
        self.y_col = y_col # Optional output column, specifying corresponding label, in the DataFrame
        self.batch_size = batch_size # Batch Size
        self.input_size = input_size # Input Image size
        self.shuffle = shuffle # Shuffle Data after each epoch
        self.n = len(self.df) # length of the entire data
    
    def on_epoch_end(self):
        """Reshuffle the rows after an epoch when `shuffle` is enabled."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)
    
    def __get_input(self, path):
        """Load one image and return its five preprocessed views as a list."""
        img_arr = img_to_array(load_img(path))
        # some other pre-processing / data-augmentation goes here
        img_arr = tf.image.resize(img_arr, self.input_size)
        strip, crop = self.STRIP_HEIGHT, self.BODY_CROP_WIDTH
        img_hdr = img_arr[:strip]               # top rows
        img_bdl = img_arr[strip:-strip, :crop]  # middle band, left columns
        img_bdr = img_arr[strip:-strip, -crop:] # middle band, right columns
        img_ftr = img_arr[-strip:]              # bottom rows
        return [preprocess_input(view) for view in (img_hdr, img_bdl, img_bdr, img_ftr, img_arr)]
    
    def __get_data(self, batches):
        """Turn a slice of the DataFrame into `(X_batch,)` or `(X_batch, y_batch)`."""
        batch_paths = batches[self.X_col]
        views_per_image = [self.__get_input(path) for path in batch_paths]
        # Regroup from "five views per image" to "one batched tensor per view".
        X_batch = tuple(
            tf.cast([views[i] for views in views_per_image], dtype=tf.float32)
            for i in range(self.NUM_VIEWS)
        )
        if self.y_col is None:
            # The 1-tuple wrapper keeps the five views together as the single "x" element.
            return (X_batch, )
        y_batch = tf.cast(batches[self.y_col], dtype=tf.float32)
        return X_batch, y_batch
    
    def __getitem__(self, index):
        """Return batch number `index`; the last batch may be smaller than `batch_size`."""
        start = index * self.batch_size
        batches = self.df.iloc[start:start + self.batch_size]
        return self.__get_data(batches)
    
    def __len__(self):
        """Number of batches, counting a trailing partial batch.

        Ceiling division is used so that rows beyond the last full batch are
        not silently dropped.
        """
        return (self.n + self.batch_size - 1) // self.batch_size

In [21]:
# List the validation folder once so that the `images` and `id` columns are
# derived from the same listing (os.listdir gives no ordering guarantee
# between separate calls).
validation_dir = '../input/datathonindoml-2022/validation/validation/'
validation_files = os.listdir(validation_dir)
validation_data = pd.DataFrame({
    'images': [validation_dir + name for name in validation_files],
    'id': [name.split('.')[0] for name in validation_files],  # file name up to the first dot
})
validation_data['label'] = -1 # Simply added to prevent re-writing code
# Ids are strings here, so this is a lexicographic sort.
validation_data = validation_data.sort_values(by=['id'])
validation_data

Unnamed: 0,images,id,label
57,../input/datathonindoml-2022/validation/valida...,17801,-1
141,../input/datathonindoml-2022/validation/valida...,17802,-1
777,../input/datathonindoml-2022/validation/valida...,17803,-1
273,../input/datathonindoml-2022/validation/valida...,17804,-1
30,../input/datathonindoml-2022/validation/valida...,17805,-1
...,...,...,...
396,../input/datathonindoml-2022/validation/valida...,18696,-1
111,../input/datathonindoml-2022/validation/valida...,18697,-1
817,../input/datathonindoml-2022/validation/valida...,18698,-1
710,../input/datathonindoml-2022/validation/valida...,18699,-1


In [22]:
# Generator over the validation images. The resize target is passed explicitly
# so that validation features are always computed at the same INPUT_SHAPE as the
# training features, instead of relying on the generator's default.
# A batch size of 30 divides the 900 validation images evenly, so every batch is full.
valid_generator = ImageDataGenerator(
    df=validation_data,
    X_col='images',
    batch_size=30,
    input_size=INPUT_SHAPE,
    shuffle=False
)

In [23]:
# predict returns one (num_images, features) array per model output; stacking
# them on a new axis 1 gives (num_images, num_views, features).
valid_preds = model.predict(valid_generator)
valid_preds = np.stack(valid_preds, axis=1)

In [24]:
# Axes are (image, view, feature); the recorded run shows (900, 5, 2048).
valid_preds.shape

(900, 5, 2048)

In [25]:
# Save the precomputed validation features to the working directory.
np.save('./whole_validation_data_precomp.npy', valid_preds)