# Run on jupyter notebook  (for colab some code need to be changed)

**research paper: 
(Classification on Tobaco3482 using VGG-16 Transfer Learning with document pretraining on RVL-CDIP)**

M. Z. Afzal et al., “Cutting the Error by Half: Investigation
of Very Deep CNN and Advanced Training Strategies for
Document Image Classification,” in ICDAR, Nov. 2017.

In [1]:
import numpy as np
import pandas as pd
import os
import pathlib

# Train dataload

In [64]:
train_text_path = r"D:\Glove model\RVL-CDIP dataset\labels\train.txt"
train_df = pd.read_csv(train_text_path, sep=" ", header=None, names=["image_path", "label"])

In [65]:
train_df.head()
# print(len(train_df)) #320000

Unnamed: 0,image_path,label
0,imagesq/q/o/c/qoc54c00/80035521.tif,15
1,imagese/e/w/c/ewc23d00/513280028.tif,1
2,imagesw/w/b/t/wbt26e00/2053453161.tif,7
3,imagesm/m/k/m/mkm05e00/2040792992_2040792994.tif,10
4,imageso/o/e/x/oex80d00/522787731+-7732.tif,3


In [66]:
train_df['label'] = [str(i) for i in train_df['label']]

train_df['image_path'] = [r'D:/Glove model/RVL-CDIP dataset/images/' + this_path for this_path in train_df['image_path']]
train_df.head()

Unnamed: 0,image_path,label
0,D:/Glove model/RVL-CDIP dataset/images/imagesq...,15
1,D:/Glove model/RVL-CDIP dataset/images/imagese...,1
2,D:/Glove model/RVL-CDIP dataset/images/imagesw...,7
3,D:/Glove model/RVL-CDIP dataset/images/imagesm...,10
4,D:/Glove model/RVL-CDIP dataset/images/imageso...,3


# Test dataload

In [5]:
test_text_path = r"D:\Glove model\RVL-CDIP dataset\labels\test.txt"
test_df = pd.read_csv(test_text_path, sep=" ", header=None, names=["image_path", "label"])

In [6]:
test_df.head()
# print(len(test_df)) # 40000

Unnamed: 0,image_path,label
0,imagesr/r/g/e/rge31d00/503210033+-0034.tif,3
1,imagesc/c/e/j/cej80d00/517306722+-6724.tif,3
2,imagesm/m/r/r/mrr36d00/50603620-3621.tif,14
3,imagesg/g/t/u/gtu29c00/2084573574a.tif,2
4,imagesh/h/o/f/hof08d00/2071783492.tif,9


In [7]:
test_df['label'] = [str(i) for i in test_df['label']]

test_df['image_path'] = [r'D:/Glove model/RVL-CDIP dataset/images/' + this_path for this_path in test_df['image_path']]
test_df.head()

Unnamed: 0,image_path,label
0,D:/Glove model/RVL-CDIP dataset/images/imagesr...,3
1,D:/Glove model/RVL-CDIP dataset/images/imagesc...,3
2,D:/Glove model/RVL-CDIP dataset/images/imagesm...,14
3,D:/Glove model/RVL-CDIP dataset/images/imagesg...,2
4,D:/Glove model/RVL-CDIP dataset/images/imagesh...,9


# Validation dataload

In [8]:
validation_text_path = r"D:\Glove model\RVL-CDIP dataset\labels\val.txt"
valid_df = pd.read_csv(validation_text_path, sep=" ", header=None, names=["image_path", "label"])

In [9]:
len(valid_df)

40000

In [10]:
valid_df['label'] = [str(i) for i in valid_df['label']]

valid_df['image_path'] = [r'D:/Glove model/RVL-CDIP dataset/images/' + this_path for this_path in valid_df['image_path']]
valid_df.head()

Unnamed: 0,image_path,label
0,D:/Glove model/RVL-CDIP dataset/images/imagesg...,11
1,D:/Glove model/RVL-CDIP dataset/images/imagesi...,0
2,D:/Glove model/RVL-CDIP dataset/images/imagesr...,0
3,D:/Glove model/RVL-CDIP dataset/images/imagesk...,4
4,D:/Glove model/RVL-CDIP dataset/images/imagesr...,14


# Look for common image in both tobaco3482 and RVL-CDIP dataset

In [11]:
tobaco3482_path = r"C:/Users/Rahul Roy/Desktop/finalyr_project_dataset/tobaco_dataset/Tobacco3482-jpg/"
data_root = pathlib.Path(tobaco3482_path)

print(data_root)
for item in data_root.iterdir():
    print(item)

C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg
C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg\ADVE
C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg\Email
C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg\Form
C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg\Letter
C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg\Memo
C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg\News
C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg\Note
C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg\Report
C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg\Resume
C:\Users\Rahul Roy\Desktop\finalyr_project_dataset\tobaco_dataset\Tobacco3482-jpg\Scientific


In [12]:
def get_file_paths_and_labels(data_root):
    img_paths = [str(path) for path in data_root.glob('*\\*.jpg')]
    labels = [p.split("\\")[-2] for p in img_paths]
    return img_paths, labels

img_paths, labels = get_file_paths_and_labels(data_root)
print(img_paths)
print(labels)
print(len(img_paths))
print(len(labels))

['C:\\Users\\Rahul Roy\\Desktop\\finalyr_project_dataset\\tobaco_dataset\\Tobacco3482-jpg\\ADVE\\0000136188.jpg', 'C:\\Users\\Rahul Roy\\Desktop\\finalyr_project_dataset\\tobaco_dataset\\Tobacco3482-jpg\\ADVE\\0000435350.jpg', 'C:\\Users\\Rahul Roy\\Desktop\\finalyr_project_dataset\\tobaco_dataset\\Tobacco3482-jpg\\ADVE\\0000556056.jpg', 'C:\\Users\\Rahul Roy\\Desktop\\finalyr_project_dataset\\tobaco_dataset\\Tobacco3482-jpg\\ADVE\\0030048095.jpg', 'C:\\Users\\Rahul Roy\\Desktop\\finalyr_project_dataset\\tobaco_dataset\\Tobacco3482-jpg\\ADVE\\0030048989.jpg', 'C:\\Users\\Rahul Roy\\Desktop\\finalyr_project_dataset\\tobaco_dataset\\Tobacco3482-jpg\\ADVE\\0030049569.jpg', 'C:\\Users\\Rahul Roy\\Desktop\\finalyr_project_dataset\\tobaco_dataset\\Tobacco3482-jpg\\ADVE\\03496270.jpg', 'C:\\Users\\Rahul Roy\\Desktop\\finalyr_project_dataset\\tobaco_dataset\\Tobacco3482-jpg\\ADVE\\03567810.jpg', 'C:\\Users\\Rahul Roy\\Desktop\\finalyr_project_dataset\\tobaco_dataset\\Tobacco3482-jpg\\ADVE\\037

In [13]:
tobaco3482_df = pd.DataFrame(list(zip(img_paths, labels)),
               columns =['image_path', 'data_label'])
tobaco3482_df.head()

Unnamed: 0,image_path,data_label
0,C:\Users\Rahul Roy\Desktop\finalyr_project_dat...,ADVE
1,C:\Users\Rahul Roy\Desktop\finalyr_project_dat...,ADVE
2,C:\Users\Rahul Roy\Desktop\finalyr_project_dat...,ADVE
3,C:\Users\Rahul Roy\Desktop\finalyr_project_dat...,ADVE
4,C:\Users\Rahul Roy\Desktop\finalyr_project_dat...,ADVE


In [14]:
#out = tobaco3482_df['image_path'][0].split("\\")[-1]

def extract_filename_tobaco(path_list):
    res = []
    for this_path in path_list:
        out = this_path.split("\\")[-1]
        out = out[:-4] + ".tif"
        res.append(out)
    return res

In [15]:
tobaco_file_name = extract_filename_tobaco(tobaco3482_df['image_path'])
print(len(tobaco_file_name))

3482


In [16]:
def extract_filename_rvlcdip(path_list):
    res = []
    for this_path in path_list:
        out = this_path.split("/")[-1]
        res.append(out)
    return res    
        

In [17]:
train_file_name = extract_filename_rvlcdip(train_df['image_path'])
test_file_name = extract_filename_rvlcdip(test_df['image_path'])
valid_file_name = extract_filename_rvlcdip(valid_df['image_path'])

rvlcdip_file_name = test_file_name + train_file_name + valid_file_name

print(len(train_file_name))
print(len(test_file_name))
print(len(valid_file_name))

print(len(rvlcdip_file_name)) #322 - 26 - 39

320000
40000
40000
400000


In [33]:
def common(lst1, lst2): 
    return list(set(lst1) & set(lst2))

e=common(tobaco_file_name , valid_file_name)

In [40]:
print(len(e))

39


In [38]:
320000-257

319743

# Remove the image which is present in tobaco3482

In [67]:
common_file_name = common(tobaco_file_name, train_file_name)
print(len(common_file_name))
print(common_file_name[1:10])

257
['507995262.tif', '505576130+-6130.tif', '50650671-0672.tif', '40002609-2610.tif', '2076907631_7668.tif', '87064470.tif', '50294272-4272.tif', '2030734306.tif', '2054406337.tif']


In [68]:
idx = []
for i, this_path in enumerate(train_df['image_path']):
    if this_path.split("/")[-1] in common_file_name:
        idx.append(i)

print(idx)
print(len(idx))

[494, 1832, 2661, 3863, 4326, 5962, 6497, 7218, 8903, 10289, 12799, 13907, 14748, 17704, 21317, 21658, 22067, 22228, 23464, 23776, 24640, 27449, 27580, 28876, 30474, 32165, 32191, 34392, 34470, 34775, 35042, 36955, 37211, 38036, 38097, 41258, 41675, 44508, 45954, 46112, 48588, 48921, 50850, 51273, 57369, 57405, 60499, 62132, 62387, 62565, 63025, 64227, 65811, 69455, 71012, 71548, 72298, 72946, 76480, 76899, 77972, 79673, 80034, 82004, 82119, 82159, 82858, 83885, 84268, 84755, 86500, 89934, 91583, 92003, 92401, 92825, 94649, 94896, 95163, 96796, 97524, 97535, 99064, 99103, 99296, 100701, 107903, 108659, 109292, 109880, 112409, 113849, 115246, 115435, 117743, 119821, 120986, 121538, 122159, 123726, 123799, 123803, 125396, 126266, 128492, 130867, 133663, 133724, 135605, 137825, 139219, 139811, 142583, 142751, 144392, 144813, 145176, 146821, 147191, 151057, 151476, 151981, 152716, 153299, 155765, 156477, 158113, 159323, 159751, 160031, 162056, 164010, 167164, 167335, 168225, 171145, 171674

In [69]:
try:
    train_df.drop(idx, axis=0, inplace=True)
except KeyError:
    pass

In [70]:
train_df = train_df.reset_index(drop=True)
train_df.tail()


Unnamed: 0,image_path,label
319737,D:/Glove model/RVL-CDIP dataset/images/imagesu...,9
319738,D:/Glove model/RVL-CDIP dataset/images/imagesa...,15
319739,D:/Glove model/RVL-CDIP dataset/images/imagesu...,6
319740,D:/Glove model/RVL-CDIP dataset/images/imagesd...,9
319741,D:/Glove model/RVL-CDIP dataset/images/imagesp...,3


In [73]:
print(train_df.shape)
print(type(train_df['label'][0]))

(319742, 2)
<class 'str'>


# Keras datagenerator

In [74]:
import tensorflow as tf
from tensorflow import keras

batch_size=32
seed_value = 42

def scalar(x):
    return x/127.5-1 # rescales pixels to range -1 to +1


trgen=tf.keras.preprocessing.image.ImageDataGenerator(preprocessing_function=scalar)
train_gen=trgen.flow_from_dataframe(train_df, x_col='image_path', y_col='label', target_size=(224,224), 
                                    class_mode='categorical',batch_size=batch_size, shuffle=True, seed=seed_value)

Found 319742 validated image filenames belonging to 16 classes.


In [75]:
tvgen=tf.keras.preprocessing.image.ImageDataGenerator(preprocessing_function=scalar)
valid_gen=tvgen.flow_from_dataframe(valid_df, x_col='image_path', y_col='label', target_size=(224,224), 
                                    class_mode='categorical',batch_size=batch_size, shuffle=False, seed=seed_value)

Found 40000 validated image filenames belonging to 16 classes.


In [None]:
# determine test generator batch size and steps to go through the test set exactly once for predictions
length=len(test_df)
test_batch_size=sorted([int(length/n) for n in range(1,length+1) if length % n ==0 and length/n<=batch_size],reverse=True)[0]  
test_steps=int(length/test_batch_size)


ttgen=tf.keras.preprocessing.image.ImageDataGenerator(preprocessing_function=scalar)
test_gen=ttgen.flow_from_dataframe(test_df, x_col='image_path', y_col='label', target_size=(224,224),  
                                   class_mode='categorical',batch_size=test_batch_size, shuffle=True, seed=seed_value)

test_labels=test_gen.labels

In [79]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam, Adamax
from tensorflow.keras.metrics import categorical_crossentropy

img_shape=(224,224,3)
class_count = 16
dropout=0.2
lr= 0.001
seed_value = 42


base_model= VGG16(include_top=False,
                   input_shape=(224,224,3),
                   pooling='avg',
                   classes=16)

base_model.trainable=True

inputs = keras.Input(shape=img_shape)
x = base_model(inputs, training=True)

# x=tf.keras.layers.Dropout(rate=dropout, seed=seed_value)(x)
x =tf.keras.layers.Dense(512, kernel_regularizer = regularizers.l2(l = 0.016),activity_regularizer=regularizers.l1(0.006), bias_regularizer=regularizers.l1(0.006) ,activation='relu', kernel_initializer= tf.keras.initializers.GlorotUniform(seed=seed_value))(x)
# x=tf.keras.layers.Dropout(rate=dropout, seed=seed_value)(x)
outputs=tf.keras.layers.Dense(class_count, activation='softmax',kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed_value))(x)
model=Model(inputs, outputs)

model.compile(Adamax(learning_rate=lr), loss='categorical_crossentropy', metrics=['accuracy'])

In [82]:
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 vgg16 (Functional)          (None, 512)               14714688  
                                                                 
 dense_4 (Dense)             (None, 512)               262656    
                                                                 
 dense_5 (Dense)             (None, 16)                8208      
                                                                 
Total params: 14,985,552
Trainable params: 14,985,552
Non-trainable params: 0
_________________________________________________________________


In [None]:
weights = model.get_weights()