In [None]:
#importing all necessary libraries
import os
import shutil
from glob import glob

import cv2
import keras
import numpy as np
import PIL
from keras import optimizers
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing import image
from keras.utils import np_utils
from sklearn.datasets import load_files
from tqdm import tqdm

In [None]:
#function to load data from given path
def load_dataset(path, num_classes=16):
    """Collect image file paths and one-hot encoded labels from a directory.

    Args:
        path: directory handed to sklearn's ``load_files``.
        num_classes: number of columns of the one-hot target matrix
            (defaults to 16, the number of categories used in this notebook).

    Returns:
        Tuple ``(doc_files, doc_targets)``: a numpy array of file paths and the
        matching one-hot encoded target matrix.
    """
    # Only the file names and integer targets are used below, so skip reading
    # the raw bytes of every image into memory (load_files does that by default).
    data = load_files(path, load_content=False)
    doc_files = np.array(data['filenames'])
    doc_targets = np_utils.to_categorical(np.array(data['target']), num_classes)
    return doc_files, doc_targets

In [None]:
#Loading data from appropriate directories
#each split yields (file paths, one-hot targets)
train_dir = 'D:/Eduwaive Project/RVL_CDIP/train'
valid_dir = 'D:/Eduwaive Project/RVL_CDIP/validation'
test_dir = 'D:/Eduwaive Project/RVL_CDIP/test'

train_files, train_targets = load_dataset(train_dir)
valid_files, valid_targets = load_dataset(valid_dir)
test_files, test_targets = load_dataset(test_dir)


In [None]:
#doc_names lists the categories (one per sub-folder of the training directory)
#into which an image can be classified.
#The names are sorted because glob returns matches in arbitrary order, while
#load_files numbers the classes by sorted folder name; sorting keeps
#doc_names[j] aligned with class index j.
#basename/normpath replaces the former fixed-offset slice [35:-1], which only
#worked for this exact path prefix.
doc_names = sorted(
    os.path.basename(os.path.normpath(item))
    for item in glob("D:/Eduwaive Project/RVL_CDIP/train/*/")
)
print(doc_names)

In [None]:
#checking the dataset
#the total now covers all three splits (the test split was previously left out)
total_images = len(train_files) + len(valid_files) + len(test_files)
print('There are %d total doc categories.' % len(doc_names))
print('There are %s total doc images.\n' % total_images)
print('There are %d training doc images.' % len(train_files))
print('There are %d validation doc images.' % len(valid_files))
print('There are %d test doc images.'% len(test_files))

In [None]:
#function that converts image into 4D array to facilitate Keras CNN
def convert_4darray(img_path, target_size=(224, 224)):
    """Load one image and return it as a 4D array with a leading batch axis.

    Args:
        img_path: path of the image file to load.
        target_size: (height, width) the image is resized to while loading;
            defaults to (224, 224), the input size of the model built below.

    Returns:
        Array of shape (1, height, width, 3) holding the pixel values as
        loaded; no rescaling is applied here.
    """
    #loads image as PIL.Image.Image type, resized to target_size
    img = image.load_img(img_path, target_size=target_size)
    #convert PIL.Image.Image type to 3D tensor with shape (height, width, 3)
    pixels = image.img_to_array(img)
    #add the batch axis: (height, width, 3) -> (1, height, width, 3)
    return np.expand_dims(pixels, axis=0)

In [None]:
#function which converts all images in given paths to one 4D array for Keras CNN
def convert_4darrays(img_paths):
    """Convert every image path to a 4D tensor and stack them along axis 0.

    Progress is reported through tqdm while the images are loaded.
    """
    tensors = []
    for single_path in tqdm(img_paths):
        tensors.append(convert_4darray(single_path))
    return np.vstack(tensors)

In [None]:
#rescaling the images by dividing every pixel in every image by 255 - preprocess data for Keras
def _load_scaled(paths):
    """Load the images at `paths` as float32 tensors with pixel values divided by 255."""
    return convert_4darrays(paths).astype('float32')/255

train_tensors = _load_scaled(train_files)
test_tensors = _load_scaled(test_files)
valid_tensors = _load_scaled(valid_files)

In [None]:
#Xception model pre-trained on ImageNet, loaded without its top classification layers.
#Author's note: accuracy is low because only little training data was used;
#training on the full dataset is expected to increase it.
base_model = keras.applications.xception.Xception(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

In [None]:
#output tensor of the base model; the layers added below are stacked on top of it
x = base_model.output

In [None]:
#show the base model's output tensor
print(x)

In [None]:
#Flatten layer: applied to the current tensor x, and x is rebound to the flattened result
flatten_layer = Flatten()
x = flatten_layer(x)

In [None]:
#16-way softmax classification layer on top of the flattened features
#(note: `predictions` is the output tensor here; the name is rebound later to the predicted labels)
output_layer = Dense(16, activation='softmax')
predictions = output_layer(x)
#full model: base model input -> softmax output
model = Model(outputs=predictions, inputs=base_model.input)
model.summary()

In [None]:
#number of training epochs used by the fit call
EPOCHS = 50
#SGD optimizer with Nesterov momentum and learning-rate decay
sgd = optimizers.SGD(
    lr=0.01,
    momentum=0.9,
    decay=1e-6,
    nesterov=True,
)
model.compile(
    optimizer=sgd,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
#train on the training tensors and evaluate on the validation tensors after every epoch
fit_model = model.fit(
    train_tensors,
    train_targets,
    batch_size=5,
    epochs=EPOCHS,
    verbose=1,
    validation_data=(valid_tensors, valid_targets),
)

In [None]:
#predicted class index for every test image
#one batched predict call replaces a separate model.predict call per image;
#argmax over axis 1 picks the highest-scoring class of each row
predictions = list(np.argmax(model.predict(test_tensors), axis=1))


In [None]:
#percentage of test images whose predicted class equals the true class
#(the true class is the argmax of the one-hot target row)
true_labels = np.argmax(test_targets, axis=1)
test_accuracy = 100*np.sum(np.array(predictions)==true_labels)/len(predictions)
print('Test accuracy: %.4f%%' % test_accuracy)

In [None]:
#checking our model output on random image
#the image gets the same preprocessing as the training data: resized to 224x224,
#given a batch axis, and rescaled by 1/255. Feeding raw 0-255 pixel values to a
#model trained on rescaled inputs does not give meaningful scores.
test_image = convert_4darray('01075756.tif').astype('float32')/255
result = model.predict(test_image)
print(result)

In [None]:
#knowing which category the image belongs
#the predicted category is the index of the highest softmax score; testing
#int(score)==1 only matched when a score was exactly 1.0 and otherwise printed nothing
for scores in result:
    best = int(np.argmax(scores))
    print(best)
    print(doc_names[best])
            

In [None]:
#making 16 different folders to store the images as classified by the model
output_dir = "C:/Users/sam/Documents/Intake/answers4"
#exist_ok=True keeps this cell re-runnable; os.mkdir raises when the folder already exists
os.makedirs(output_dir, exist_ok=True)
#as before, the working directory ends up inside the output folder
os.chdir(output_dir)

#one folder per class index 0..15
for i in range(16):
    os.makedirs(os.path.join(output_dir, str(i)), exist_ok=True)

In [None]:
#copying the images according to how they are categorized
#each test image goes straight into the folder named after its predicted class
#index; scanning all 16 indices for the matching one is unnecessary
for file_path, label in zip(test_files, predictions):
    shutil.copy2(file_path, 'C:/Users/sam/Documents/Intake/answers4/' + str(label) + '/')

In [None]:
#naming all the folders
#maps the class index (as a string, matching the folder names) to the category name
_category_labels = [
    'advertisement',
    'budget',
    'email',
    'file folder',
    'form',
    'handwritten',
    'invoice',
    'letter',
    'memo',
    'news article',
    'presentation',
    'questionnaire',
    'resume',
    'scientific publication',
    'scientific report',
    'specification',
]
class_name = {str(position): label for position, label in enumerate(_category_labels)}

In [None]:

path = 'C:/Users/sam/Documents/Intake/answers4'

#rename every numbered folder to its category name
#(the former counter variable was never read, so it is dropped)
for folder_index, category in class_name.items():
    os.rename(path+'/'+folder_index, path+'/'+category)