In [13]:
import tensorflow as tf
import keras as k
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random as r
import numpy as np
import cv2
import imghdr
import os
import itertools

from sklearn.metrics import confusion_matrix
from keras.preprocessing.image import ImageDataGenerator, load_img
from tensorflow.keras.applications import ResNet50V2, Xception, EfficientNetB4
from tensorflow.keras import layers
from tensorflow.keras.models import Model, load_model
from keras import backend
from collections import Counter
from PIL import ImageFile, Image
ImageFile.LOAD_TRUNCATED_IMAGES = True

CLEAN = False
  
print(f'Tensorflow version {tf.version.VERSION}')
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)
print(f'Removing nontype files is: {CLEAN}')


Tensorflow version 2.5.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Removing nontype files is: False


## Data processing

In [14]:
VAL_SPLIT = 0.25
HEIGHT = 300  #common aspect 2:3
WIDTH = 200
BATCH_SIZE = 16
PATH_TRAIN = 'C:/Users/crims/Tensorflow/PublisherIdetification/realpublishers/training/'
PATH_TEST = 'C:/Users/crims/Tensorflow/PublisherIdetification/realpublishers/testing/'

train_generator = ImageDataGenerator(rescale=1./255, validation_split=VAL_SPLIT)
test_generator = ImageDataGenerator(rescale=1./255)

print('Training folder:')
train_data = train_generator.flow_from_directory(PATH_TRAIN, target_size=(WIDTH, HEIGHT),
                                                 class_mode = 'categorical', batch_size=BATCH_SIZE, 
                                                 subset = 'training')

class_counter = list(Counter(train_data.classes).values())
class_names = list(train_data.class_indices)
class_num = train_data.num_classes
print(f'Class and num {dict(zip(class_names, class_counter))}') 

print()
print('Validation folder:')
valid_data = train_generator.flow_from_directory(PATH_TRAIN, target_size=(WIDTH, HEIGHT),
                                                 class_mode = 'categorical', batch_size=BATCH_SIZE, 
                                                 subset = 'validation')
print()
print('Test folder:')
test_data = test_generator.flow_from_directory(PATH_TEST, target_size=(WIDTH, HEIGHT), 
                                               class_mode=None, batch_size=1, 
                                               shuffle=False)


Training folder:
Found 9809 images belonging to 4 classes.
Class and num {'IEEE': 144, 'Macmillan': 2586, 'Springer Nature': 5768, 'Wolters Kluwer Health': 1311}

Validation folder:
Found 3269 images belonging to 4 classes.

Test folder:
Found 0 images belonging to 1 classes.


## Cleaning data (run only once per dataset)

In [15]:
if CLEAN == True:
    filenames = train_data.filenames
    n = 0
    while n < train_data.n:
        path = f'{PATH_TRAIN}{filenames[n]}'
        image = cv2.imread(path)
        img_type = imghdr.what(path)
        if img_type != "jpeg":
            print(f'Removing image from {path}')
            os.remove(path)
            n += 1
        else:
            n += 1
    print('All done!')    
else:
    print('Skipping clean')

Skipping clean


In [16]:
# size = os.stat(real_path).st_size
# def getmd5(filename):
#     file_txt = open(filename,'rb').read()
#     m = hashlib.md5(file_txt)
#     return m.hexdigest()
# if size in all_size.keys():
#     new_md5 = getmd5(real_path)
#     if all_size[size][1] == '':
#         all_size[size][1] = getmd5(all_size[size][0])
#     if new_md5 in all_size[size]:
#         os.remove(real_path)
#         Print('delete', file)
#         total_delete += 1
#     else:
#         all_size[size].append(new_md5)
# else:
#     all_size[size] = name_and_md5

In [17]:
def plot_confusion_matrix(cm, class_names):
  """
  Returns a matplotlib figure containing the plotted confusion matrix.
 
  Args:
    cm (array, shape = [n, n]): a confusion matrix of integer classes
    class_names (array, shape = [n]): String names of the integer classes
  """
  figure = plt.figure(figsize=(8, 8))
  plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
  plt.title("Confusion matrix")
  plt.colorbar()
  tick_marks = np.arange(len(class_names))
  plt.xticks(tick_marks, class_names, rotation=45)
  plt.yticks(tick_marks, class_names)
 
  # Normalize the confusion matrix.
  cm = np.around(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis], decimals=2)
 
  # Use white text if squares are dark; otherwise black.
  threshold = cm.max() / 2.
  for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    color = "white" if cm[i, j] > threshold else "black"
    plt.text(j, i, cm[i, j], horizontalalignment="center", color=color)
 
  plt.tight_layout()
  plt.ylabel('True label')
  plt.xlabel('Predicted label')
  return figure

In [19]:
def log_confusion_matrix(epoch, logs):
  # Use the model to predict the values from the validation dataset.
  test_pred_raw = model.predict(x_test)
  test_pred = np.argmax(test_pred_raw, axis=1)
  y_test_cls = np.argmax(y_test, axis=1)
 
  # Calculate the confusion matrix.
  cm = confusion_matrix(y_test_cls, test_pred)
 
  figure = plot_confusion_matrix(cm, class_names=class_names)
  cm_image = plot_to_image(figure)
 
  # Log the confusion matrix as an image summary.
  with file_writer_cm.as_default():
    tf.summary.image("Confusion Matrix", cm_image, step=epoch)
 
# Define the per-epoch callback to plot confusion metrics after each epoch.
cm_callback = tf.keras.callbacks.LambdaCallback(on_epoch_end=log_confusion_matrix)

## Model setup & parameters

In [22]:
#Parameters
NETWORK = 'Efficient'
LEARN_RATE = 5e-4    #xception ~ 5e-4, efficient ~ 9e-5 
OPTIMIZER = 'Adam'  #adam, rms, sgd
POOLING = 'avg'     #max, avg, none
EPOCHS = 5
NUM_TRAIN_BATCHES = 100 #BATCH_SIZE * NUM_TRAIN_BATCHES is images processed per epoch
NUM_VAL_BATCHES = NUM_TRAIN_BATCHES * VAL_SPLIT

choice_model = {'Xception' : Xception(include_top=False, weights='imagenet', 
                                        input_shape=(WIDTH, HEIGHT, 3), pooling=POOLING),
                'Efficient' : EfficientNetB4(include_top=False, weights='imagenet', 
                                        input_shape=(WIDTH, HEIGHT, 3), pooling=POOLING),
                'Resnet50' : ResNet50V2(include_top=False, weights='imagenet', 
                                        input_shape=(WIDTH, HEIGHT, 3), pooling=POOLING)}
base = choice_model[NETWORK]

choice_opt = {'RMS' : tf.keras.optimizers.RMSprop(learning_rate=LEARN_RATE), #default 1e-2
              'Adam': tf.keras.optimizers.Adam(learning_rate=LEARN_RATE), #default 1e-3
               'SGD' : tf.keras.optimizers.SGD(learning_rate=LEARN_RATE)}  #default 1e-3
opt = choice_opt[OPTIMIZER]

x = base.output
#x = layers.Dense(32, activation='relu')(x)
x = layers.Dense(class_num, activation='softmax')(x)
model = Model(base.input, x)
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer=opt)

tensorflow_callback = k.callbacks.TensorBoard(
    log_dir= f'tensorboard_logs/{NETWORK}_{OPTIMIZER}_E{EPOCHS}B{BATCH_SIZE}_lr{LEARN_RATE}_pool{POOLING}', 
    histogram_freq=1, write_graph=True, write_images=True
)

file_writer_cm = tf.summary.create_file_writer(f'tensorboard_logs/{NETWORK}_{OPTIMIZER}_E{EPOCHS}B{BATCH_SIZE}_lr{LEARN_RATE}_pool{POOLING}/cm')
model.summary()


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 200, 300, 3) 0                                            
__________________________________________________________________________________________________
rescaling_3 (Rescaling)         (None, 200, 300, 3)  0           input_11[0][0]                   
__________________________________________________________________________________________________
normalization_3 (Normalization) (None, 200, 300, 3)  7           rescaling_3[0][0]                
__________________________________________________________________________________________________
stem_conv_pad (ZeroPadding2D)   (None, 201, 301, 3)  0           normalization_3[0][0]            
____________________________________________________________________________________________

## Run model

In [23]:
#Train model
history = model.fit(train_data, validation_data=valid_data, callbacks=[tensorflow_callback, cm_callback],
                       epochs=EPOCHS, steps_per_epoch=NUM_TRAIN_BATCHES, validation_steps=NUM_VAL_BATCHES, 
                       batch_size=BATCH_SIZE, verbose =1)

model.save(f'{NETWORK}_publisherid')



Epoch 1/5


NameError: name 'test_images' is not defined

## Model evaluation 

In [None]:
plt.figure(figsize=(12, 6))
plt.subplot(121)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.ylim([0, 1])
plt.xlabel('epoch')
plt.xlim([0, EPOCHS])
plt.grid()
plt.legend(['train', 'valid'], loc='upper left')

plt.subplot(122)
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.ylim([0.3, 1])
plt.xlabel('epoch')
plt.xlim([0, EPOCHS])
plt.legend(['train', 'valid'], loc='upper left')
plt.grid()
plt.show()

## Predictor

In [None]:
n = r.randint(0,valid_data.n)
filenames = valid_data.filenames
path = f'{PATH_TRAIN}{filenames[n]}'
pic = mpimg.imread(path)
plt.imshow(pic)
plt.show()

img = tf.keras.preprocessing.image.load_img(path, target_size=(WIDTH, HEIGHT))
img_array = tf.keras.preprocessing.image.img_to_array(img)
img_batch = np.expand_dims(img_array, axis=0)
img_processed = tf.keras.applications.xception.preprocess_input(img_batch)

prediction = model.predict(img_processed)
Top_index = np.argsort(np.max(prediction, axis=0))[-1]
Second_index = np.argsort(np.max(prediction, axis=0))[-2]

sort = np.sort(max(prediction))
print(f'1st predict {class_names[Top_index]} with confidence {round(sort[len(sort) - 1]*100, 3)}%')
print(f'2nd predict {class_names[Second_index]} with confidence {round(sort[len(sort) - 2] * 100, 3)}%')           
print(f'Answer is {filenames[n][:]}')
