In [12]:
import tensorflow as tf
import keras as k
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random as r
import numpy as np
import cv2
import imghdr
import os
import itertools
import datetime
import PIL
import tensorflow_datasets as tfds
import io

from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.optimizers import RMSprop, Adam, SGD
from tensorflow.keras.applications import ResNet50V2, Xception, EfficientNetB3, EfficientNetB4, EfficientNetB5
from tensorflow.keras import layers
from tensorflow.keras.models import Model, load_model
from keras import backend

from collections import Counter
from PIL import ImageFile, Image, ImageOps
from sklearn.metrics import confusion_matrix

ImageFile.LOAD_TRUNCATED_IMAGES = True
  
print(f'Tensorflow version {tf.version.VERSION}')
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)


Tensorflow version 2.5.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Data processing

In [2]:
PATH_TRAIN = 'C:/Users/crims/Tensorflow/PublisherIdetification/realpublishers/training/'
PATH_TEST = 'C:/Users/crims/Tensorflow/PublisherIdetification/realpublishers/testing/'

class DataGenerator:
    def __init__(self, val_split, height, width, batch_size, steps):
        self.val_split = val_split
        self.height = height
        self.width = width
        self.batch_size = batch_size
        self.steps = steps
    
    def train_generator(self):    
        train_generator = ImageDataGenerator(rescale=1./255, validation_split=self.val_split)
        print('Training folder:')
        self.train_data = train_generator.flow_from_directory(PATH_TRAIN, target_size=(self.width, self.height),
                                                     class_mode = 'categorical', batch_size=self.batch_size, 
                                                     subset = 'training')
        print('Validation folder:')
        self.valid_data = train_generator.flow_from_directory(PATH_TRAIN, target_size=(self.width, self.height),
                                                     class_mode = 'categorical', batch_size=self.batch_size, 
                                                     subset = 'validation')
        print()

    def test_generator(self):
        test_generator = ImageDataGenerator(rescale=1./255)
        print('Test folder:')
        self.test_data = test_generator.flow_from_directory(PATH_TEST, target_size=(WIDTH, HEIGHT), 
                                                   class_mode=None, batch_size=1, shuffle=False)
        print()
        
    def generator_info(self):
        self.class_counter = list(Counter(self.train_data.classes).values())
        self.class_names = list(self.train_data.class_indices)
        self.class_num = self.train_data.num_classes
        print(f'Class name and # {dict(zip(self.class_names, self.class_counter))}')
        print(f'Num files trained {self.batch_size * self.steps} and validated {(self.batch_size * self.steps * self.val_split):.0f} per epoch')
        print(f'Images resized to {self.height}x{self.width} trained avg { (self.train_data.n  // (self.batch_size * self.steps)):.1f} epochs' )
        print()
        
    def clean_data(self):
        self.filenames = self.train_data.filenames
        n = 0
        while n < self.train_data.n:
            path = f'{PATH_TRAIN}{self.filenames[n]}'
            image = cv2.imread(path)
            img_type = imghdr.what(path)
            if img_type != "jpeg":
                print(f'Removing image from {path}')
                os.remove(path)
                n += 1
            else:
                n += 1
        print('All done!') 

In [4]:
def plot_to_image(figure):
    """Converts the matplotlib plot specified by 'figure' to a PNG image and
    returns it. The supplied figure is closed and inaccessible after this call."""

    # Save the plot to a PNG in memory.
    buf = io.BytesIO()
    plt.savefig(buf, format="png")

    # Closing the figure prevents it from being displayed directly inside
    # the notebook.
    plt.close(figure)
    buf.seek(0)

    # Convert PNG buffer to TF image
    image = tf.image.decode_png(buf.getvalue(), channels=4)

    # Add the batch dimension
    image = tf.expand_dims(image, 0)
    return image


def image_grid(data, labels, class_names):
    # Data should be in (BATCH_SIZE, H, W, C)
    assert data.ndim == 4

    figure = plt.figure(figsize=(10, 10))
    num_images = data.shape[0]
    size = int(np.ceil(np.sqrt(num_images)))

    for i in range(data.shape[0]):
        plt.subplot(size, size, i+1 , title=class_names[labels[i]])
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)

        # if grayscale
        if data.shape[3] == 1:
            plt.imshow(data[i], cmap=plt.cm.binary)

        else:
            plt.imshow(data[i])

    return figure


In [30]:
# img = load_img(dat1.train_data.filepaths[1])
# img_arr = img_to_array(img)
# data = np.reshape(img_arr, (1, 182, 140, -1))




TypeError: 'int' object is not callable

In [75]:
data = []
for i in range(5):
    innerlist = []
    for p in range(5):
        img = load_img(dat1.train_data.filepaths[i])
        img_arr = img_to_array(img)
        innerlist.append(img_arr)
    data.append(innerlist)
    
print(np.shape(data))
data
        #img_arr = img_to_array(img)
        #y, x, z = img_arr.shape
        #print(img_arr.shape)
        #data = np.reshape(img_arr, (i+1, y, x, -1))
    
    #data.shape
    #num_images = data.shape[0]
    #int(np.ceil(np.sqrt(num_images)))

(5, 5)


[[array([[[ 39.,  27.,  67.],
          [ 35.,  26.,  53.],
          [ 29.,  29.,  27.],
          ...,
          [ 26.,  35.,  14.],
          [ 28.,  33.,  27.],
          [ 19.,  20.,  22.]],
  
         [[ 34.,  25.,  52.],
          [  1.,   0.,   9.],
          [ 25.,  29.,  12.],
          ...,
          [  7.,  23.,   0.],
          [ 25.,  34.,  15.],
          [  0.,   3.,   0.]],
  
         [[ 22.,  22.,  20.],
          [ 27.,  31.,  14.],
          [172., 188., 139.],
          ...,
          [176., 201., 135.],
          [161., 178., 134.],
          [ 27.,  42.,   9.]],
  
         ...,
  
         [[ 10.,  36.,   0.],
          [  1.,  27.,   0.],
          [163., 193., 133.],
          ...,
          [169., 203., 119.],
          [167., 197., 127.],
          [  4.,  30.,   0.]],
  
         [[ 26.,  42.,  29.],
          [ 32.,  49.,  31.],
          [157., 177., 150.],
          ...,
          [164., 189., 132.],
          [146., 165., 120.],
          [ 29.,  45.,

In [32]:
%load_ext tensorboard
%tensorboard --logdir tensorboard_logs/

Reusing TensorBoard on port 6006 (pid 15936), started 3:59:07 ago. (Use '!kill 15936' to kill it.)

## Model

In [23]:
class BuildModel:
    def __init__(self, network, pooling, optimizer, learn_rate, epochs):
        self.network = network
        self.lr = learn_rate
        self.opt = optimizer
        self.pool = pooling
        self.epochs = epochs
        self.val_batch = dat1.steps * dat1.val_split
        
    def compile_model(self):
        base = self.network(include_top=False, weights='imagenet', 
                     input_shape=(dat1.width, dat1.height, 3), pooling=self.pool)
        opt = self.opt(learning_rate=self.lr)
        
        x = base.output
        x = layers.Dense(dat1.class_num, activation='softmax')(x)
        self.model = tf.keras.Model(base.input, x)
        self.model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer=opt)    
              
    def run_model(self, summary, save):
        if summary == True:
            self.model.summary()
            
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M")    
        tensorflow_callback = k.callbacks.TensorBoard(
            log_dir= f'tensorboard_logs/{self.network.__name__}{self.pool}_{self.opt.__name__}lr{self.lr}_E{self.epochs}B{dat1.batch_size}-{current_time}', 
            histogram_freq=1, write_graph=True, write_images=True)
        #file_writer_cm = tf.summary.create_file_writer(f'tensorboard_logs/{NETWORK}_{OPTIMIZER}_E{EPOCHS}B{BATCH_SIZE}_lr{LEARN_RATE}_pool{POOLING}/cm')       
        history = self.model.fit(dat1.train_data, validation_data=dat1.valid_data, 
                            callbacks=[tensorflow_callback], epochs=self.epochs, 
                            steps_per_epoch=dat1.steps , validation_steps=self.val_batch, 
                            batch_size=dat1.batch_size, verbose =1)
          
        if save == True:
            model.save(f'{NETWORK}_publisherid - {current_time}')


In [24]:
# Arguments: val_split, height, width, batch_size, steps
dat1 = DataGenerator(0.25, 225, 150, 32, 10)
dat1.train_generator()
dat1.generator_info()
# Arguments: network, pooling, optimizer, learn_rate, epochs
mod1 = BuildModel(EfficientNetB4, 'max', Adam, 1e-4, 1) #xception ~ 5e-4, efficient ~ 9e-5
# If the position of objects is important Avg pool if not Max Pooling 
mod1.compile_model()
# Arguments: Summary, save model
#mod1.run_model(False, False)

Training folder:
Found 9809 images belonging to 4 classes.
Validation folder:
Found 3269 images belonging to 4 classes.

Class name and # {'IEEE': 144, 'Macmillan': 2586, 'Springer Nature': 5768, 'Wolters Kluwer Health': 1311}
Num files trained 320 and validated 80 per epoch
Images resized to 225x150 trained avg 30.0 epochs



## Predictor

In [None]:
class Predictor:
    def randimg_predict(self):
        n = r.randint(0, dat1.valid_data.n)
        filenames = dat1.valid_data.filenames
        path = f'{PATH_TRAIN}{filenames[n]}'
        pic = mpimg.imread(path)
        plt.axis('off')
        plt.imshow(pic)
        plt.show()

        img = tf.keras.preprocessing.image.load_img(path, target_size=(dat1.width, dat1.height))
        img_array = tf.keras.preprocessing.image.img_to_array(img)
        img_batch = np.expand_dims(img_array, axis=0)
        img_processed = tf.keras.applications.xception.preprocess_input(img_batch)

        prediction = mod1.model(img_processed, training=False)
        Top_index = np.argsort(np.max(prediction, axis=0))[-1]
        Second_index = np.argsort(np.max(prediction, axis=0))[-2]

        sort = np.sort(max(prediction))
        print(f'1st predict {dat1.class_names[Top_index]} with conf {round(sort[len(sort) - 1]*100)}%')
        print(f'2nd predict {dat1.class_names[Second_index]} with conf {round(sort[len(sort) - 2] * 100)}%')           
        print(f'Answer is {filenames[n][:]}')
        
    def batch_predict(self, steps):
        self.steps = steps
        mod1.model.evaluate(dat1.valid_data, batch_size=dat1.batch_size, steps=self.steps, return_dict=True)


In [None]:
predict1 = Predictor()
predict1.randimg_predict()
predict1.batch_predict(100)