In [1]:
import tensorflow as tf
import keras as k
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random as r
import numpy as np
import cv2
import imghdr
import os
import datetime
import PIL
import io

from keras import backend
from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.optimizers import RMSprop, Adam, SGD
from tensorflow.keras.applications import ResNet50V2, Xception, EfficientNetB3, EfficientNetB4, EfficientNetB5
from tensorflow.keras import layers
from tensorflow.keras.models import Model, load_model
from collections import Counter
from PIL import ImageFile, Image, ImageOps
from sklearn.metrics import confusion_matrix
  
print(f'Tensorflow version {tf.version.VERSION}')
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)


Tensorflow version 2.5.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Data processing

In [124]:
PATH_TRAIN = 'C:/Users/crims/Tensorflow/PublisherIdetification/realpublishers/training/'
PATH_TEST = 'C:/Users/crims/Tensorflow/PublisherIdetification/realpublishers/testing/'

class DataGenerator:
    def __init__(self, val_split, height, width, batch_size, steps):
        self.val_split = val_split
        self.height = height
        self.width = width
        self.batch_size = batch_size
        self.steps = steps
        
    def args(self):
        return (self.width, self.height)
    
    def train_generator(self):    
        train_generator = ImageDataGenerator(rescale=1./255, validation_split=self.val_split)
        print('Training folder:')
        self.train_data = train_generator.flow_from_directory(PATH_TRAIN, target_size=(self.width, self.height),
                                                     class_mode = 'categorical', batch_size=self.batch_size, 
                                                     subset = 'training')
        print('Validation folder:')
        self.valid_data = train_generator.flow_from_directory(PATH_TRAIN, target_size=(self.width, self.height),
                                                     class_mode = 'categorical', batch_size=self.batch_size, 
                                                     subset = 'validation')
        print()

    def test_generator(self):
        test_generator = ImageDataGenerator(rescale=1./255)
        print('Test folder:')
        self.test_data = test_generator.flow_from_directory(PATH_TEST, target_size=(self.width, self.height), 
                                                   class_mode=None, batch_size=1, shuffle=False)
        print()
        
    def generator_info(self):
        self.class_num = self.train_data.num_classes
        self.class_counter = list(Counter(self.train_data.classes).values())
        self.class_names = list(self.train_data.class_indices)
        self.labels = self.train_data.labels
        print(f'Class name and # {dict(zip(self.class_names, self.class_counter))}')
        print(f'Num files trained {self.batch_size * self.steps} and validated {(self.batch_size * self.steps * self.val_split):.0f} per epoch')
        print(f'Images resized to {self.height}x{self.width} trained avg {(self.train_data.n  // (self.batch_size * self.steps)):.1f} epochs' )
        print()
        
    def clean_data(self):
        filenames = self.train_data.filenames
        n = 0
        while n < self.train_data.n:
            path = f'{PATH_TRAIN}{filenames[n]}'
            image = cv2.imread(path)
            img_type = imghdr.what(path)
            if img_type != "jpeg":
                print(f'Removing image from {path}')
                os.remove(path)
                n += 1
            else:
                n += 1
        print('All done!') 

In [83]:
# Arguments: val_split, height, width, batch_size, steps
dat1 = DataGenerator(0.25, 225, 150, 32, 100)
dat1.train_generator()
dat1.generator_info()

Training folder:
Found 9809 images belonging to 4 classes.
Validation folder:
Found 3269 images belonging to 4 classes.

Class name and # {'IEEE': 144, 'Macmillan': 2586, 'Springer Nature': 5768, 'Wolters Kluwer Health': 1311}
Num files trained 3200 and validated 800 per epoch
Images resized to 225x150 trained avg 3.0 epochs



## Model

In [127]:
class Model(DataGenerator):
    def __init__(self, network, pooling, optimizer, learn_rate, epochs):
        super(Model, self).__init__(*args)
        self.network = network
        self.lr = learn_rate
        self.opt = optimizer
        self.pool = pooling
        self.epochs = epochs
        
    def compile_model(self):
        base = self.network(include_top=False, weights='imagenet', 
                     input_shape=(self.width, self.height, 3), pooling=self.pool)
        opt = self.opt(learning_rate=self.lr)
        
        x = base.output
        x = layers.Dense(self.class_num, activation='softmax')(x)
        self.model = tf.keras.Model(base.input, x)
        self.model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer=opt)    
              
    def run_model(self, summary, save): 
        self.current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M")
        self.log_dir = f'{self.network.__name__}{self.pool}_{self.opt.__name__}lr{self.lr}_E{self.epochs}B{dat1.batch_size}-{current_time}'
        tensorflow_callback = k.callbacks.TensorBoard(log_dir = f'tensorboard_logs/{self.log_dir}',
                                                      histogram_freq=1, write_graph=True, write_images=True)
        
        history = self.model.fit(self.train_data, validation_data=self.valid_data, 
                            callbacks=[tensorflow_callback], epochs=self.epochs, 
                            steps_per_epoch=self.steps , validation_steps=(DataGenerator.steps * DataGenerator.val_split), 
                            batch_size=self.batch_size, verbose =1)
        if summary == True:
            self.model.summary()
        if save == True:
            self.model.save(f'{NETWORK}_publisherid - {current_time}')


In [128]:
# If the position of objects is important Avg pool if not Max Pooling 
# Arguments: network, pooling, optimizer, learn_rate, epochs
mod1 = Model(EfficientNetB4, 'max', Adam, 8e-5, 30) #xception ~ 5e-4, efficient ~ 9e-5
mod1.compile_model()
# Arguments: Summary, save model
#mod1.run_model(False, True)

TypeError: __init__() missing 2 required positional arguments: 'width' and 'height'

## Tensorboard

In [75]:
class Tensorboard:
    def __init__(self, samples):
        self.samples = samples
        self.data = []
        self.pubs = []
        self.publisher_names = list(dat1.train_data.class_indices.keys())[0:dat1.class_num]
        self.pubs = r.sample(range(0,dat1.class_counter[0]-1), samples) 
        self.pubs.extend(r.sample(range(dat1.class_counter[0],dat1.class_counter[1]+dat1.class_counter[0]-1), samples))
        self.pubs.extend(r.sample(range(dat1.class_counter[0]+dat1.class_counter[1],dat1.class_counter[0]+dat1.class_counter[1]+dat1.class_counter[2]-1), samples))
        self.pubs.extend(r.sample(range(dat1.class_counter[0]+dat1.class_counter[1]+dat1.class_counter[2],dat1.class_counter[0]+dat1.class_counter[1]+dat1.class_counter[2]+dat1.class_counter[3]-1), samples))
        
        for i in pubs:
            img = load_img(dat1.train_data.filepaths[i])
            img = img.resize((150,225)) # width x height
            img_arr = np.asarray(img)
            self.data.append(img_arr)
            
    # Code adapted from Tensorboard tutorial
    def plot_to_image(figure):
      # Save the plot to a PNG in memory.
      buf = io.BytesIO()
      plt.savefig(buf, format='png')
      plt.close(figure)
      buf.seek(0)
      # Convert PNG buffer to TF image
      image = tf.image.decode_png(buf.getvalue(), channels=4)
      # Add the batch dimension
      image = tf.expand_dims(image, 0)
      return image

    def log_images(self):
        logdir = f'tensorboard_logs/{mod1.log_dir}/image'
        file_writer = tf.summary.create_file_writer(logdir)
        # Data should be in (BATCH_SIZE, H, W, C) 
        assert np.size(np.shape(self.data)) == 4
      # Create a figure to contain the plot.
        figure = plt.figure(figsize=(10,10))
        num_images = np.shape(data)[0]
        size = int(np.ceil(np.sqrt(num_images)))

        for i in range(len(pubs)):
            # Start next subplot.
            plt.subplot(size, size, i + 1, title=class_names[labels[pubs[i]]])
            plt.xticks([])
            plt.yticks([])
            plt.grid(False)
            #plt.imshow(data[i], cmap=plt.cm.binary)
        return figure

        with file_writer.as_default():        
                tf.summary.image(f'{len(pubs)} examples of training data', self.plot_to_image(figure), max_outputs=len(pubs), step=0)



In [76]:
ten1 = Tensorboard(3)
ten1.log_images()


AttributeError: 'BuildModel' object has no attribute 'log_dir'

## Predictor

In [None]:
class Predictor:
    def randimg_predict(self):
        n = r.randint(0, dat1.valid_data.n)
        filenames = dat1.valid_data.filenames
        path = f'{PATH_TRAIN}{filenames[n]}'
        pic = mpimg.imread(path)
        plt.axis('off')
        plt.imshow(pic)
        plt.show()

        img = tf.keras.preprocessing.image.load_img(path, target_size=(dat1.width, dat1.height))
        img_array = tf.keras.preprocessing.image.img_to_array(img)
        img_batch = np.expand_dims(img_array, axis=0)
        img_processed = tf.keras.applications.xception.preprocess_input(img_batch)

        prediction = mod1.model(img_processed, training=False)
        Top_index = np.argsort(np.max(prediction, axis=0))[-1]
        Second_index = np.argsort(np.max(prediction, axis=0))[-2]

        sort = np.sort(max(prediction))
        print(f'1st predict {dat1.class_names[Top_index]} with conf {round(sort[len(sort) - 1]*100)}%')
        print(f'2nd predict {dat1.class_names[Second_index]} with conf {round(sort[len(sort) - 2] * 100)}%')           
        print(f'Answer is {filenames[n][:]}')
        
    def batch_predict(self, steps):
        self.steps = steps
        mod1.model.evaluate(dat1.valid_data, batch_size=dat1.batch_size, steps=self.steps, return_dict=True)


In [None]:
predict1 = Predictor()
predict1.randimg_predict()
predict1.batch_predict(10)