In [57]:
import tensorflow as tf
import keras as k
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random as r
import numpy as np
import cv2
import imghdr
import os
import datetime
import PIL
import io
import sklearn
import itertools
import shutil

from keras import backend
from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.optimizers import RMSprop, Adam, SGD
from tensorflow.keras.applications import ResNet50V2, Xception, EfficientNetB3, EfficientNetB4, EfficientNetB5
from tensorflow.keras import layers
from tensorflow.keras.models import Model, load_model
from collections import Counter
from PIL import ImageFile, Image, ImageOps
from sklearn.metrics import confusion_matrix
from tensorboard.plugins import projector
import tensorflow_datasets

ImageFile.LOAD_TRUNCATED_IMAGES = True
print(f'Tensorflow version {tf.version.VERSION}')
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)


Tensorflow version 2.5.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Data processing

In [58]:
PATH_TRAIN = 'C:/Users/crims/Tensorflow/PublisherIdetification/realpublishers/training/'
PATH_TEST = 'C:/Users/crims/Tensorflow/PublisherIdetification/realpublishers/testing/'

class DataGenerator:
    def __init__(self, val_split, width, height, batch_size, steps):
        self.val_split = val_split
        self.width = width
        self.height = height
        self.batch_size = batch_size
        self.steps = steps
        
    def train_generator(self):    
        train_generator = ImageDataGenerator(rescale=1./255, validation_split=self.val_split)
        print('Training folder:')
        self.train_data = train_generator.flow_from_directory(PATH_TRAIN, target_size=(self.width, self.height),
                                                     class_mode = 'categorical', batch_size=self.batch_size, 
                                                     subset = 'training')
        print('Validation folder:')
        self.valid_data = train_generator.flow_from_directory(PATH_TRAIN, target_size=(self.width, self.height),
                                                     class_mode = 'categorical', batch_size=self.batch_size, 
                                                     subset = 'validation')
        print()

    def test_generator(self):
        test_generator = ImageDataGenerator(rescale=1./255)
        print('Test folder:')
        self.test_data = test_generator.flow_from_directory(PATH_TEST, target_size=(self.width, self.height), 
                                                   class_mode=None, batch_size=1, shuffle=False)
        print()
        
    def generator_info(self):
        # Print info about the generated data
        self.class_num = self.train_data.num_classes
        self.class_counter = list(Counter(self.train_data.classes).values())
        self.class_counter_valid = list(Counter(self.valid_data.classes).values())
        self.class_names = list(self.train_data.class_indices)
        self.labels = self.train_data.labels
        print(f'Train data class name and num {dict(zip(self.class_names, self.class_counter))}')
        print(f'Valid data class name and num {dict(zip(self.class_names, self.class_counter_valid))}')
        print(f'Num files trained {self.batch_size * self.steps} and validated {(self.batch_size * self.steps * self.val_split):.0f} per epoch')
        print(f'Images resized to {self.height}x{self.width} trained avg {(self.train_data.n  // (self.batch_size * self.steps)):.1f} epochs' )
        print()
        
    def clean_data(self):
        # Remove nontype files from data folder (run once EVER for data)
        filenames = self.train_data.filenames
        n = 0
        while n < self.train_data.n:
            path = f'{PATH_TRAIN}{filenames[n]}'
            image = cv2.imread(path)
            img_type = imghdr.what(path)
            if img_type != "jpeg":
                print(f'Removing image from {path}')
                os.remove(path)
                n += 1
            else:
                n += 1
        print('All done!') 

## Tensorboard

In [59]:
class Tensorboard:
    #Code from Tensorflow tutorial Tensorboard
    @staticmethod
    def plot_to_image(figure):
      # Save the plot to a PNG in memory.
      buf = io.BytesIO()
      plt.savefig(buf, format='png')
      plt.close(figure)
      buf.seek(0)
      # Convert PNG buffer to TF image
      image = tf.image.decode_png(buf.getvalue(), channels=4)
      # Add the batch dimension
      image = tf.expand_dims(image, 0)
      return image
    
    @staticmethod           
    def plot_confusion_matrix(cm, class_names):
        # Normalize the confusion matrix.
        cm = np.around(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis], decimals=2)
        # Use white text if squares are dark; otherwise black.
        threshold = cm.max() / 2.
        figure = plt.figure(figsize=(10, 10))
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Greens, vmin=0, vmax=1)
        plt.title("Confusion matrix")
        plt.colorbar()
        tick_marks = np.arange(len(class_names))
        plt.xticks(tick_marks, class_names, rotation=45)
        plt.yticks(tick_marks, class_names)

        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            color = "white" if cm[i, j] > threshold else "black"
            plt.text(j, i, cm[i, j], horizontalalignment="center", color=color)

        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        return figure
    
    # Code from https://gist.github.com/AndrewBMartin/ab06f4708124ccb4cacc4b158c3cef12
    @staticmethod           
    def create_sprite(data):
        # For B&W or greyscale images
        if len(data.shape) == 3:
            data = np.tile(data[..., np.newaxis], (1, 1, 1, 3))

        n = int(np.ceil(np.sqrt(data.shape[0])))
        padding = ((0, n ** 2 - data.shape[0]), (0, 0), (0, 0), (0, 0))
        data = np.pad(data, padding, mode="constant", constant_values=0)

        # Tile images into sprite
        data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3, 4))
        # print(data.shape) => (n, image_height, n, image_width, 3)

        data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])
        # print(data.shape) => (n * image_height, n * image_width, 3)
        return data
    
    @staticmethod
    def plot_to_projector(x, feature_vector, y, class_names, 
                          log_dir="tensorboard_logs/projector", meta_file="metadata.tsv",):
            
        assert x.ndim == 4  # (BATCH, H, W, C)
        if os.path.isdir(log_dir):
            shutil.rmtree(log_dir)

        # Create a new clean fresh folder :)
        os.mkdir(log_dir)
        SPRITES_FILE = os.path.join(log_dir, "sprites.png")
        sprite = Tensorboard.create_sprite(x)
        cv2.imwrite(SPRITES_FILE, sprite)

        # Generate label names
        labels = [class_names[list(y[i]).index(1)] for i in range(int(y.shape[0]))]

        with open(os.path.join(log_dir, meta_file), "w") as f:
            for label in labels:
                f.write("{}\n".format(label))

        if feature_vector.ndim != 2:
            print(
                "NOTE: Feature vector is not of form (BATCH, FEATURES)"
                " reshaping to try and get it to this form!"
            )
            feature_vector = tf.reshape(feature_vector, [feature_vector.shape[0], -1])

            feature_vector = tf.Variable(feature_vector)
            checkpoint = tf.train.Checkpoint(embedding=feature_vector)
            checkpoint.save(os.path.join(log_dir, "embeddings.ckpt"))

            # Set up config
            config = projector.ProjectorConfig()
            embedding = config.embeddings.add()
            embedding.tensor_name = "embedding_file"
            embedding.metadata_path = meta_file
            embedding.sprite.image_path = "sprites.png"
            embedding.sprite.single_image_dim.extend((x.shape[1], x.shape[2]))
            projector.visualize_embeddings(log_dir, config)

## Model

In [60]:
class BuildModel(DataGenerator):
    def __init__(self, data_generator, network, pooling, optimizer, learn_rate, epochs, samples):
        self.width = data_generator.width
        self.height = data_generator.height
        self.class_num = data_generator.class_num
        self.class_names = data_generator.class_names
        self.steps = data_generator.steps
        self.val_steps = data_generator.steps *  data_generator.val_split
        self.batch_size = data_generator.batch_size
        self.train_data = data_generator.train_data
        self.valid_data = data_generator.valid_data
        self.test_data = data_generator.test_data
        
        self.network = network
        self.lr = learn_rate
        self.opt = optimizer
        self.pool = pooling
        self.epochs = epochs
        self.samples = samples
        self.publisher_names = list(self.train_data.class_indices.keys())[0:self.class_num]
        
        # Create labels list
        self.test_num = self.test_data.n / self.class_num
        self.test_labels = np.repeat(list(range(0,(self.class_num))), self.test_num)
        
    def compile_model(self):
        base = self.network(include_top=False, weights='imagenet', 
                     input_shape=(self.width, self.height, 3), pooling=self.pool)
        opt = self.opt(learning_rate=self.lr)  
        x = base.output
        #x = layers.Dense(512, activation= 'relu')(x)
        x = layers.Dense(self.class_num, activation='softmax')(x)
        self.model = tf.keras.Model(base.input, x)
        self.model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer=opt)
    
    def run_model(self, summary, proj, save): 
        if summary == True:
            self.model.summary()
            
        self.current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M")
        self.log_dir = f'{self.network.__name__}{self.pool}_{self.opt.__name__}lr{self.lr}_E{self.epochs}B{self.batch_size}-{self.current_time}'
        
        cm_callback = k.callbacks.LambdaCallback(on_epoch_end= self.log_confusion_matrix)
        image_callback = k.callbacks.LambdaCallback(on_epoch_end= self.log_images)
        scalar_callback = k.callbacks.TensorBoard(log_dir = f'tensorboard_logs/{self.log_dir}',
                                                      update_freq=100, histogram_freq=1, profile_batch=(40,50),
                                                      write_graph=True, write_images=True)
        
        self.file_writer_cm = tf.summary.create_file_writer(f'tensorboard_logs/{self.log_dir}/cm')
        self.file_writer_image = tf.summary.create_file_writer(f'tensorboard_logs/{self.log_dir}/image')

        self.model.fit(self.train_data, validation_data= self.valid_data, 
                       callbacks=[scalar_callback, cm_callback, image_callback], epochs=self.epochs, 
                       steps_per_epoch=self.steps , validation_steps=self.val_steps, 
                       batch_size=self.batch_size, verbose=1)
        
        if proj == True:
            LOG_DIR = 'tensorboard_logs/projector/test'
            src = 'metadata.tsv'
            dst = os.path.join(log_dir, 'metadata.tsv')
            copyfile(src,dst)
            
            feature_vectors = np.lad
            
        if save == True:
            self.model.save(f'{self.network.__name__}_publisherid - {self.current_time}')
            
    def log_confusion_matrix(self, epoch, logs):  
        # Use the model to predict the values from the test_images.
        test_pred_raw = self.model.predict(self.test_data)
        test_pred = np.argmax(test_pred_raw, axis=1)

        # Calculate the confusion matrix using sklearn.metrics
        cm = sklearn.metrics.confusion_matrix(self.test_labels, test_pred)
        figure = Tensorboard.plot_confusion_matrix(cm, class_names=self.class_names)
        cm_image = Tensorboard.plot_to_image(figure)

        # Log the confusion matrix as an image summary.
        with self.file_writer_cm.as_default():
            tf.summary.image(f'{self.network.__name__}-{self.current_time}', cm_image, step=epoch)
            
    def log_images(self, epoch, logs):
        # Create pubs list
        self.pubs = list(r.sample(range(0,49), self.samples)) 
        self.pubs.extend(r.sample(range(50, 99), self.samples))
        self.pubs.extend(r.sample(range(100, 149), self.samples))
        self.pubs.extend(r.sample(range(150, 199), self.samples))
        # Create image paths from test_data
        self.data = []
        for i in self.pubs:
            img = load_img(self.test_data.filepaths[i])
            img = img.resize((self.width, self.height)) # width x height
            img_arr = np.asarray(img)
            self.data.append(img_arr)
            
        # Data should be in (BATCH_SIZE, H, W, C) 
        assert np.size(np.shape(self.data)) == 4
        # Create a figure to contain the plot.
        figure = plt.figure(figsize=(10,10))
        num_images = np.shape(self.data)[0]
        size = int(np.ceil(np.sqrt(num_images)))

        for i in range(len(self.pubs)):
            # Start next subplot.
            plt.subplot(size, size, i + 1, title=self.class_names[self.test_labels[self.pubs[i]]])
            plt.xticks([])
            plt.yticks([])
            plt.grid(False)
            plt.imshow(self.data[i], cmap=plt.cm.binary)

        with self.file_writer_image.as_default():        
                tf.summary.image(f'{self.network.__name__}-{self.current_time}', 
                                 Tensorboard.plot_to_image(figure), max_outputs=len(self.pubs), step=epoch)


In [61]:
#Args: valid split, width, height, batch_size, steps
# batch_size = 32 for Xception and 12 for efficientB5, 16 for B4, 24 for B3
dat1 = DataGenerator(0.2, 200, 300, 32, 10)
dat1.train_generator()
dat1.test_generator()
dat1.generator_info()

# If the position of objects is important Avg pool if not Max Pooling 
# Arguments: network, pooling, optimizer, learn_rate, epochs
mod1 = BuildModel(dat1, Xception, 'avg', Adam, 1e-4, 1, 4) #xception ~ 5e-4, efficient ~ 9e-5
mod1.compile_model()
mod1.run_model(False, True, False)

Training folder:
Found 8311 images belonging to 4 classes.
Validation folder:
Found 2076 images belonging to 4 classes.

Test folder:
Found 1000 images belonging to 1 classes.

Train data class name and num {'IEEE': 949, 'Macmillan': 2863, 'Springer Nature': 3261, 'Wolters Kluwer Health': 1238}
Valid data class name and num {'IEEE': 237, 'Macmillan': 715, 'Springer Nature': 815, 'Wolters Kluwer Health': 309}
Num files trained 320 and validated 64 per epoch
Images resized to 300x200 trained avg 25.0 epochs







RuntimeError: v1.summary.FileWriter is not compatible with eager execution. Use `tf.summary.create_file_writer`,or a `with v1.Graph().as_default():` context

## Predictor

In [None]:
class Predictor(BuildModel):
    def __init__(self, build_model):
        self.test_data = build_model.test_data
        self.valid_data = build_model.valid_data
        self.test_labels = build_model.test_labels
        self.width = build_model.width
        self.height = build_model.height
        self.model = build_model.model
        self.class_names = build_model.class_names
        self.batch_size = build_model.batch_size
             
    def img_predict(self):
        n = r.randint(0, self.test_data.n)
        filenames = self.valid_data.filenames
        path = f'{PATH_TRAIN}{filenames[n]}'
        pic = mpimg.imread(path)
        plt.axis('off')
        plt.imshow(pic)
        plt.show()

        img = tf.keras.preprocessing.image.load_img(path, target_size=(self.width, self.height))
        img_array = tf.keras.preprocessing.image.img_to_array(img)
        img_batch = np.expand_dims(img_array, axis=0)
        img_processed = tf.keras.applications.xception.preprocess_input(img_batch)

        prediction = self.model(img_processed, training=False)
        Top_index = np.argsort(np.max(prediction, axis=0))[-1]
        Second_index = np.argsort(np.max(prediction, axis=0))[-2]

        sort = np.sort(max(prediction))
        print(f'1st predict {self.class_names[Top_index]} with conf {round(sort[len(sort) - 1]*100)}%')
        print(f'2nd predict {self.class_names[Second_index]} with conf {round(sort[len(sort) - 2] * 100)}%')           
        print(f'Answer is {filenames[n][:]}')
        
    def batch_evaluate(self, num_batches):
        self.steps = num_batches
        self.model.evaluate(self.valid_data, batch_size=self.batch_size, steps=self.steps)
        
    def batch_predict(self, num_predicts):
        i = num_predicts
        test_pred_raw = self.model.predict(self.test_data)
        test_pred = np.argmax(test_pred_raw, axis=1)
        test_labels = self.test_labels
        acc = sum(1 for x,y in zip(test_pred[0:i], test_labels[0:i]) if x == y) / len(test_labels[0:i])
        print(f'Accuracy of predictions is {acc*100}%')        
        

In [None]:
pred1 = Predictor(mod1)
pred1.img_predict()
pred1.batch_predict(100)
pred1.batch_evaluate(10)