In [2]:
import numpy as np
import os
import cv2
import random
import tensorflow as tf
from pathlib import Path
from tensorflow import keras
from tensorflow.keras import applications, losses, optimizers, metrics, Model
from tensorflow.keras.layers import Layer, Input, Dense, Flatten, Lambda, Conv2D, MaxPooling2D
from tensorflow.keras.applications import VGG16, vgg16
from matplotlib import gridspec
from matplotlib.image import imread
import matplotlib.pyplot as plt
from tensorflow.keras.applications.vgg16 import preprocess_input


In [3]:
# ratio to resize to with cv2.resize(), needs to be (270, 480) if resizing is done with tf.keras.preprocessing.image.load_img()
IMG_SIZE = (224, 224) 

# number of directories used for benign anchor/positive images because we don't need thousands of them
DIR_SIZE = 21

# limit for the number of images read per directory to prevent OOM
COUNT_PER_DIR = 300

# limit for the number of triplets to read per triplet category to prevent OOM
NUM_TRIPLETS = 300

# number of epochs to run per training iteration
EPOCHS = 5

In [13]:
class PreProcessing:

    # training images, labels, and unique labels
    images_train = np.array([])
    labels_train = np.array([])
    unique_train_label = np.array([])

    # map cluster labels to training image indices belonging to that cluster
    map_train_label_indices = dict()

    def __init__(self, data_src, benign_anc_path, benign_pos_path, benign_neg_path):
        self.data_src = data_src
        
        # path to the anchor/positive negative images
        self.benign_anc_path = benign_anc_path
        self.benign_pos_path = benign_pos_path
        self.benign_neg_path = benign_neg_path
        
        print("Loading the Dataset...")
        self.images_train, self.labels_train = self.preprocessing()
        self.unique_train_label = np.unique(self.labels_train)
        self.map_train_label_indices = {label: np.flatnonzero(self.labels_train == label) for label in
                                        self.unique_train_label}
        
        # starting idxs for benign anchor/pos/neg directories
        # needed when getting triplets to locate the respective directories
        self.benign_anc_start = len(os.listdir(self.data_src)) # ex. 23 
        self.benign_pos_start = self.benign_anc_start + DIR_SIZE # ex. 23 + 500 = 523
        self.benign_neg_start = self.benign_pos_start + DIR_SIZE # ex. 523 + 500 = 1023

        print('Preprocessing Done. Summary:')
        print("Images train :", self.images_train.shape)
        print("Labels train :", self.labels_train.shape)
        print("Unique label :", self.unique_train_label)

    def read_dataset(self):
        
        # count how many images we are reading prior so that we don't iteratively append to the array 
        count = 0
        directories = os.listdir(self.data_src)
        
        # without this, the directories will be ordered as such: "1", "100", "2", ...
        
        # add all counts of images in each of the malicious page screenshot directories
        for directory in directories:
            count += len([file for file in os.listdir(os.path.join(self.data_src, directory))])

        # increment count by 1 per directory since only 1 anchor image is used per directory
        benign_anc_dir = os.listdir(self.benign_anc_path)
        for b_dir in benign_anc_dir[:DIR_SIZE]:
            count += 1

        # 30 images are selected per positive directory (could be adjusted per need)
        benign_pos_dir = os.listdir(self.benign_pos_path)
        for b_dir in benign_pos_dir[:DIR_SIZE]:
            count += COUNT_PER_DIR

        # all benign images from the single negative directory is read
        benign_neg_dir = os.listdir(self.benign_neg_path)
        for b_dir in benign_neg_dir:
            count += len([file for file in os.listdir(os.path.join(self.benign_neg_path, b_dir))])

        X = [None] * count
        y = [None] * count
        idx = 0

        # read malicious images
        for directory in directories:
            try:
                print('Read directory: ', directory)
                for pic in os.listdir(os.path.join(self.data_src, directory)):
                    img = tf.keras.preprocessing.image.load_img(os.path.join(self.data_src, directory, pic),interpolation='bilinear')
                    img = tf.keras.preprocessing.image.img_to_array(img)
                    img = cv2.resize(img, IMG_SIZE)

                    X[idx] = preprocess_input(img)                    
                    y[idx] = directory
                    idx += 1

            except Exception as e:
                print('Failed to read images from Directory: ', directory)
                print('Exception Message: ', e)
                
        # read benign anchor images
        for b_dir in benign_anc_dir[:DIR_SIZE]:
            try:
                print('Read benign anchor directory: ', b_dir)
                for pic in os.listdir(os.path.join(self.benign_anc_path, b_dir)):
                    img = tf.keras.preprocessing.image.load_img(os.path.join(self.benign_anc_path, b_dir, pic),interpolation='bilinear')
                    img = tf.keras.preprocessing.image.img_to_array(img)
                    img = cv2.resize(img, IMG_SIZE)

                    X[idx] = preprocess_input(img)
                    # 10,000 offset to prevent labels from overlapping
                    y[idx] = int(b_dir) + 10000
                    idx += 1

            except Exception as e:
                print('Failed to read images from Directory: ', directory)
                print('Exception Message: ', e)
                
        # read benign positive images (augmented)
        for b_dir in benign_pos_dir[:DIR_SIZE]:
            try:
                print('Read benign positive directory: ', b_dir)
                for pic in os.listdir(os.path.join(self.benign_pos_path, b_dir))[:30]:
                    img = tf.keras.preprocessing.image.load_img(os.path.join(self.benign_pos_path, b_dir, pic),interpolation='bilinear')
                    img = tf.keras.preprocessing.image.img_to_array(img)
                    img = cv2.resize(img, IMG_SIZE)

                    X[idx] = preprocess_input(img)
                    # 20,000 offset to prevent labels from overlapping
                    y[idx] = int(b_dir) + 20000
                    idx += 1

            except Exception as e:
                print('Failed to read images from Directory: ', directory)
                print('Exception Message: ', e)
        
        # read benign negative images
        for b_dir in benign_neg_dir:
            try:
                print('Read benign negative directory: ', b_dir)
                for pic in os.listdir(os.path.join(self.benign_neg_path, b_dir)):
                    img = tf.keras.preprocessing.image.load_img(os.path.join(self.benign_neg_path, b_dir, pic),interpolation='bilinear')
                    img = tf.keras.preprocessing.image.img_to_array(img)
                    img = cv2.resize(img, IMG_SIZE)

                    X[idx] = preprocess_input(img)
                    # 30,000 offset to prevent labels from overlapping
                    y[idx] = int(b_dir) + 30000
                    idx += 1

            except Exception as e:
                print('Failed to read images from Directory: ', b_dir)
                print('Exception Message: ', e)
                             
        print('Dataset loaded successfully.')
        return X,y    
    def preprocessing(self):
        X, y = self.read_dataset()

        labels = list(set(y))
        
        # map random directory numbers to 0, 1, 2, ... n-1 and store the mapped labels in Y
        label_dict = dict(zip(labels, range(len(labels))))
        Y = np.asarray([label_dict[label] for label in y])        

        # shuffle the images and labels (in the same order)
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x_shuffled = []
        y_shuffled = []
        
        for index in shuffle_indices:
            x_shuffled.append(X[index])
            y_shuffled.append(Y[index])
        
        return np.asarray(x_shuffled), np.asarray(y_shuffled)

    # (mal, mal, mal) triplet
    def get_triplets(self):
        # include only the malicious cluster labels (label 0 - 23)
        exclude_benign = self.unique_train_label[:self.benign_anc_start]
        
        # label_l is the cluster label from which we choose 2 indices (of anchor & positive training images) from
        # label_r is another cluster label from which we choose 1 index (of negative training image) from
        label_l, label_r = np.random.choice(exclude_benign, 2, replace=False)
        
        # a = anchor, p = positive, n = negative index
        a, p = np.random.choice(self.map_train_label_indices[label_l],2, replace=False)
        n = np.random.choice(self.map_train_label_indices[label_r])
        return a, p, n

    # (mal, mal, benign) triplet
    def get_benign_triplets_1(self):
        # get a random malicious cluster label (from label 0 - 22)
        label_l = np.random.choice(self.unique_train_label[:self.benign_anc_start])

        # get a benign anchor image cluster label (from label 23 - 522)
        label_r = np.random.choice(self.unique_train_label[self.benign_anc_start:self.benign_pos_start])
        
        a, p = np.random.choice(self.map_train_label_indices[label_l],2, replace=False)
        n = np.random.choice(self.map_train_label_indices[label_r])
        
        return a, p, n
    
    # (benign, benign, mal) triplet
    def get_benign_triplets_2(self):
        # get a random anchor benign image cluster label (from label 23 - 522)
        label_a = np.random.choice(self.unique_train_label[self.benign_anc_start:self.benign_pos_start])
        # get a corresponding benign positive image cluster label (from label 523 - 1022)
        label_p = label_a + DIR_SIZE
        # get a malicious image cluster label (from label 0 - 22)
        label_n = np.random.choice(self.unique_train_label[self.benign_anc_start])

        a = np.random.choice(self.map_train_label_indices[label_a])
        p = np.random.choice(self.map_train_label_indices[label_p])
        n = np.random.choice(self.map_train_label_indices[label_n])
        
        return a, p, n        
    
    # (benign, benign, benign) triplet
    def get_benign_triplets_3(self):
        # get a random anchor benign image cluster label (from label 23 - 522)
        label_a = np.random.choice(self.unique_train_label[self.benign_anc_start:self.benign_pos_start])
        # get a corresponding benign positive image cluster label (from label 523 - 1022)
        label_p = label_a + DIR_SIZE
        # get a benign negative image cluster label (from label 1023)
        label_n = self.unique_train_label[self.benign_neg_start]

        a = np.random.choice(self.map_train_label_indices[label_a])
        p = np.random.choice(self.map_train_label_indices[label_p])
        n = np.random.choice(self.map_train_label_indices[label_n])
        
        return a, p, n

    def get_triplets_batch(self):
        idxs_a, idxs_p, idxs_n = [], [], []
        
        # generate 300 triplets per triplet combination (to prevent OOM)
        # (mal, mal, mal) 
        for i in range(NUM_TRIPLETS):
            a, p, n = self.get_triplets()
            idxs_a.append(a)
            idxs_p.append(p)
            idxs_n.append(n)
        
        # (mal, mal, benign)
        for _ in range(NUM_TRIPLETS):
            a, p, n = self.get_benign_triplets_1()
            idxs_a.append(a)
            idxs_p.append(p)
            idxs_n.append(n)
            
        # (benign, benign, mal)
        for _ in range(NUM_TRIPLETS):
            a, p, n = self.get_benign_triplets_2()
            idxs_a.append(a)
            idxs_p.append(p)
            idxs_n.append(n)
            
        # (benign, benign, benign)
        for _ in range(NUM_TRIPLETS):
            a, p, n = self.get_benign_triplets_3()
            idxs_a.append(a)
            idxs_p.append(p)
            idxs_n.append(n)
            
        # shuffling to prevent bias because val_dataset is later taken from the last 20% of the dataset
        shuffle_indices = np.random.permutation(np.arange(NUM_TRIPLETS * 4))
        a_shuffled = []
        p_shuffled = []
        n_shuffled = []
        
        for index in shuffle_indices:
            a_shuffled.append(idxs_a[index])
            p_shuffled.append(idxs_p[index])
            n_shuffled.append(idxs_n[index])

        anchor_dataset = tf.data.Dataset.from_tensor_slices(self.images_train[a_shuffled,:])
        positive_dataset = tf.data.Dataset.from_tensor_slices(self.images_train[p_shuffled,:])
        negative_dataset = tf.data.Dataset.from_tensor_slices(self.images_train[n_shuffled, :])

        dataset = tf.data.Dataset.zip((anchor_dataset, positive_dataset, negative_dataset))
        
        return dataset

In [16]:
dataset = PreProcessing('./data/train', 
                        './data/anchors', 
                        './data/positive', 
                        './data/train')


Loading the Dataset...
Read directory:  alert
Read directory:  button
Read directory:  card
Read directory:  checkbox_checked
Read directory:  checkbox_unchecked
Read directory:  chip
Read directory:  data_table
Read directory:  dropdown_menu
Read directory:  floating_action_button
Read directory:  grid_list
Read directory:  image
Read directory:  label
Read directory:  menu
Read directory:  radio_button_checked
Read directory:  radio_button_unchecked
Read directory:  slider
Read directory:  switch_disabled
Read directory:  switch_enabled
Read directory:  text_area
Read directory:  text_field
Read directory:  tooltip
Read benign negative directory:  alert
Failed to read images from Directory:  alert
Exception Message:  invalid literal for int() with base 10: 'alert'
Read benign negative directory:  button
Failed to read images from Directory:  button
Exception Message:  invalid literal for int() with base 10: 'button'
Read benign negative directory:  card
Failed to read images from Dir

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (30798,) + inhomogeneous part.

In [15]:
data = dataset.get_triplets_batch() # returns a zip dataset


ValueError: 'a' cannot be empty unless no samples are taken