In [1]:
from keras.preprocessing.image import img_to_array
from sklearn.preprocessing import LabelBinarizer
from keras.applications import VGG16

from sklearn.utils import shuffle
from imutils import paths
import numpy as np
import progressbar
import h5py as h5
import imutils
import cv2
import os

Using TensorFlow backend.


# Image Processing

## Variables

In [2]:
def _progress_widgets(title):
    """Return the widget list shared by the progress bars: title, percentage, bar and ETA."""
    return [title, progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()]


# Directory holding the cat/dog images. The CATDOG_IMAGES_DIR environment variable
# overrides the default so the notebook can run on another machine without edits.
images_dir = os.environ.get(
    'CATDOG_IMAGES_DIR',
    'C:/Users/Tajr/Desktop/Data/RadonPlus/RadonTechnology/Dev/Deep Learning/Datasets/computervision/CatDog/All/',
)
processing_widgets = _progress_widgets('Processing Images: ')
loading_widgets = _progress_widgets('Loading Images: ')

## Helper

## Preprocessor

In [3]:
class Preprocessor:
    """Load images from disk, bring them to a fixed size and derive labels from file names.

    Attributes:
        width: target image width in pixels.
        height: target image height in pixels.
        inter: OpenCV interpolation flag used for every resize.
        dataFormat: value forwarded to Keras ``img_to_array`` as ``data_format``.
    """

    # constructor
    def __init__(self, width, height, inter=cv2.INTER_AREA, dataFormat=None):
        self.width = width
        self.height = height
        self.inter = inter
        self.dataFormat = dataFormat

    # path getter
    def get_images_path(self, images_dir):
        """Return the image file paths found under ``images_dir`` as a list."""
        return list(paths.list_images(images_dir))

    # images loader
    def load_images_labels(self, images_path):
        """Read, resize and scale every image and extract its label.

        The label is the part of the file name before the first dot, e.g.
        ``cat.12.jpg`` gives ``cat``.

        Args:
            images_path: sequence of image file paths.

        Returns:
            Tuple ``(images_data, labels)`` of NumPy arrays. Pixel values are
            divided by 255.

        Raises:
            OSError: if a file cannot be decoded by ``cv2.imread``.
        """
        images_data = []
        labels = []

        # start a progress bar
        prog_load = progressbar.ProgressBar(maxval=len(images_path), widgets=loading_widgets).start()

        # load, preprocess images and extract labels
        for (count, path) in enumerate(images_path, start=1):
            image = cv2.imread(path)
            if image is None:
                # cv2.imread signals an unreadable file by returning None; fail
                # with the offending path instead of an AttributeError later on.
                raise OSError('Could not read image file: {}'.format(path))

            image = self.resize(image)
            image = self.img2array(image)
            images_data.append(image)

            # get labels; basename copes with whichever separator the OS uses
            image_file = os.path.basename(path)
            labels.append(image_file.split('.')[0])

            # update progress bar with the number of images done so far
            prog_load.update(count)

        # convert images list to numpy array and scale pixel intensities
        # NOTE(review): astype('float') produces float64; for the 8000 images of
        # 224x224x3 recorded in this notebook that is roughly 9.6 GB. Consider
        # float32 if downstream code does not depend on the dtype.
        images_data = np.array(images_data).astype('float') / 255
        labels = np.array(labels)

        # finish progress bar
        prog_load.finish()

        return (images_data, labels)

    # resize image
    def resize(self, image):
        """Resize ``image`` to ``(self.width, self.height)`` with a centre crop.

        The shorter side is resized to its target via ``imutils.resize``
        (presumably keeping the aspect ratio), the excess along the other side
        is cropped equally from both ends, and a final ``cv2.resize`` forces the
        exact target size.
        """
        # grab (height, width) and initialize deltas
        (h, w) = image.shape[:2]
        dH = 0
        dW = 0

        # resize along the shorter side; the deltas are clamped at zero because
        # a negative offset (possible with non-square targets) would turn the
        # crop below into an arbitrary slice taken from the end of the image
        if w < h:
            image = imutils.resize(image, width=self.width, inter=self.inter)
            dH = max(0, int((image.shape[0] - self.height) / 2.0))
        else:
            image = imutils.resize(image, height=self.height, inter=self.inter)
            dW = max(0, int((image.shape[1] - self.width) / 2.0))

        # cropping
        (h, w) = image.shape[:2]
        image = image[dH:h - dH, dW:w - dW]

        # resize back to the given spatial dimension (the crop can be off by a pixel)
        return cv2.resize(image, (self.width, self.height), interpolation=self.inter)

    # convert image to array
    def img2array(self, image):
        """Convert ``image`` with Keras ``img_to_array`` using the configured data format."""
        return img_to_array(image, data_format=self.dataFormat)
        

# Dataset

## Initialize Preprocessor

In [4]:
# Preprocessor that resizes every image to 224x224 (the width/height used by Preprocessor.resize).
preprocessor = Preprocessor(width=224, height=224)

## Get Images Path

In [5]:
# Sort first so the starting order does not depend on how the filesystem lists the
# directory, then shuffle with a fixed seed so every run yields the same order.
images_path = shuffle(sorted(preprocessor.get_images_path(images_dir)), random_state=42)

## Load Images and Labels

In [6]:
# Reads every image into memory and derives one label per file name.
# The recorded run below took about eight minutes.
(images, targets) = preprocessor.load_images_labels(images_path)

Loading Images: 100% |##########################################| Time: 0:08:10


In [7]:
# Sanity check: one 224x224 array per image (see the recorded output below).
images.shape

(8000, 224, 224, 3)

In [8]:
# Sanity check: one label per image, so the length should match images.shape[0].
targets.shape

(8000,)

In [9]:
# Raw string labels taken from the file-name prefix.
targets

array(['cat', 'cat', 'dog', ..., 'cat', 'cat', 'cat'], dtype='<U3')

## Labels Binarization

In [10]:
# Start from the raw string targets; a later cell rebinds `labels` to the binarized matrix.
labels = targets

In [11]:
# Binarizer that is fitted on the labels in a later cell.
lb = LabelBinarizer()

In [12]:
# Fit on the raw string targets rather than on `labels` itself, so re-running this
# cell cannot binarize already-binarized values and overwrite lb.classes_.
labels = lb.fit_transform(targets)

In [13]:
# Sanity check: the recorded run shows a single column for the two classes.
labels.shape

(8000, 1)

In [14]:
# Class names found by the fitted binarizer; stored in the HDF5 file later in the notebook.
label_names = lb.classes_
label_names

array(['cat', 'dog'], dtype='<U3')

In [15]:
# Binarized labels; in the recorded run 0 lines up with 'cat' and 1 with 'dog'.
labels

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

# Dataset Writer

In [16]:
class DatasetWriter:
    """Buffered writer that stores data rows and integer labels in an HDF5 file.

    Rows passed to ``add`` are collected in memory and written to the file once
    ``buffer_size`` rows have accumulated. The instance can also be used as a
    context manager so the file is closed even if the block raises::

        with DatasetWriter(data_dim, labels_dim, path) as writer:
            writer.add(features, labels)
    """

    # constructor
    def __init__(self, data_dim, labels_dim, outputFile, dataKey='Features', buffer_size=1000):
        """Create ``outputFile`` and allocate its datasets.

        Args:
            data_dim: shape of the dataset created under ``dataKey``.
            labels_dim: shape of the ``'Labels'`` dataset.
            outputFile: path of the HDF5 file to create; it must not exist yet.
            dataKey: name of the dataset that receives the data rows.
            buffer_size: number of buffered rows that triggers a write.

        Raises:
            ValueError: if ``outputFile`` already exists.
        """
        # check to see if dataset file exists
        if os.path.exists(outputFile):
            raise ValueError('The supplied file name already exists, choose different name.')

        # open the HDF5 database for writing
        self.database = h5.File(outputFile, 'w')

        try:
            # create a data and labels container
            self.data = self.database.create_dataset(dataKey, data_dim, dtype='float')
            self.labels = self.database.create_dataset('Labels', labels_dim, dtype='int')
        except Exception:
            # Do not leak the handle, and remove the file this call just created;
            # otherwise a retry would be rejected by the existence check above.
            self.database.close()
            if os.path.exists(outputFile):
                os.remove(outputFile)
            raise

        # initialize buffer and store the buffer_size along with index
        self.buffer = {'data': [], 'labels': []}
        self.buffer_size = buffer_size
        self.index = 0

    # context manager entry
    def __enter__(self):
        """Return the writer itself for use in a ``with`` statement."""
        return self

    # context manager exit
    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file; buffered rows are written only if the block succeeded."""
        if exc_type is None:
            self.close()
        else:
            # The block failed: drop the buffer and just release the handle, so a
            # second error raised while flushing cannot hide the original one.
            self.database.close()
        return False

    # add data to a dataset
    def add(self, data, labels):
        """Append rows and their labels to the buffer, flushing when it is full."""
        self.buffer['data'].extend(data)
        self.buffer['labels'].extend(labels)

        # flush buffer to dataset if the buffer size is reached
        if len(self.buffer['data']) >= self.buffer_size:
            self.flush()

    # flush buffer to HDF5 dataset created
    def flush(self):
        """Write the buffered rows at the current index and reset the buffer."""
        # nothing buffered, nothing to write
        if not self.buffer['data']:
            return

        # end position of this write
        count = self.index + len(self.buffer['data'])

        # store data
        self.data[self.index:count] = self.buffer['data']
        self.labels[self.index:count] = self.buffer['labels']

        # update index
        self.index = count

        # refresh buffer
        self.buffer = {'data': [], 'labels': []}

    # deal with class labels
    def storeClassLabels(self, classLabels):
        """Store the class names in a variable-length string dataset called ``'label_names'``."""
        data_type = h5.special_dtype(vlen=str)

        # create labels container and store label names.
        label_names = self.database.create_dataset('label_names', (len(classLabels), ), dtype=data_type)
        label_names[:] = classLabels

    # close a database
    def close(self):
        """Write any remaining buffered rows and close the HDF5 file."""
        # store remaining data
        if len(self.buffer['data']) > 0:
            self.flush()

        # closing
        self.database.close()

# Feature Extraction

## Variables

In [17]:
# Shape of the feature dataset: one flattened row per image. 512 * 7 * 7 is the
# row width the extraction loop reshapes each model output to.
# NOTE(review): presumably the 7x7x512 output of the VGG16 conv base for 224x224 inputs — confirm.
data_dimension = (len(images), 512 * 7 * 7)
labels_dimension = labels.shape
batch_size = 32

# NOTE(review): absolute, machine-specific path. DatasetWriter raises ValueError when this
# file already exists, so re-running this cell after a completed run fails until it is removed.
feature_file = 'C:/Users/Tajr/Desktop/Data/RadonPlus/RadonTechnology/Dev/Deep Learning/Datasets/computervision/HDF5/cat_dog_8k.hdf5'


# ImageNet-weighted VGG16 without its top layers, used here as a feature extractor.
model = VGG16(weights='imagenet', include_top=False)

# Constructing the writer creates the HDF5 file on disk.
dataset = DatasetWriter(data_dimension, labels_dimension, feature_file, dataKey='Features', buffer_size=1000)

extraction_widgets = ['Extracting Feature: ', progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()]

## Store Class Labels

In [18]:
# Persist the class names in the HDF5 file's 'label_names' dataset.
dataset.storeClassLabels(label_names)

## Extract Features

In [19]:
# initialize progress bar
prog_extract = progressbar.ProgressBar(maxval=len(images), widgets=extraction_widgets).start()

# NOTE(review): `images` were scaled to [0, 1] when they were loaded. The Keras VGG16
# ImageNet weights are normally paired with keras.applications.vgg16.preprocess_input;
# confirm that feeding scaled pixels is intended.
try:
    # loop in batches
    for i in range(0, len(images), batch_size):
        # initialize batch images and labels
        batch_images = images[i:i + batch_size]
        batch_labels = labels[i:i + batch_size]

        # predict possible features (extract features)
        features = model.predict(batch_images, batch_size=batch_size)

        # flatten features to the row width the HDF5 dataset was created with
        features = features.reshape((features.shape[0], data_dimension[1]))

        # store data and labels in an HDF5 dataset
        dataset.add(features, batch_labels)

        # report the number of images processed so far
        prog_extract.update(min(i + batch_size, len(images)))
except BaseException:
    # Release the file handle without flushing so the original error is the one
    # reported. The partial file stays on disk and must be removed before a retry.
    dataset.database.close()
    raise
else:
    dataset.close()
    prog_extract.finish()

Extracting Feature: 100% |######################################| Time: 0:06:47
