### Divide the data into training, validation, and test sets

In [7]:
# import the necessary packages
import os
# location of the *original*, unsplit directory of images
ORIG_INPUT_DATASET = "malaria/cell_images"
# root folder under which the per-split copies of the images are written
BASE_PATH = "malaria"
# one output directory per split, all rooted at BASE_PATH
TRAIN_PATH = os.path.join(BASE_PATH, "training")
VAL_PATH = os.path.join(BASE_PATH, "validation")
TEST_PATH = os.path.join(BASE_PATH, "testing")
# fraction of all images that is used for training
TRAIN_SPLIT = 0.8
# fraction of the *training* images that is set aside for validation
VAL_SPLIT = 0.1

In [10]:
# import the necessary packages
#from pyimagesearch import config
from imutils import paths
import random
import shutil
import os
# grab the paths to all input images in the original input directory;
# the directory listing order is filesystem-dependent, so sort the paths
# first -- otherwise the seeded shuffle below would not give the same
# split on every machine
imagePaths = sorted(paths.list_images(ORIG_INPUT_DATASET))
# shuffle with a fixed seed so the split is reproducible
random.seed(42)
random.shuffle(imagePaths)

In [11]:
# the first TRAIN_SPLIT fraction of the shuffled paths is used for
# training; everything after the cut-off becomes the test set
splitIndex = int(len(imagePaths) * TRAIN_SPLIT)
trainPaths, testPaths = imagePaths[:splitIndex], imagePaths[splitIndex:]
# carve the validation set off the front of the training paths
valIndex = int(len(trainPaths) * VAL_SPLIT)
valPaths, trainPaths = trainPaths[:valIndex], trainPaths[valIndex:]

In [13]:
# one (split name, image paths, output directory) tuple per split
datasets = list(zip(
	("training", "validation", "testing"),
	(trainPaths, valPaths, testPaths),
	(TRAIN_PATH, VAL_PATH, TEST_PATH),
))

In [14]:
# copy every image into <split directory>/<class label>/<filename>
# NOTE: the loop variable is deliberately *not* named `imagePaths`; reusing
# that name would overwrite the full shuffled path list, and re-running the
# split cell afterwards would then split only the test paths
for (dType, splitPaths, baseOutput) in datasets:
	# show which data split we are creating
	print("[INFO] building '{}' split".format(dType))
	# if the base output directory does not exist, create it
	if not os.path.exists(baseOutput):
		print("[INFO] 'creating {}' directory".format(baseOutput))
		os.makedirs(baseOutput)
	# loop over the input image paths of this split
	for inputPath in splitPaths:
		# the filename is the last path component and the class label is
		# the name of the directory that contains the image; os.path is
		# used instead of splitting on os.path.sep so that paths with
		# mixed separators are still parsed correctly
		filename = os.path.basename(inputPath)
		label = os.path.basename(os.path.dirname(inputPath))
		# build the path to the label directory
		labelPath = os.path.join(baseOutput, label)
		# if the label output directory does not exist, create it
		if not os.path.exists(labelPath):
			print("[INFO] 'creating {}' directory".format(labelPath))
			os.makedirs(labelPath)
		# construct the path to the destination image and then copy
		# the image itself (copy2 also copies the file metadata)
		p = os.path.join(labelPath, filename)
		shutil.copy2(inputPath, p)

[INFO] building 'training' split
[INFO] 'creating malaria\training' directory
[INFO] 'creating malaria\training\Parasitized' directory
[INFO] 'creating malaria\training\Uninfected' directory
[INFO] building 'validation' split
[INFO] 'creating malaria\validation' directory
[INFO] 'creating malaria\validation\Parasitized' directory
[INFO] 'creating malaria\validation\Uninfected' directory
[INFO] building 'testing' split
[INFO] 'creating malaria\testing' directory
[INFO] 'creating malaria\testing\Parasitized' directory
[INFO] 'creating malaria\testing\Uninfected' directory


### Feature Extraction

In [20]:
from tensorflow.keras.applications import VGG16

# VGG16 with ImageNet weights and without its fully-connected top layers
conv_base = VGG16(include_top=False, weights="imagenet")
conv_base.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 block1_conv1 (Conv2D)       (None, None, None, 64)    1792      
                                                                 
 block1_conv2 (Conv2D)       (None, None, None, 64)    36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, None, None, 64)    0         
                                                                 
 block2_conv1 (Conv2D)       (None, None, None, 128)   73856     
                                                                 
 block2_conv2 (Conv2D)       (None, None, None, 128)   147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, None, None, 128)   0     

In [19]:
from keras.preprocessing.image import ImageDataGenerator
# generator that only rescales pixel values by 1/255
datagen = ImageDataGenerator(rescale=1.0 / 255)

In [64]:
# image geometry and batching used by every generator below
height, width, channels = 64, 64, 3
batch_size = 32
seed = 1234

# options that are identical for the training, test and validation flows
flow_options = dict(
    target_size=(height, width),
    batch_size=batch_size,
    seed=seed,
    class_mode="binary",
)

# training images are rescaled by 1/255 and randomly augmented
datagen = ImageDataGenerator(
    rescale=1. / 255,
    rotation_range=30,
    shear_range=0.2,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode="nearest",
)
train_data = datagen.flow_from_directory(directory=TRAIN_PATH, **flow_options)

# test and validation images are only rescaled, never augmented
test_datagen = ImageDataGenerator(rescale=1. / 255)
test_data = test_datagen.flow_from_directory(directory=TEST_PATH, **flow_options)

val_data = test_datagen.flow_from_directory(directory=VAL_PATH, **flow_options)

Found 19842 images belonging to 2 classes.
Found 5512 images belonging to 2 classes.
Found 2204 images belonging to 2 classes.


In [65]:
# ImageNet-pretrained VGG16 without its top layers; this is the model
# handed to extract_features
base_model2 = VGG16(include_top=False, weights='imagenet')
base_model2.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 block1_conv1 (Conv2D)       (None, None, None, 64)    1792      
                                                                 
 block1_conv2 (Conv2D)       (None, None, None, 64)    36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, None, None, 64)    0         
                                                                 
 block2_conv1 (Conv2D)       (None, None, None, 128)   73856     
                                                                 
 block2_conv2 (Conv2D)       (None, None, None, 128)   147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, None, None, 128)   0     

In [74]:
def extract_features(base_model, sample_count, datagen):
    """Run `base_model` over batches from `datagen` and collect the outputs.

    Parameters
    ----------
    base_model : object with a ``predict(batch)`` method
        Model used as a fixed feature extractor.
    sample_count : int
        Number of samples to collect; must be positive.
    datagen : iterable of ``(inputs_batch, labels_batch)`` pairs
        Batch generator. It must support ``len()`` (used only for the
        progress message). It may yield batches forever; iteration stops
        as soon as `sample_count` samples have been collected.

    Returns
    -------
    features : numpy.ndarray
        Array of shape ``(n, *feature_shape)`` where ``feature_shape`` is
        taken from the first prediction and ``n`` is `sample_count` (or
        fewer if the generator ran out of data first).
    labels : numpy.ndarray
        Array of shape ``(n,)`` with the label of each collected sample.

    Raises
    ------
    ValueError
        If `sample_count` is not positive or `datagen` yields no batches.
    """
    if sample_count <= 0:
        raise ValueError("sample_count must be a positive integer")

    start = time()
    total_batches = len(datagen)
    labels = np.zeros(shape=(sample_count,))
    # allocated lazily: the feature-map shape depends on the model and on
    # the input image size, so it is read from the first prediction instead
    # of being hard-coded
    features = None
    filled = 0  # number of samples written so far

    for batch_number, (inputs_batch, labels_batch) in enumerate(datagen, start=1):
        elapsed = time() - start
        print('\r',
              'Extracting features from batch', str(batch_number), '/', total_batches,
              '-- run time:', elapsed, 'seconds',
              end='')

        features_batch = np.asarray(base_model.predict(inputs_batch))
        if features is None:
            features = np.zeros(shape=(sample_count,) + features_batch.shape[1:])

        # write by the number of samples actually received, not by a fixed
        # batch_size stride: short batches then leave no gap of zero rows,
        # and a batch that would overshoot sample_count is truncated
        # instead of raising a shape-mismatch error
        take = min(len(features_batch), sample_count - filled)
        features[filled:filled + take] = features_batch[:take]
        labels[filled:filled + take] = np.asarray(labels_batch)[:take]
        filled += take

        # generators may loop forever, so stop once enough was collected
        if filled >= sample_count:
            break

    print("\n")
    if features is None:
        raise ValueError("datagen did not yield any batches")
    # if the generator ended early, drop the rows that were never filled
    # rather than returning all-zero features with label 0
    return features[:filled], labels[:filled]

In [78]:
from time import time
import numpy as np
# take the sample counts from the directory iterators themselves (the
# "Found N images" figures) instead of hard-coding them, so this cell stays
# correct if the contents of the split directories change
train_features, train_labels = extract_features(base_model2, train_data.samples, train_data)
test_features, test_labels = extract_features(base_model2, test_data.samples, test_data)

 Extracting features from batch 621 / 621 -- run time: 225.8938112258911 secondss



In [76]:
# use the iterator's own sample count rather than a hard-coded number
val_features, val_labels = extract_features(base_model2, val_data.samples, val_data)

 Extracting features from batch 69 / 69 -- run time: 21.61341094970703 secondss



### Save the features

In [103]:
# persist the extracted features (np.save appends the ".npy" extension)
np.save('train_features', train_features)
np.save('test_features', test_features)
np.save('val_features', val_features)
# persist the labels as well: they were collected in the same order as the
# features, so without them the saved features cannot be matched to their
# classes again in a fresh session
np.save('train_labels', train_labels)
np.save('test_labels', test_labels)
np.save('val_labels', val_labels)

### Load the features

In [104]:
# read the three feature arrays back from their .npy files
train_features, test_features, validation_features = map(
    np.load,
    ('train_features.npy', 'test_features.npy', 'val_features.npy'),
)

In [113]:
# flatten each feature map into one vector for the dense classifier; the
# row count is inferred (-1) instead of hard-coding the number of samples,
# while the fixed width still fails loudly if the feature size is not the
# 2 * 2 * 512 that the classifier's input layer expects
flat_dim = 2 * 2 * 512
train_features = np.reshape(train_features, (-1, flat_dim))
test_features = np.reshape(test_features, (-1, flat_dim))
val_features = np.reshape(validation_features, (-1, flat_dim))

### Replace the top classification layers of VGG16 with a new dense classifier

In [98]:
from keras import models
from keras import layers
from keras import optimizers
# dense binary classifier that takes the flattened 2*2*512 feature vectors
model = models.Sequential([
    layers.Dense(256, activation="relu", input_dim=(2 * 2 * 512)),
    layers.Dropout(0.5),
    layers.Dense(256, activation="relu"),
    layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 256)               524544    
                                                                 
 dropout_17 (Dropout)        (None, 256)               0         
                                                                 
 dense_25 (Dense)            (None, 256)               65792     
                                                                 
 dropout_18 (Dropout)        (None, 256)               0         
                                                                 
 dense_26 (Dense)            (None, 256)               65792     
                                                                 
 dense_27 (Dense)            (None, 1)                 257       
                                                                 
Total params: 656,385
Trainable params: 656,385
Non-t

In [114]:
from tensorflow.keras.optimizers import SGD
# SGD optimizer with binary cross-entropy loss, tracking accuracy
model.compile(
    optimizer=SGD(learning_rate=0.01),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# train on the extracted features and validate after every epoch; the
# object returned by fit() is kept in model_fit
model_fit = model.fit(
    x=train_features,
    y=train_labels,
    batch_size=4,
    epochs=25,
    validation_data=(val_features, val_labels),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [115]:
# loss and accuracy of the trained classifier on the test features
model.evaluate(x=test_features, y=test_labels)



[0.17743372917175293, 0.9399492144584656]

In [None]:
### The model accuracy is 93.99% on the test dataset.