In [1]:
# Import Data Science Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
# from sklearn.model_selection import train_test_split
import shutil
import zipfile

# Tensorflow Libraries
from tensorflow import keras
from tensorflow.keras import layers #,models
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint # Callback 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import Model
from tensorflow.keras.layers.experimental import preprocessing
import tensorflow_hub as hub
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D, BatchNormalization
from tensorflow.keras.preprocessing import image
from skimage.segmentation import mark_boundaries
from sklearn.utils import compute_class_weight 
from keras.applications import VGG16
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model

# System libraries
from pathlib import Path
import os.path
import random

# Visualization Libraries
import matplotlib.cm as cm
import cv2
import seaborn as sns

sns.set_style('darkgrid')

# Metrics
from sklearn.metrics import classification_report, confusion_matrix
import itertools

import lime
from lime import lime_image
from lime import submodular_pick

# Oversampling SMOTE
from imblearn.over_sampling import SMOTE

# Other Models
from joblib import dump, load
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Pickle
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
PATH = os.getcwd()

In [36]:
# Labeled
train_labeled_dir = os.path.join(PATH, 'train')
validation_labeled_dir = os.path.join(PATH, 'valid')
test_labeled_dir = os.path.join(PATH, 'test')

train_hostile_dir = os.path.join(train_labeled_dir, 'hostile_images')
train_nonhostile_dir = os.path.join(train_labeled_dir, 'non_hostile_images')

validation_hostile_dir = os.path.join(validation_labeled_dir, 'hostile_images')
validation_nonhostile_dir = os.path.join(validation_labeled_dir, 'non_hostile_images')

test_hostile_dir = os.path.join(test_labeled_dir, 'hostile_images')
test_nonhostile_dir = os.path.join(test_labeled_dir, 'non_hostile_images')

# Unlabeled
train_dir = os.path.join(PATH, 'train_predict')
valid_dir = os.path.join(PATH, 'valid_predict')
test_dir = os.path.join(PATH, 'test_predict')

In [14]:
num_hostile_tr = len(os.listdir(train_hostile_dir))
num_nonhostile_tr = len(os.listdir(train_nonhostile_dir))
num_hostile_val = len(os.listdir(validation_hostile_dir))
num_nonhostile_val = len(os.listdir(validation_nonhostile_dir))
num_nonhostile_test = len(os.listdir(test_nonhostile_dir))
num_hostile_test = len(os.listdir(test_hostile_dir))
total_train = num_hostile_tr + num_nonhostile_tr
total_val = num_hostile_val + num_nonhostile_val
total_test = num_hostile_test + num_nonhostile_test
print('total training hostile images:', num_hostile_tr)
print('total training non hostile images:', num_nonhostile_tr)
print('total validation hostile images:', num_hostile_val)
print('total validation non hostile images:', num_nonhostile_val)
print('total test hostile images:', num_hostile_test)
print('total test non hostile images:', num_nonhostile_test)
print("--")
print("Total training images:", total_train)
print("Total validation images:", total_val)
print("Total test images:", total_test)

total training hostile images: 400
total training non hostile images: 4501
total validation hostile images: 100
total validation non hostile images: 1126
total test hostile images: 56
total test non hostile images: 626
--
Total training images: 4901
Total validation images: 1226
Total test images: 682


In [37]:
unlabeled_dir = {
    train_dir: [train_hostile_dir, train_nonhostile_dir],
    valid_dir: [validation_hostile_dir, validation_nonhostile_dir],
    test_dir: [test_hostile_dir, test_nonhostile_dir]
}

for unlabeled, labeled_lst in unlabeled_dir.items():
    if os.path.exists(unlabeled):
        shutil.rmtree(unlabeled)
    sub_dir = os.path.join(unlabeled, 'predict')
    os.makedirs(sub_dir, exist_ok=True)
    for labeled in labeled_lst:
        files = os.listdir(labeled)
        dir_lst = labeled.split('\\')
        for index, fileName in enumerate(files):
            shutil.copy(os.path.join(labeled, fileName), os.path.join(sub_dir, f'{dir_lst[-1]}_{fileName}'))

In [5]:
batch_size = 128
epochs = 15
IMG_HEIGHT = 150
IMG_WIDTH = 150

Creating Image Data Generator

In [15]:
def get_image_data_generator():
    return ImageDataGenerator(rescale=1./255,rotation_range = 30, zoom_range = 0.20, 
                            fill_mode = "nearest", shear_range = 0.20, horizontal_flip = True, 
                            width_shift_range = 0.1, height_shift_range = 0.1)

In [16]:
# Generator for training and validation data
train_labeled_image_generator = get_image_data_generator()
validation_labeled_image_generator = get_image_data_generator()
test_labeled_image_generator = get_image_data_generator()
train_labeled_data_gen = train_labeled_image_generator.flow_from_directory(batch_size=batch_size, directory=train_labeled_dir, shuffle=True, target_size=(IMG_HEIGHT, IMG_WIDTH), class_mode='binary')
val_labeled_data_gen = validation_labeled_image_generator.flow_from_directory(batch_size=batch_size,directory=validation_labeled_dir,target_size=(IMG_HEIGHT, IMG_WIDTH),class_mode='binary')
test_labeled_data_gen = test_labeled_image_generator.flow_from_directory(batch_size=batch_size,directory=test_labeled_dir,target_size=(IMG_HEIGHT, IMG_WIDTH),class_mode='binary',shuffle=False)

Found 4901 images belonging to 2 classes.
Found 1226 images belonging to 2 classes.
Found 682 images belonging to 2 classes.


In [38]:
train_image_generator = get_image_data_generator()
validation_image_generator = get_image_data_generator()
test_image_generator = get_image_data_generator()

def get_image_generater(image_data_generator, directory):
    return image_data_generator.flow_from_directory(batch_size=batch_size,directory=directory,target_size=(IMG_HEIGHT, IMG_WIDTH), class_mode=None,shuffle=False)

train_data_gen = get_image_generater(train_image_generator, train_dir)
val_data_gen = get_image_generater(validation_image_generator, valid_dir)
test_data_gen = get_image_generater(test_image_generator, test_dir)
train_data_gen.reset()
val_data_gen.reset()
test_data_gen.reset()

Found 4901 images belonging to 1 classes.
Found 1226 images belonging to 1 classes.
Found 682 images belonging to 1 classes.


In [17]:
class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(train_labeled_data_gen.classes),
                                        y = train_labeled_data_gen.classes                                                    
                                    )
class_weights = dict(zip(np.unique(train_labeled_data_gen.classes), class_weights))
class_weights

{0: 6.12625, 1: 0.5444345700955343}

In [18]:
checkpoint_path = "classification_model_checkpoint"
checkpoint_callback = ModelCheckpoint(checkpoint_path,
                                      save_weights_only=True,
                                      monitor="val_accuracy",
                                      save_best_only=True)

# Setup EarlyStopping callback to stop training if model's val_loss doesn't improve for 3 epochs
early_stopping = EarlyStopping(monitor = "val_loss", # watch the val loss metric
                               patience = 5,
                               restore_best_weights = True) # if val loss decreases for 3 epochs in a row, stop training

Custom CNN Model Training

In [None]:
# import kerastuner as kt

# def build_model(hp):
#     # Initialize sequential API and start building model.
#     model = keras.Sequential()
#     model.add(keras.layers.Flatten(input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)))
    
#     # Tune the number of hidden layers and units in each.
#     # Number of hidden layers: 1 - 5
#     # Number of Units: 32 - 512 with stepsize of 32
#     for i in range(1, hp.Int("num_layers", 2, 5)):
#         model.add(
#             keras.layers.Dense(
#                 units=hp.Int("units_" + str(i), min_value=32, max_value=512, step=32),
#                 activation="relu")
#             )
        
#         # Tune dropout layer with values from 0 - 0.3 with stepsize of 0.1.
#         model.add(keras.layers.Dropout(hp.Float("dropout_" + str(i), 0, 0.3, step=0.1)))
    
#     # Add output layer.
#     model.add(keras.layers.Dense(units=2, activation="softmax"))
    
#     # Tune learning rate for Adam optimizer with values from 0.01, 0.001, or 0.0001
#     hp_learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])
    
#     # Define optimizer, loss, and metrics
#     model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
#                   loss=keras.losses.SparseCategoricalCrossentropy(),
#                   metrics=["accuracy"])
    
#     return model

# tuner = kt.Hyperband(build_model,
#                      objective="val_accuracy",
#                      max_epochs=20,
#                      factor=3,
#                      hyperband_iterations=10,
#                      directory="kt_dir",
#                      project_name="kt_hyperband",)

# tuner.search_space_summary()
# stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
# tuner.search(train_labeled_data_gen, epochs=epochs, callbacks=[stop_early], verbose=2, validation_data=val_labeled_data_gen)
# best_hps=tuner.get_best_hyperparameters()[0]

# # Build model
# model_custom = tuner.hypermodel.build(best_hps)

# # Train the hypertuned model
# model_custom.fit(train_labeled_data_gen, epochs=epochs, callbacks=[stop_early], verbose=2, validation_data=val_labeled_data_gen)
# model_custom.save('model_custom.h5')

In [9]:
def get_model3():
    model = Sequential()
    model.add(Conv2D(32, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)))
    model.add(MaxPooling2D()) 
    model.add(Dropout(0.25))
    model.add(Conv2D(64, 3, padding='same', activation='relu')) 
    model.add(MaxPooling2D())
    model.add(Dropout(0.25))
    model.add(Conv2D(128, 3, padding='same', activation='relu')) 
    model.add(Dropout(0.4))

    # Flattening our dimensions
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    # Output layer
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam',loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),metrics=['accuracy'])
    model.summary()
    return model

In [20]:
model3 = get_model3()
history3 = model3.fit(
    train_labeled_data_gen,
    epochs=epochs,
    validation_data=val_labeled_data_gen,
    class_weight=class_weights,
    callbacks=[
        early_stopping,
        checkpoint_callback
    ]
    )

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 150, 150, 32)      896       
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 75, 75, 32)       0         
 2D)                                                             
                                                                 
 dropout_4 (Dropout)         (None, 75, 75, 32)        0         
                                                                 
 conv2d_4 (Conv2D)           (None, 75, 75, 64)        18496     
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 37, 37, 64)       0         
 2D)                                                             
                                                                 
 dropout_5 (Dropout)         (None, 37, 37, 64)       

In [34]:
results = model3.evaluate(test_labeled_data_gen, verbose=0)

print("    Test Loss: {:.5f}".format(results[0]))
print("Test Accuracy: {:.2f}%".format(results[1] * 100))

    Test Loss: 0.28830
Test Accuracy: 89.15%


In [103]:
model3.save('model_custom.h5')

In [96]:
def get_actual_labels(filename):
    actual_labels = filename.split('\\')[-1].split('_')[0]
    return 0 if actual_labels == 'hostile' else 1

VGG Model Training

In [39]:
def get_model2():
    model = Sequential()
    model.add(Flatten(input_shape=(4,4,512)))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(2, activation='softmax'))

    # compile the model
    model.compile(optimizer='adam', metrics=['accuracy'], loss='categorical_crossentropy')

    model.summary()
    return model

pretrained_model = VGG16(include_top=False, weights='imagenet')
pretrained_model.summary()
vgg_features_train = pretrained_model.predict_generator(train_data_gen)
vgg_features_val = pretrained_model.predict_generator(val_data_gen)
vgg_features_test = pretrained_model.predict_generator(test_data_gen)
train_target = to_categorical(list(map(get_actual_labels, train_data_gen.filenames)))
val_target = to_categorical(list(map(get_actual_labels, val_data_gen.filenames)))


Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None, 3)]   0         
                                                                 
 block1_conv1 (Conv2D)       (None, None, None, 64)    1792      
                                                                 
 block1_conv2 (Conv2D)       (None, None, None, 64)    36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, None, None, 64)    0         
                                                                 
 block2_conv1 (Conv2D)       (None, None, None, 128)   73856     
                                                                 
 block2_conv2 (Conv2D)       (None, None, None, 128)   147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, None, None, 128)   0     

In [40]:
model2 = get_model2()
history2 = model2.fit(
    vgg_features_train,
    train_target,
    epochs=epochs,
    validation_data=(vgg_features_val, val_target),
    class_weight=class_weights,
    callbacks=[
        early_stopping,
        checkpoint_callback
    ]
)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_3 (Flatten)         (None, 8192)              0         
                                                                 
 dense_6 (Dense)             (None, 100)               819300    
                                                                 
 dropout_9 (Dropout)         (None, 100)               0         
                                                                 
 batch_normalization_1 (Batc  (None, 100)              400       
 hNormalization)                                                 
                                                                 
 dense_7 (Dense)             (None, 2)                 202       
                                                                 
Total params: 819,902
Trainable params: 819,702
Non-trainable params: 200
______________________________________________

In [102]:
model2.save('model_vgg.h5')

In [49]:
results = model2.evaluate(x=vgg_features_test,y=to_categorical(list(map(get_actual_labels, test_data_gen.filenames))), verbose=0)

print("    Test Loss: {:.5f}".format(results[0]))
print("Test Accuracy: {:.2f}%".format(results[1] * 100))

    Test Loss: 0.02139
Test Accuracy: 99.56%


Other Models

In [81]:
# Training Data
total_images = train_labeled_data_gen.n  
steps = total_images//batch_size 

x_train , y_train = [] , []
for i in range(steps):
    a , b = train_labeled_data_gen.next()
    x_train.extend(a) 
    y_train.extend(b)

x_train = np.array(x_train)
y_train = np.array(y_train)
nsamples, nx, ny, nrgb = x_train.shape
x_train2 = x_train.reshape((nsamples,nx*ny*nrgb))

Decision Tree

In [80]:
dtc=DecisionTreeClassifier()
dtc.fit(x_train2,y_train)

In [84]:
dump(dtc, './other_models/DT.joblib')

['DT.joblib']

Random Forest

In [86]:
model=RandomForestClassifier()
model.fit(x_train2,y_train)

In [90]:
dump(model, './other_models/RF.joblib')

['RF.joblib']

KNN Classifier

In [88]:
knn=KNeighborsClassifier(n_neighbors=2)
knn.fit(x_train2,y_train)

In [92]:
dump(knn, './other_models/KNN.joblib')

['KNN.joblib']

Naive Bayes

In [89]:
nb=GaussianNB()
nb.fit(x_train2,y_train)

In [94]:
dump(nb, './other_models/NB.joblib')

['NB.joblib']

Zipping the other models

In [9]:
directory_to_zip = './other_models'
zip_file_path = 'other_models.zip'
with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
    print(directory_to_zip)
    for root, directories, files in os.walk(directory_to_zip):
        for file in files:
            print(1)
            zip_file.write(os.path.join(root, file))

./other_models
1
1
1
1
