In [1]:
import os
import time
import shutil
import math

import pandas as pd
import numpy as np

import imageio

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from scipy import ndimage
import scipy.misc

from sklearn.metrics import confusion_matrix

import seaborn as sns

import itertools

import matplotlib.pyplot as plt

import tensorflow.keras
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Activation, Flatten, Dense, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator , array_to_img, img_to_array, load_img
from tensorflow.keras.layers import BatchNormalization
# from tensorflow.keras.convolutional import *
%matplotlib inline

Constants

In [12]:
# --- Where the source data lives ---------------------------------------------
SELF_PATH = os.getcwd()
PATH_TO_FILE_STREAM = 'G:\\Shared drives\\P-Sick'
PATH_TO_IMAGES = os.path.join(
    PATH_TO_FILE_STREAM,
    'small\\Florence\\20180917a_jpgs\\jpgs',
)
PATH_TO_TAG_CSV = 'G:\\Shared drives\\P-Sick\\tag_csv\\tagging_data.csv'

# --- Split fractions ---------------------------------------------------------
# The validation fraction is expressed relative to the training portion that
# remains once the test fraction has been removed.
PERCENT_TEST = 0.2
PERCENT_TRAINING = 1 - PERCENT_TEST
PERCENT_VALIDATION = 0.2 / PERCENT_TRAINING

# --- Image size handed to the network ----------------------------------------
RESIZE_HEIGHT = RESIZE_WIDTH = 224

# --- Output folders, one per data set ----------------------------------------
TRAINING_IMAGE_FOLDER = "..\\_training_images"
TESTING_IMAGE_FOLDER = "..\\_testing_images"
VALIDATION_IMAGE_FOLDER = "..\\_validation_images"

# --- Number of batches each data set is divided into --------------------------
NUM_TRAINING_BATCHES = 20
NUM_TESTING_BATCHES = 20
NUM_VALIDATION_BATCHES = 10

# --- Impact class ids, as integers and as strings -----------------------------
IMPACT_CLASSES_INT = list(range(5))
IMPACT_CLASSES = [str(class_id) for class_id in IMPACT_CLASSES_INT]

This is the list of impact class IDs:
```
NoneId:0  
SwashId:1  
CollisionId:2  
OverwashId:3  
InundationId:4
```

# Getting the image data

1. We need to load the data from the csv
2. Split the images into training, validation, and test sets, and then place them in separate folders.

First, let's load the CSV that contains the tags of all the completely tagged images.

In [13]:
# Read every tag, then keep only the image file name and its impact class.
df_image_tags = pd.read_csv(PATH_TO_TAG_CSV)
df_impact_images = df_image_tags.loc[:, ['image_id', 'impact']]

# Creating the training,testing and validation sets

In [14]:
# Hold out the test set first, then carve the validation set out of what is
# left of the training data. Fixed seeds keep the split reproducible.
df_training_images, df_testing_images = train_test_split(
    df_impact_images,
    test_size=PERCENT_TEST,
    random_state=1123567,
)
df_training_images, df_validation_images = train_test_split(
    df_training_images,
    test_size=PERCENT_VALIDATION,
    random_state=67123346,
)

# Size of each data set.
NUM_TRAINING_IMAGES = len(df_training_images)
NUM_TESTING_IMAGES = len(df_testing_images)
NUM_VALIDATION_IMAGES = len(df_validation_images)

# Images per batch when each set is divided into its configured number of batches.
SIZE_TRAINING_BATCH = NUM_TRAINING_IMAGES // NUM_TRAINING_BATCHES
SIZE_TESTING_BATCH = NUM_TESTING_IMAGES // NUM_TESTING_BATCHES
SIZE_VALIDATION_BATCH = NUM_VALIDATION_IMAGES // NUM_VALIDATION_BATCHES

print('Number of images', NUM_TRAINING_IMAGES, NUM_TESTING_IMAGES, NUM_VALIDATION_IMAGES)
print('Batch sizes', SIZE_TRAINING_BATCH, SIZE_TESTING_BATCH, SIZE_VALIDATION_BATCH)

Number of images 148 50 50
Batch sizes 7 2 5


# Get the images into the folders

In [15]:
# Generator used to write augmented copies: random horizontal and vertical flips.
gen = ImageDataGenerator(
    vertical_flip=True,
    horizontal_flip=True,
)

Before we copy the images, remove all files within these folders

In [16]:
def remove_folders(folders):
    """Delete each directory in ``folders`` together with everything inside it.

    Removal is best-effort: a folder that does not exist is skipped silently.
    Any other OS-level failure (for example a locked or read-only file) is
    reported with a printed message instead of being hidden, so a folder that
    could not be cleaned does not go unnoticed.

    Parameters
    ----------
    folders : iterable of str
        Paths of the directories to remove.
    """
    for folder in folders:
        try:
            shutil.rmtree(folder)
        except FileNotFoundError:
            # Nothing to clean up for this folder.
            continue
        except OSError as error:
            print(f'Could not remove {folder}: {error}')

In [24]:
# Clear out the three image folders before they are filled again.
remove_folders([
    TRAINING_IMAGE_FOLDER,
    TESTING_IMAGE_FOLDER,
    VALIDATION_IMAGE_FOLDER,
])

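Create the training, test, and validation folders, and in each of them create one subfolder per impact type, named after the impact ID. (Impact IDs run from 0 to 4, but the code below only creates folders 1 to 4, since images tagged with impact 0 are not copied.)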

In [18]:
def make_folders(folders, classes=('1', '2', '3', '4')):
    """Create each data-set folder and one subfolder per impact class inside it.

    Folders that already exist are left as they are, so the function can be
    called repeatedly. Each class subfolder is created independently: an
    existing subfolder no longer prevents the remaining ones from being made.
    Genuine failures (e.g. missing permissions) raise ``OSError`` instead of
    being swallowed.

    Parameters
    ----------
    folders : iterable of str
        Paths of the data-set folders to create.
    classes : iterable, optional
        Names of the class subfolders to create in every folder. Defaults to
        the impact classes '1' to '4'.
    """
    for folder in folders:
        # Also covers the case where ``classes`` is empty.
        os.makedirs(folder, exist_ok=True)
        for class_name in classes:
            os.makedirs(os.path.join(folder, f'{class_name}'), exist_ok=True)

In [25]:
# Build the folder layout for all three data sets.
make_folders([
    TRAINING_IMAGE_FOLDER,
    TESTING_IMAGE_FOLDER,
    VALIDATION_IMAGE_FOLDER,
])

Copy our training, test, and validation images into their respective folders, keeping the file metadata by using `copy2`.

In [32]:
def copy_images(dict_of_data_sets, max_augmented_images=8):
    """Copy tagged images into their class folders and write augmented variants.

    For every row whose impact is not 0, the image is copied from
    ``PATH_TO_IMAGES`` into ``<folder_path>/<impact>/`` (``copy2`` keeps the
    file metadata). The copy is then loaded and passed through the module-level
    generator ``gen``, which saves augmented versions into the same folder.

    Parameters
    ----------
    dict_of_data_sets : iterable of dict
        One dict per data set with the keys ``'folder_path'`` (destination
        folder) and ``'df'`` (DataFrame with ``image_id`` and ``impact``
        columns).
    max_augmented_images : int, optional
        Number of augmented images written per source image. Defaults to 8.
    """
    for data_set in dict_of_data_sets:
        for _, row in data_set['df'].iterrows():
            impact = str(row['impact'])
            if impact == '0':
                # Images tagged with impact 0 are not copied.
                continue

            impact_path = os.path.join(data_set['folder_path'], impact)
            full_new_path = os.path.join(impact_path, row['image_id'])

            shutil.copy2(os.path.join(PATH_TO_IMAGES, row['image_id']), full_new_path)

            img = load_img(full_new_path)

            x = img_to_array(img)  # image as a NumPy array
            x = x.reshape((1,) + x.shape)  # add a leading batch axis of size 1

            augmented_flow = gen.flow(x, batch_size=1,
                                      save_to_dir=impact_path, save_prefix='augmented', save_format='jpeg')
            # The flow never ends on its own, so draw exactly the requested
            # number of batches; each drawn batch is saved to save_to_dir.
            for _ in range(max_augmented_images):
                next(augmented_flow)

    print('done copying')

In [33]:
# Pair every destination folder with the DataFrame that lists its images.
copy_images([
    {'folder_path': folder_path, 'df': df_images}
    for folder_path, df_images in (
        (TRAINING_IMAGE_FOLDER, df_training_images),
        (TESTING_IMAGE_FOLDER, df_testing_images),
        (VALIDATION_IMAGE_FOLDER, df_validation_images),
    )
])

done copying


In [22]:
# Show only the first rows instead of dumping the whole frame.
df_testing_images.head(10)

Unnamed: 0,image_id,impact
10,C26051403.jpg,0
229,C26047788.jpg,2
208,P26054595.jpg,3
245,P26051780.jpg,0
85,P26057336.jpg,2
60,C26049720.jpg,0
130,P26049162.jpg,0
101,P26060200.jpg,3
121,C26052087.jpg,0
156,P26058106.jpg,3


# Preprocessing the images

1. Create batches of NumPy arrays for Keras by pointing it to where the images of each set are stored. Since the images in each set are in folders named after their impact ID, Keras is able to assign a class to each of them.

I also tried to find out how to determine a good batch size, and I found [this](https://stats.stackexchange.com/questions/164876/tradeoff-batch-size-vs-number-of-iterations-to-train-a-neural-network), which, while not the most scholarly of places, is good enough for me.

In [0]:
# Training and validation batches keep the default (shuffled) order.
training_batch = ImageDataGenerator().flow_from_directory(
    TRAINING_IMAGE_FOLDER,
    target_size=(RESIZE_HEIGHT, RESIZE_WIDTH),
    classes=IMPACT_CLASSES,
    batch_size=SIZE_TRAINING_BATCH,
)
# The test batches must NOT be shuffled: predictions are later compared with
# testing_batch.labels, which is in file order, so both must use the same order.
testing_batch = ImageDataGenerator().flow_from_directory(
    TESTING_IMAGE_FOLDER,
    target_size=(RESIZE_HEIGHT, RESIZE_WIDTH),
    classes=IMPACT_CLASSES,
    batch_size=SIZE_TESTING_BATCH,
    shuffle=False,
)
validation_batch = ImageDataGenerator().flow_from_directory(
    VALIDATION_IMAGE_FOLDER,
    target_size=(RESIZE_HEIGHT, RESIZE_WIDTH),
    classes=IMPACT_CLASSES,
    batch_size=SIZE_VALIDATION_BATCH,
)

The `plots` function below simply displays some of the images in a batch along with their classes.

In [0]:
def plots(ims, figsize=(12,6), rows=1, interp=False, titles=None):
    """Show the images in ``ims`` on a grid with ``rows`` rows.

    Parameters
    ----------
    ims : sequence of images
        Images to display. When the first element is a NumPy array, the whole
        batch is converted to ``uint8``; if its last axis does not have size 3
        the axes are reordered with ``transpose((0, 2, 3, 1))``.
    figsize : tuple, optional
        Figure size passed to ``plt.figure``.
    rows : int, optional
        Number of rows in the grid; the column count is the smallest one that
        fits every image.
    interp : bool, optional
        When true, ``interpolation=None`` is passed to ``imshow``; otherwise
        ``'none'`` is passed.
    titles : sequence, optional
        One title per image, shown above it.
    """
    if isinstance(ims[0], np.ndarray):
        ims = np.array(ims).astype(np.uint8)
        if ims.shape[-1] != 3:
            ims = ims.transpose((0, 2, 3, 1))

    f = plt.figure(figsize=figsize)
    n_images = len(ims)
    # Ceiling division: enough columns for every image and no spare column
    # when the images fill the rows exactly.
    cols = (n_images + rows - 1) // rows

    for i in range(n_images):
        sp = f.add_subplot(rows, cols, i + 1)
        sp.axis('Off')
        if titles is not None:
            sp.set_title(titles[i], fontsize=16)
        sp.imshow(ims[i], interpolation=None if interp else 'none')

In [0]:
# Draw one training batch and display it with its labels as titles.
imgs, labels = next(training_batch)
plots(imgs, figsize=(24, 6), rows=1, titles=labels)

# Create Model

In [0]:
# Instantiate VGG16 with its default arguments and list its layers.
model_vgg16 = tensorflow.keras.applications.VGG16()
model_vgg16.summary()

In [0]:
# Re-use every VGG16 layer except the last one in a new Sequential model.
model = Sequential(model_vgg16.layers[:-1])

In [0]:
# Freeze the layers copied so far so that training leaves their weights alone.
for copied_layer in model.layers:
    copied_layer.trainable = False

In [0]:
# New output layer: one softmax unit per impact class.
model.add(
    Dense(units=len(IMPACT_CLASSES), activation='softmax')
)
model.summary()

In [0]:
# `learning_rate` replaces the deprecated `lr` argument of the optimizer.
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train and validate Model

In [0]:
STEPS_PER_EPOCH = 5
VALIDATION_STEPS = 5
NUM_EPOCH = 5

# `fit` accepts the batch generators directly; `fit_generator` is deprecated.
model.fit(
    training_batch,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=validation_batch,
    validation_steps=VALIDATION_STEPS,
    epochs=NUM_EPOCH,
    verbose=2,
)

# Predicting on test set

In [0]:
# Class probabilities for every test image; show the first three rows.
predictions = model.predict(x=testing_batch, verbose=2)
predictions[:3]

In [0]:
# `predict_classes` is no longer available in recent TensorFlow releases.
# Take the most probable class from the probabilities computed above, which
# also avoids a second pass over the test set.
rounded_predictions = np.argmax(predictions, axis=-1)
rounded_predictions[:3]

In [0]:
# Rows are the true classes, columns the predicted ones.
# NOTE(review): this assumes the predictions are in the same order as
# testing_batch.labels - confirm the test generator does not shuffle.
cm = confusion_matrix(
    y_true=testing_batch.labels,
    y_pred=rounded_predictions,
    labels=IMPACT_CLASSES_INT,
)

In [0]:
fig, ax = plt.subplots()
# annot=True writes the count into every cell.
sns.heatmap(
    cm,
    annot=True,
    ax=ax,
    cmap=sns.diverging_palette(240, 10, n=25),
);

# labels, title and ticks
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
);
ax.xaxis.set_ticklabels(IMPACT_CLASSES_INT);
ax.yaxis.set_ticklabels(IMPACT_CLASSES_INT);