# Assignment_notebook CNN

Import modules

In [12]:
# System tools
import os
import sys
import argparse
sys.path.append(os.getcwd())

# Data tools
import numpy as np
from tqdm import tqdm
import pandas as pd
from random import sample
from itertools import chain

# tf tools
import tensorflow as tf

# image processing
from tensorflow.keras.preprocessing.image import (load_img,
                                                  img_to_array,
                                                  ImageDataGenerator)
# VGG16 model
from tensorflow.keras.applications.vgg16 import (preprocess_input,
                                                 decode_predictions,
                                                 VGG16)
# layers
from tensorflow.keras.layers import (Flatten, 
                                     Dense, 
                                     Dropout, 
                                     BatchNormalization)
# generic model object
from tensorflow.keras.models import Model

# optimizers
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.optimizers import SGD

#scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import classification_report

__Loading data__ 

In [2]:
def load_data(nr_files):
    """Load the first ``nr_files`` labels and image arrays and split them 80/20.

    Labels come from ``../in/processed_data/y_data.npy``; the matching file
    names come from the ``files`` column of
    ``../in/processed_data/file_list.csv``, so X is built in the same order
    as y. Each listed file is read with ``np.load`` from ``../in/np_arrays``.

    Parameters
    ----------
    nr_files : int
        Number of leading labels / files to use.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from ``train_test_split`` with
        ``random_state=42`` and ``test_size=0.2``.
    """
    processed_dir = os.path.join("..", "in", "processed_data")
    arrays_dir = os.path.join("..", "in", "np_arrays")

    # Labels: keep only the leading nr_files entries
    y = np.load(os.path.join(processed_dir, "y_data.npy"))[:nr_files]

    # File names, cut to the same length so they line up with y
    file_list = pd.read_csv(os.path.join(processed_dir, "file_list.csv"))
    y_filenames = file_list["files"][:nr_files]

    # Read every array in file-list order (tqdm shows a progress bar)
    X = np.array([np.load(os.path.join(arrays_dir, name))
                  for name in tqdm(y_filenames)])
    y = np.array(y)

    # Fixed seed keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42,
                                                        test_size=0.2)

    return X_train, X_test, y_train, y_test

In [91]:
def load_data_nn(sample = None, size = 210, random_state = None):
    """Load labels and image arrays, optionally sub-sample per class, and split 80/20.

    Labels are read from ``../in/processed_data/y_data.npy`` and file names
    from the ``files`` column of ``../in/processed_data/file_list_npy.csv``.
    Row ``i`` of the file list is paired with label ``i``. Every selected
    file is loaded with ``np.load`` from ``../in/np_arrays``.

    Parameters
    ----------
    sample : int or None
        If truthy, draw this many examples from every class without
        replacement; otherwise use all data. (Inside this function the name
        shadows ``random.sample`` imported at the top of the notebook; the
        name is kept so existing calls keep working.)
    size : int
        Currently unused by this function; kept so existing calls stay valid.
    random_state : int or None
        Seed for the per-class sampling. ``None`` (default) gives a fresh
        random draw on every call.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, y_relevant)`` where
        ``y_relevant`` holds the labels of all selected examples before the
        split (``random_state=42``, ``test_size=0.2``).

    Raises
    ------
    ValueError
        If the number of labels differs from the number of listed files, if
        the labels are not one value per row, or if ``sample`` is larger
        than the smallest class.
    """
    # Print info
    print("[INFO] Loading data...")

    processed_dir = os.path.join("..", "in", "processed_data")

    # > Load y data
    y = np.load(os.path.join(processed_dir, "y_data.npy"))

    # > Load file_list to be certain that X data will be in the same order as y
    file_list = pd.read_csv(os.path.join(processed_dir, "file_list_npy.csv"))
    # Build the list once instead of once per sampled index
    all_files = file_list["files"].tolist()

    # Labels and files are paired by position, so fail early (before the
    # slow loading loop) if they cannot line up
    if len(all_files) != len(y):
        raise ValueError(
            f"y_data.npy has {len(y)} labels but the file list has "
            f"{len(all_files)} entries")

    # Choose to sample or not
    if sample:
        labels = np.asarray(y).ravel()
        if labels.shape[0] != len(y):
            raise ValueError("expected exactly one label per row in y_data.npy")
        rng = np.random.default_rng(random_state)
        # Draw `sample` row positions per class, classes in sorted order
        selected = np.concatenate([
            rng.choice(np.flatnonzero(labels == label),
                       size=sample, replace=False)
            for label in np.unique(labels)
        ])
        # Use the positions to pick matching labels and file names
        y_relevant = y[selected]
        y_filenames = [all_files[i] for i in selected]
    else:
        # Use every file and every label
        y_filenames = all_files
        y_relevant = y

    # Iterate over files to load them as arrays (tqdm shows a progress bar)
    X = [np.load(os.path.join("..", "in", "np_arrays", file))
         for file in tqdm(y_filenames)]

    # Making sure that both X and y are numpy arrays
    X = np.array(X)
    y_relevant = np.array(y_relevant)

    # Splitting data
    X_train, X_test, y_train, y_test = train_test_split(X, y_relevant,
                                                        random_state=42,
                                                        test_size=0.2)

    return X_train, X_test, y_train, y_test, y_relevant

In [94]:
X_train, X_test, y_train, y_test, y_relevant = load_data_nn(100)

[INFO] Loading data...


100%|██████████| 2800/2800 [02:04<00:00, 22.48it/s]


In [85]:
X_train.shape

(224, 210, 210, 3)

In [90]:
# Number of distinct classes among the selected labels.
# (`y_df` only exists inside load_data_nn, so use the returned `y_relevant`.)
len(np.unique(y_relevant))

28

In [58]:
y_train.shape

(224, 1)

In [59]:
len(y_test) + len(y_train)

280

In [60]:
y_train[:10]

array([[ 8],
       [13],
       [18],
       [ 0],
       [12],
       [23],
       [19],
       [17],
       [ 1],
       [ 1]])

In [95]:
# Original note: lowest nr is 276 (presumably the smallest class size in the
# full data set — TODO confirm).
# Print, for each of the 28 class ids, how many test labels equal that id.
for class_id in range(28):
    is_class = (y_test == class_id)
    print(np.count_nonzero(is_class))

16
21
23
16
30
22
14
23
13
17
24
23
25
24
20
17
18
21
16
21
20
20
24
19
21
19
11
22


__Prepare data__

In [96]:
# > Prepare data
def normalize(X_train, X_test, y_train, y_test):
    """Rescale the image arrays and one-hot encode the labels.

    Parameters
    ----------
    X_train, X_test : array-like
        Image data; every value is divided by 255 (assumes 8-bit pixel
        values — TODO confirm against the preprocessing step).
    y_train, y_test : array-like
        Class labels for the two splits.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with rescaled images and
        binarized labels.
    """
    # Normalize data
    X_train = X_train / 255
    X_test = X_test / 255

    # Learn the label -> column mapping from the training labels only ...
    lb = LabelBinarizer()
    y_train = lb.fit_transform(y_train)
    # ... and reuse it for the test labels. Re-fitting on y_test would build
    # a separate mapping, so a class missing from the test split would change
    # the number and meaning of the columns relative to training.
    y_test = lb.transform(y_test)

    return X_train, X_test, y_train, y_test

In [97]:
X_train, X_test, y_train, y_test = normalize(X_train, X_test, y_train, y_test)

In [98]:
y_test.shape

(560, 28)

__Create model__

In [99]:
# Reset Keras' global state before building a new model.
# (`tf` is already imported in the imports cell at the top of the notebook.)
tf.keras.backend.clear_session()

In [100]:
# > Create model
def create_model(size = 210, n_classes = 28):
    """Build and compile a VGG16-based classifier with a small dense head.

    The VGG16 layers are loaded without the original classifier, use average
    pooling on the final layer, and are frozen. On top of them sit a Flatten
    layer, a 128-unit ReLU layer and a softmax output layer.

    Parameters
    ----------
    size : int
        Height and width of the square RGB input images.
    n_classes : int
        Number of units in the softmax output layer (default 28, the value
        that was previously hard-coded).

    Returns
    -------
    tensorflow.keras.models.Model
        Model compiled with SGD, an exponentially decaying learning rate,
        categorical cross-entropy loss and an accuracy metric.
    """
    # Print info
    print("[INFO] Initializing model")

    # > Initialize the base network
    base_model = VGG16(include_top = False, # Do not include classifier!
                       pooling = "avg", # Pooling the final layer
                       input_shape = (int(size), int(size), 3)) # Defining input shape
    # Disable training on the VGG16 layers
    for layer in base_model.layers:
        layer.trainable = False

    # > Add layers
    # Each layer object is called on the tensor it takes as input
    flat1 = Flatten()(base_model.output) # flatten the output of the last base layer
    class1 = Dense(128, activation='relu')(flat1)
    output = Dense(n_classes, activation='softmax')(class1)
    # Adding everything together
    model = Model(inputs = base_model.inputs,
                  outputs = output)

    # Print info
    print("[INFO] Compiling model")
    # Learning rate starts at 0.01 and is multiplied by 0.9 every 1000 steps
    lr_schedule = ExponentialDecay(
        initial_learning_rate=0.01,
        decay_steps=1000,
        decay_rate=0.9)

    sgd = SGD(learning_rate=lr_schedule)
    # Compiling model
    model.compile(optimizer=sgd,
                  loss="categorical_crossentropy", # binary_crossentropy for binary categories
                  metrics=["accuracy"])

    # Print info
    print("[INFO] Model compiled!")
    print("[INFO] Model summary:")
    model.summary()

    return model


In [101]:
model = create_model()

[INFO] Initializing model
[INFO] Compiling model
[INFO] Model compiled!
[INFO] Model summary:
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 210, 210, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 210, 210, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 210, 210, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 105, 105, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 105, 105, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 105, 105, 128)     147584    
                                 

__Train model__

In [102]:
# Keras ignores `validation_split` whenever `validation_data` is supplied, so
# only `validation_data` is passed here; the effective behaviour is unchanged.
# NOTE(review): this monitors training on the test set. To keep the test set
# untouched, drop `validation_data` and pass `validation_split = 0.1` instead.
history = model.fit(X_train, y_train,
                    validation_data = (X_test, y_test),
                    batch_size = 128, # power of two
                    epochs = 3,
                    verbose = 1)

Epoch 1/3
Epoch 2/3
Epoch 3/3
