## Dog Breeds

https://www.kaggle.com/orangutan/keras-vgg19-starter

https://www.kaggle.com/gaborfodor/use-pretrained-keras-models-lb-0-3

    

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import pickle

import cv2                 # working with, mainly resizing, images
from random import shuffle # mixing up or currently ordered data that might lead our network astray in training.
from tqdm import tqdm

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import time
import matplotlib.pyplot as plt
% matplotlib inline

import time

In [2]:
os.chdir(r"D:\My Computer\DATA\Dog_Breed_Identification")

train_dir = "train"
test_dir = "test"
os.listdir()

['breed_class.csv',
 'labels.csv',
 'logs',
 'models',
 'sample_submission.csv',
 'save',
 'test',
 'train',
 'VGG19.csv']

In [3]:
labels = pd.read_csv("labels.csv")
sample_submission= pd.read_csv("sample_submission.csv")
labels.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [4]:
# ID has 32 characters
# File names have 36 characters
# suggesting the four last digits differentiate the dog within breeds

In [5]:
print("Dog Breed Number:", labels.shape)
print("Training Size:", len(os.listdir(train_dir)))
print("Test Size:", len(os.listdir(test_dir)))
print("Sample Sub Size:", sample_submission.shape)

Dog Breed Number: (10222, 2)
Training Size: 10222
Test Size: 10357
Sample Sub Size: (10357, 121)


In [6]:
def dataprep():
    targets_series = pd.Series(labels['breed'])
    one_hot = pd.get_dummies(targets_series, sparse = True)

    one_hot_labels = np.asarray(one_hot)

    im_size = 90

    x_train = []
    y_train = []
    x_test = []

    i = 0 
    for f, breed in tqdm(labels.values):
        img = cv2.imread('train/{}.jpg'.format(f))
        label = one_hot_labels[i]
        x_train.append(cv2.resize(img, (im_size, im_size)))
        y_train.append(label)
        i += 1

    # To Pickle
    for (i,x) in [(x_train, "x_train"), (y_train,"y_train"), (x_test, 'x_test')]:
        with open("{}.pickle".format(x), 'wb') as f:
            pickle.dump(i, f)

#dataprep()
        
for name in results["Model"]:
    open_file = open(os.path.join(path,"Pickle/{}.pickle".format(name)), "rb")
    dic[name] = pickle.load(open_file)
    open_file.close()

100%|███████████████████████████████████████████████████████████████████████████| 10222/10222 [00:20<00:00, 490.88it/s]


In [7]:
for f in tqdm(sample_submission['id'].values):
    img = cv2.imread('test/{}.jpg'.format(f))
    x_test.append(cv2.resize(img, (im_size, im_size)))
    


100%|███████████████████████████████████████████████████████████████████████████| 10357/10357 [00:21<00:00, 489.98it/s]


In [8]:
y_train_raw = np.array(y_train, np.uint8)
x_train_raw = np.array(x_train, np.float32) / 255.
x_test  = np.array(x_test, np.float32) / 255.

In [9]:
print(x_train_raw.shape)
print(y_train_raw.shape)
print(x_test.shape)

(10222, 90, 90, 3)
(10222, 120)
(10357, 90, 90, 3)


In [10]:
num_class = y_train_raw.shape[1]
X_train, X_valid, Y_train, Y_valid = \
train_test_split(x_train_raw, y_train_raw, test_size=0.15, random_state=1)

In [11]:
# Helpers
# Write
def write_model(model, modelname):
    preds = model.predict(x_test, verbose=0)
    sub = pd.DataFrame(preds)
    # Set column names to those generated by the one-hot encoding earlier
    col_names = one_hot.columns.values
    sub.columns = col_names
    # Insert the column id from the sample_submission at the start of the data frame
    sub.insert(0, 'id', sample_submission['id'])
    sub.to_csv("{}.csv".format(modelname), index=False)
       
    
# Store Result, Parameters and Validation Accuracy

In [12]:
import keras
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten

Using TensorFlow backend.


In [13]:
# VGG19 Pretrained Model
# Not sure if I really need to iterate over this?
from keras.applications.vgg19 import VGG19

# Research Base Models
base_model = VGG19(weights='imagenet',
                   include_top=False, input_shape=(im_size, im_size, 3))

# Add a new top layer
x = base_model.output
x = Flatten()(x)

# This is the model we will train
# This outputs the softmax, probabilistic consideration
predictions = Dense(num_class, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=predictions)

# First: train only the top layers (which were randomly initialized)
for layer in base_model.layers:
    layer.trainable = False

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 90, 90, 3)         0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 90, 90, 64)        1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 90, 90, 64)        36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 45, 45, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 45, 45, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 45, 45, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 22, 22, 128)       0         
__________

In [14]:
# Play with Min_delta
ES = keras.callbacks.EarlyStopping(monitor='val_acc', min_delta=0.0001, patience=40,
          verbose=1, mode='auto')
# Figure out how to assign name to it
TB = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=2,  
          write_graph=True, write_images=False)
# Broken?
MC = keras.callbacks.ModelCheckpoint('./save', monitor='val_acc', verbose=1,
                             save_best_only=True, save_weights_only=True,
                             mode='auto', period=5)


callbacks_list = [ES,TB]

In [15]:
# tensorboard --logdir=foo:"D:\My Computer\DATA\Dog_Breed_Identification\logs"

In [16]:
start = time.time()
model.fit(X_train, Y_train,
          validation_data=(X_valid, Y_valid),
          verbose=2, callbacks=callbacks_list,
          epochs=200)
end = time.time()
print("Model took %0.2f seconds to train"%(end - start))

Train on 8688 samples, validate on 1534 samples
Epoch 1/200
51s - loss: 4.5531 - acc: 0.0535 - val_loss: 4.2798 - val_acc: 0.0919
Epoch 2/200
14s - loss: 3.5916 - acc: 0.1953 - val_loss: 4.1482 - val_acc: 0.0939
Epoch 3/200
51s - loss: 3.1000 - acc: 0.2906 - val_loss: 4.0341 - val_acc: 0.1330
Epoch 4/200
14s - loss: 2.7420 - acc: 0.3714 - val_loss: 4.0110 - val_acc: 0.1297
Epoch 5/200
51s - loss: 2.4695 - acc: 0.4380 - val_loss: 4.0067 - val_acc: 0.1395
Epoch 6/200
14s - loss: 2.2335 - acc: 0.5033 - val_loss: 4.0338 - val_acc: 0.1278
Epoch 7/200
50s - loss: 2.0288 - acc: 0.5550 - val_loss: 4.0516 - val_acc: 0.1382
Epoch 8/200
14s - loss: 1.8598 - acc: 0.6047 - val_loss: 4.0890 - val_acc: 0.1362
Epoch 9/200
51s - loss: 1.7059 - acc: 0.6462 - val_loss: 4.1251 - val_acc: 0.1336
Epoch 10/200
14s - loss: 1.5735 - acc: 0.6838 - val_loss: 4.1369 - val_acc: 0.1362
Epoch 11/200
51s - loss: 1.4448 - acc: 0.7230 - val_loss: 4.1621 - val_acc: 0.1369
Epoch 12/200
14s - loss: 1.3351 - acc: 0.7484 - 

<keras.callbacks.History at 0x27bc0418358>

In [17]:
modelname = "VGG19"
write_model(model, modelname)

In [None]:
# Model