In [1]:
# Root directory of the data set; get_batches() appends the split directory
# name (e.g. 'train', 'valid', 'test1') to this string, so keep the trailing slash.
# The active setting is the full data set; swap in the commented-out sample/
# path below to experiment on it first (presumably a smaller subset).


path = "data/dogscats/"
#path = "data/dogscats/sample/"

In [2]:
from __future__ import division, print_function
import os, json
from glob import glob
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt

In [3]:
from numpy.random import random, permutation
from scipy import misc,ndimage
from scipy.ndimage.interpolation import zoom

import keras

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [4]:
# Load the ImageNet class names from the fast.ai file server.
#
# The base URL must end in '/': download URLs are built by plain string
# concatenation (FILES_PATH + file name), so without the slash the request
# would go to ".../modelsimagenet_class_index.json".

FILES_PATH = 'http://files.fast.ai/models/'
CLASS_FILE='imagenet_class_index.json'

import keras.utils.data_utils

# get_file() returns the local path of the (cached) download.
fpath = keras.utils.data_utils.get_file(CLASS_FILE, FILES_PATH+CLASS_FILE, cache_subdir='models')
with open(fpath) as f:
    class_dict = json.load(f)
# Keys are stringified indices; element 1 of each entry is kept
# (presumably the human-readable class name).
classes = [class_dict[str(i)][1] for i in range(len(class_dict))]

In [5]:
# Lets define the blocks that make up VGG16

def ConvBlock(layers, model, filters):
    """Append one convolutional stage to `model`.

    The stage is `layers` repetitions of (1-pixel zero padding, 3x3 ReLU
    convolution with `filters` filters), followed by a single 2x2 max-pooling
    layer with stride 2.  `model` is modified in place; nothing is returned.
    """
    conv = keras.layers.convolutional
    for _ in range(layers):
        padding = conv.ZeroPadding2D((1,1))
        model.add(padding)
        convolution = conv.Convolution2D(filters, 3, 3, activation='relu')
        model.add(convolution)
    pooling = conv.MaxPooling2D((2,2), strides=(2,2))
    model.add(pooling)
    
def FCBlock(model):
    """Append a 4096-unit ReLU dense layer followed by 50% dropout to `model`.

    `model` is modified in place; nothing is returned.
    """
    core = keras.layers.core
    for layer in (core.Dense(4096, activation='relu'), core.Dropout(0.5)):
        model.add(layer)

In [6]:
def vgg_preprocessing(x):
    """Centre each channel and reverse the channel order.

    The three constants are subtracted along axis 1 (they are reshaped to
    (3, 1, 1) so they broadcast over the trailing two axes), then axis 1 is
    reversed.  Presumably `x` is a (batch, 3, height, width) RGB batch and the
    result is mean-centred BGR, as the VGG weights expect -- confirm against
    the weight file's provenance.
    """
    channel_means = np.array([123.68, 116.779, 103.939]).reshape((3, 1, 1))
    centered = x - channel_means
    return centered[:, ::-1]

In [7]:
def VGG_16():
    """Build the VGG16 network (no weights loaded) and return it.

    Layout: preprocessing Lambda on (3, 224, 224) inputs, five convolutional
    stages, a Flatten, two 4096-unit fully connected blocks and a final
    1000-way softmax.
    """
    core = keras.layers.core
    model = keras.models.Sequential()
    model.add(core.Lambda(vgg_preprocessing, input_shape=(3,224,224)))

    # (number of conv layers, number of filters) for each convolutional stage
    conv_stages = ((2, 64), (2, 128), (3, 256), (3, 512), (3, 512))
    for n_layers, n_filters in conv_stages:
        ConvBlock(n_layers, model, n_filters)

    model.add(core.Flatten())
    for _ in range(2):
        FCBlock(model)
    model.add(core.Dense(1000, activation='softmax'))
    return model

In [8]:
# Instantiate model and populate it with downloaded weights.
# Join the base URL and the file name with exactly one '/', whether or not
# FILES_PATH already ends in one; plain concatenation of a slash-less base
# would request ".../modelsvgg16.h5".
fpath = keras.utils.data_utils.get_file('vgg16.h5', FILES_PATH.rstrip('/') + '/vgg16.h5', cache_subdir="models")
model = VGG_16()
model.load_weights(fpath)

  .format(self.name, input_shape))


In [9]:
# Build a second model on top of the same layers that starts at the original
# input and stops at the output of the layer named "dense_2", so predictions
# skip the final dense_3 (softmax) layer.
dense_2_output = model.get_layer("dense_2").output
features_model = keras.models.Model(input=model.input, output=dense_2_output)

In [10]:
# Model loaded.  It's BATCH TIME
import keras.preprocessing.image

batch_size = 64
# Random seed to ensure consistency between get_batches calls if desired
seed = 42

def get_batches(dirname, gen=keras.preprocessing.image.ImageDataGenerator(), 
                shuffle=True, batch_size=batch_size, class_mode='categorical',
                seed=None):
    """Return a directory iterator over the images in `path + dirname`.

    Images are resized to 224x224.  `shuffle`, `batch_size`, `class_mode` and
    `seed` are forwarded unchanged to `gen.flow_from_directory`.

    Note: the defaults are evaluated once, when this function is defined, so
    the default `gen` instance is shared by all calls and the default
    `batch_size` is the value the global had at definition time.
    """
    flow_options = dict(target_size=(224,224), class_mode=class_mode,
                        shuffle=shuffle, batch_size=batch_size, seed=seed)
    directory = path + dirname
    return gen.flow_from_directory(directory, **flow_options)

In [11]:
# Training and validation iterators; both use get_batches' default shuffle=True
# together with the shared seed.
# NOTE(review): the recorded output reports 0 images for 'valid' -- check that
# the validation directory is populated before relying on val_batches.
batches = get_batches('train', seed=seed, batch_size=batch_size)
val_batches = get_batches('valid', seed=seed, batch_size=batch_size)

Found 25000 images belonging to 2 classes.
Found 0 images belonging to 2 classes.


In [12]:
# Success!  We have removed the final layer.  Now we have access to the entire previous 4096 output.
# Let's try training an SVM on the top layer.  The first thing we need to do to train this SVM is to
# take the entire data set and pass it through vgg to turn it into 4096-length vectors.

# We regenerate the batches just in case (ensuring the generator is at the beginning).
# The shared `seed` variable is used instead of a literal: the label-extraction
# cell re-creates the iterator with seed=seed and relies on getting the same
# shuffled order, so both passes must always be seeded identically.
batches = get_batches('train', seed=seed, batch_size=batch_size)
features_vec = features_model.predict_generator(batches,batches.nb_sample)


Found 25000 images belonging to 2 classes.


In [13]:
# Collect one label per training image by walking a freshly created iterator
# (same seed as the feature-extraction pass).
batches = get_batches('train', seed=seed, batch_size=batch_size)

num_images = batches.nb_sample

# Pre-filled with -1. so that any slot left unwritten is easy to spot.
binary_class_vec = np.full((num_images), -1., dtype=np.float32)

# Number of full batches and the size of the trailing partial batch.
num_batches, last_batch_size = divmod(num_images, batch_size)

# Full batches: keep column 0 of each label array.
for i in range(num_batches):
    batch = next(batches)
    start = i * batch_size
    binary_class_vec[start:start + batch_size] = batch[1][:,0]

# Trailing partial batch: copy only its first last_batch_size labels.
batch = next(batches)
tail_start = num_batches * batch_size
binary_class_vec[tail_start:tail_start + last_batch_size] = batch[1][:last_batch_size,0]

Found 25000 images belonging to 2 classes.


In [14]:
# Fit a support-vector classifier (scikit-learn defaults) on the extracted features.

from sklearn import svm

# There must be exactly one label per feature row.
assert features_vec.shape[0] == binary_class_vec.shape[0]
for arr in (features_vec, binary_class_vec):
    print(arr.shape)

classifier = svm.SVC()
classifier.fit(features_vec, binary_class_vec)

(25000, 4096)
(25000,)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
# Alright!  Lets get predictions for the validation batch and see how we did
#val_batches = get_batches('valid', seed=seed, batch_size=batch_size)

# Pass them through vgg
#val_features_vec = features_model.predict_generator(val_batches,val_batches.nb_sample)

In [16]:
#val_features_vec[1]

# What do we get?
#pred_labels = classifier.predict(val_features_vec.reshape(-1,1))

In [17]:
# Now, we want to compare these predicted labels to the original labels.
# First, lets construct np.array with the ground truth

#val_batches = get_batches('valid', seed=seed, batch_size=batch_size)

#num_images = val_batches.nb_sample

#actual_labels = np.empty((num_images),dtype=np.float32)
# Iterate first amongst the batches, then through each batch

#size_batches = num_images
#num_batches = num_images // batch_size
#last_batch_size = num_images % batch_size

#for i in range(num_batches):
#    val_batch = next(val_batches)
#    actual_labels[i*batch_size:(i+1)*batch_size] = val_batch[1][:,0]

#batch = next(val_batches)
#actual_labels[num_batches*batch_size:num_batches*batch_size+last_batch_size] = batch[1][:last_batch_size,0]

In [18]:
#import sklearn.metrics
#accuracy = sklearn.metrics.accuracy_score(actual_labels, pred_labels)
#print(accuracy)

In [19]:
# Let's get the test data.
# We don't shuffle the test batch, so predictions stay in the iterator's own file order.
# Good lord please don't shuffle a test batch ever again, ok thanks
# class_mode=None is passed because the test images have no labels to load.
test_batches = get_batches("test1", shuffle=False, batch_size=batch_size, class_mode=None)

Found 12500 images belonging to 1 classes.


In [20]:
# Show the class-name -> label-index mapping used by the directory iterator.
val_batches.class_indices

{'cat': 0, 'dog': 1}

In [22]:
# Now, let's do our predictions:
# first turn every test image into a feature vector with the truncated network,
# then classify those vectors with the fitted SVM.
test_features = features_model.predict_generator(test_batches,test_batches.nb_sample)
pred_test_labels = classifier.predict(test_features)

In [28]:
# Wohoo predictions!  Let's save them to a file.
#
# The id of each row is taken from the image's file name rather than from the
# loop counter: the counter starts at 0 and follows the iterator's file order,
# which need not match the ids embedded in the file names, so enumerate()-based
# ids would attach predictions to the wrong images.
test_filenames = test_batches.filenames
# One prediction per test file, otherwise rows would be silently dropped by zip().
assert len(test_filenames) == len(pred_test_labels)

with open(path+'my_submission.csv', 'w') as f:
    f.write("id,label\n")
    for fname, ele in zip(test_filenames, pred_test_labels):
        # e.g. 'subdir/123.jpg' -> '123'
        image_id = os.path.splitext(os.path.basename(fname))[0]
        # The SVM was fit on column 0 of the training labels, so a prediction
        # of 0. means "not class 0" and is written as label 1; anything else as 0.
        label = 1 if ele == 0. else 0
        f.write("{},{}\n".format(image_id, label))

In [29]:
# Render a clickable link to the submission file in the notebook output.
from IPython.display import FileLink
FileLink(path+"my_submission.csv")