In [1]:
# Initialization

%matplotlib inline

In [20]:
import os

# Root folder holding the competition data. It can be overridden without
# editing the notebook by setting the RIGHTWHALE_DATA_DIR environment variable;
# the original location is kept as the default.
DATA_DIR = os.environ.get('RIGHTWHALE_DATA_DIR', '/home/ubuntu/ofir/data/rightwhale/')

# Later cells build paths by plain string concatenation (DATA_DIR + 'train/'),
# so the root must always end with a slash.
if not DATA_DIR.endswith('/'):
    DATA_DIR += '/'

# Folder where model weights are written between training runs.
SAVED_WEIGHTS_DIR = DATA_DIR + 'saved-weights/'

In [2]:
# Imports

import numpy as np
import pandas as pd
import os, glob
from matplotlib import pyplot as plt
import csv
from tqdm import tqdm
from keras import models, layers
from keras.utils.data_utils import get_file
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
# import json

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


# Right Whale Competition

TODO:

1. Download the data
1. Pre-process the data
1. Prepare training, sample, validation and test dirs
1. Finetune and fit the data
1. Predict results
1. Create a submission file
1. Submit to Kaggle.

## Download the data

Done in bash: the data was downloaded into DATA_DIR.

## Pre-process the data

Skipping this step for now. Let's start with a naive approach and see how it goes from there.

## Prepare training, validation, sample and test dirs

In [4]:
# Load the training labels. The 'whaleID' and 'Image' columns of this frame
# drive the folder layout built in the following cells.
train_csv_path = DATA_DIR + 'train.csv'
traindf = pd.read_csv(train_csv_path, sep=',')

In [78]:
# Build the train folders based on the whale IDs: one sub-folder per distinct
# value in the 'whaleID' column, created under DATA_DIR/train/.
# `mkdir -p` does not complain when a folder already exists, so this cell can be re-run.
for whale_id in tqdm(traindf['whaleID'].unique()):
    %mkdir -p '$DATA_DIR'train/'$whale_id'/
    
# Make sample, valid and test dirs, plus the folder for saved model weights
%mkdir -p '$DATA_DIR'valid/
%mkdir -p '$DATA_DIR'test/
%mkdir -p '$DATA_DIR'sample/train/
%mkdir -p '$DATA_DIR'sample/valid/
%mkdir -p '$SAVED_WEIGHTS_DIR'

In [81]:
# Move images from imgs to /train/, each into the folder of its whale ID.
# iterrows() yields (index, row) pairs, so row[1] is the actual CSV record.
for row in tqdm(traindf.iterrows()):
    whale_id = row[1]['whaleID']
    image_name = row[1]['Image']
    %mv '$DATA_DIR'imgs/'$image_name' '$DATA_DIR'train/'$whale_id'/
    
# Note that for some odd reason, mv reported "No such file or directory" for two images:
# "w_7812.jpg" and "w_7489.jpg" (see the output below).
# Upon further inspection of "w_7489.jpg", I couldn't find such a file in images.zip,
# even though the file name does exist in train.csv.

mv: cannot stat '/home/ubuntu/ofir/data/rightwhale/imgs/w_7812.jpg': No such file or directory
mv: cannot stat '/home/ubuntu/ofir/data/rightwhale/imgs/w_7489.jpg': No such file or directory


In [83]:
# Move the rest of the images from imgs to /test/.
# Whatever is still in imgs/ at this point was not moved to train/ by the
# previous cell, so it is treated as test data.
%mv '$DATA_DIR'imgs/* '$DATA_DIR'test/

In [109]:
# Move one of each training samples to the valid folder.
# For every whale ID, the first image listed for it in train.csv is moved
# (not copied) from train/<whale_id>/ to valid/<whale_id>/.
# NOTE(review): a whale with a single image in train.csv ends up with an empty
# train folder after this move -- confirm whether that is acceptable.
# NOTE(review): if the first listed image never reached train/ (see the mv error
# for w_7812.jpg in the output below), that whale's valid folder stays empty.
for whale_id in tqdm(traindf['whaleID'].unique()):
    image_name = traindf[traindf['whaleID'] == whale_id].iloc[0]['Image']
    %mkdir -p '$DATA_DIR'valid/'$whale_id'/
    %mv '$DATA_DIR'train/'$whale_id'/'$image_name' '$DATA_DIR'valid/'$whale_id'/

  0%|          | 0/447 [00:00<?, ?it/s]

mv: cannot stat '/home/ubuntu/ofir/data/rightwhale/train/whale_48813/w_7812.jpg': No such file or directory


100%|██████████| 447/447 [01:41<00:00,  4.42it/s]


In [14]:
# Copy some of the whale IDs to the sample dirs (train & valid)

sample_whale_ids = traindf.sample(10)['whaleID'].unique()

for sample_id in tqdm(sample_whale_ids):
    %cp -r '$DATA_DIR'train/'$sample_id' '$DATA_DIR'sample/train/
    %cp -r '$DATA_DIR'valid/'$sample_id' '$DATA_DIR'sample/valid/

100%|██████████| 10/10 [00:03<00:00,  2.92it/s]


## Create the VGG16 model

In [4]:
# Create helper functions for creating the network layers

def FCBlock(output_dim, activation='relu', **kwargs):
    """Create a fully connected (Dense) layer.

    Args:
        output_dim: number of units in the layer.
        activation: activation name passed to the layer (default 'relu').
        **kwargs: any extra keyword arguments, forwarded to layers.Dense.

    Returns:
        The constructed layers.Dense instance (not yet added to any model).
    """
    dense_layer = layers.Dense(output_dim=output_dim, activation=activation, **kwargs)
    return dense_layer

def ConvBlock(model, layers_num, filters):
    """Append one convolutional block to `model`, modifying it in place.

    The block is `layers_num` repetitions of (1-pixel zero padding, 3x3
    convolution with ReLU), followed by a single 2x2 max-pooling layer with
    stride 2.

    Args:
        model: the model to extend; only its add() method is used.
        layers_num: how many padded convolution layers to stack.
        filters: number of filters in each convolution.

    Returns:
        None.
    """
    for _unused in range(layers_num):
        padding_layer = layers.ZeroPadding2D(padding=(1,1))
        conv_layer = layers.Conv2D(nb_filter=filters, nb_row=3, nb_col=3, activation='relu')
        model.add(padding_layer)
        model.add(conv_layer)

    # One pooling step closes the block.
    pooling_layer = layers.MaxPooling2D(pool_size=(2,2), strides=(2,2))
    model.add(pooling_layer)

# Per-channel mean subtracted by the VGG16 preprocessing, reshaped to (3, 1, 1)
# so that it broadcasts over channels-first image batches.
vgg_mean = np.array([123.68, 116.779, 103.939]).reshape(3,1,1)

def vgg_preprocessing(x):
    """Centre a channels-first batch on the VGG16 mean and flip RGB to BGR.

    Args:
        x: batch of images with the channel on axis 1 (the model feeds it
           input of shape (3, 224, 224) per image).

    Returns:
        A new array/tensor holding `x - vgg_mean` with axis 1 reversed.
        The input itself is left untouched.
    """
    # Subtract out of place. The previous `x -= vgg_mean` modified the caller's
    # array when given a NumPy array, and raised a casting error for integer
    # pixel data because the float result could not be stored back into it.
    centered = x - vgg_mean

    # RGB -> BGR: reverse the channel axis.
    return centered[:, ::-1]

In [5]:
# Build the VGG16 network

model = models.Sequential()

# Preprocessing is the first layer of the model; it also fixes the expected
# input shape to 3x224x224 (channels first).
model.add(layers.Lambda(vgg_preprocessing, input_shape=(3,224,224)))

# Convolutional part: five blocks given as (conv layers per block, filters).
vgg_conv_blocks = [(2, 64), (2, 128), (3, 256), (3, 512), (3, 512)]
for conv_count, filter_count in vgg_conv_blocks:
    ConvBlock(model, conv_count, filter_count)

# Classifier head: flatten, two 4096-unit layers, then a 1000-way softmax.
model.add(layers.Flatten())
model.add(FCBlock(output_dim=4096))
model.add(FCBlock(output_dim=4096))
model.add(FCBlock(output_dim=1000, activation='softmax'))


In [6]:
# Fetch the pretrained VGG16 weights file and load it into the model
FILES_PATH = 'http://www.platform.ai/models/'
weights_url = FILES_PATH + 'vgg16.h5'
fpath = get_file('vgg16.h5', weights_url, cache_subdir='models')
model.load_weights(fpath)

## Finetune and fit the data

In [7]:
# More helper functions
def get_batches(directory, gen=None, shuffle=False, batch_size=64, target_size=(224,224)):
    """Create a directory iterator that yields image batches.

    Args:
        directory: folder passed to flow_from_directory.
        gen: generator object providing flow_from_directory(); when None
            (default) a fresh ImageDataGenerator() is created for this call.
        shuffle: whether batches are drawn in random order (default False,
            as before).
        batch_size: number of images per batch (default 64, as before).
        target_size: (height, width) the images are resized to
            (default (224, 224), as before).

    Returns:
        Whatever gen.flow_from_directory() returns for these settings.
    """
    # Build the default generator per call. A default of ImageDataGenerator()
    # in the signature is evaluated once, at definition time, and would then be
    # shared by every call.
    if gen is None:
        gen = ImageDataGenerator()
    return gen.flow_from_directory(directory, shuffle=shuffle, batch_size=batch_size, target_size=target_size)

In [8]:
# Batch iterators over the full training and validation folders.
train_dir = DATA_DIR + 'train/'
valid_dir = DATA_DIR + 'valid/'
train_batches = get_batches(train_dir)
valid_batches = get_batches(valid_dir)

Found 4096 images belonging to 447 classes.
Found 447 images belonging to 447 classes.


In [9]:
# Finetune the model: swap the final layer for one sized to our classes

# Drop the original 1000-way output layer.
model.pop()

# Freeze every remaining layer so that only the new output layer is trained.
for pretrained_layer in model.layers:
    pretrained_layer.trainable = False

# New softmax output with one unit per class found in the training folder.
whale_class_count = train_batches.nb_class
model.add(FCBlock(output_dim=whale_class_count, activation='softmax'))


In [10]:
# Store the classes as a list where position i holds the name of class index i
# (the inverse of the name -> index mapping in train_batches.class_indices).
name_to_index = train_batches.class_indices
classes = [None] * len(name_to_index)
for class_name, class_idx in name_to_index.items():
    classes[class_idx] = class_name

In [11]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy reported as an additional metric.
model.compile(
    optimizer=Adam(lr=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [19]:
# First 5 epochs: each epoch covers every training image once and is followed
# by an evaluation over the whole validation set.
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=5,
    validation_data=valid_batches,
    nb_val_samples=valid_batches.nb_sample
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f06fbcb3750>

In [23]:
# Checkpoint the weights reached after the first 5 training epochs
fit5_weights_path = SAVED_WEIGHTS_DIR + 'fit5.h5f'
model.save_weights(filepath=fit5_weights_path)

In [24]:
# Continue training for 5 more epochs, then checkpoint the weights again
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=5,
    validation_data=valid_batches,
    nb_val_samples=valid_batches.nb_sample
)
fit10_weights_path = SAVED_WEIGHTS_DIR + 'fit10.h5f'
model.save_weights(filepath=fit10_weights_path)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
# Continue training for another 5 epochs, then checkpoint the weights again
model.fit_generator(
    train_batches,
    samples_per_epoch=train_batches.nb_sample,
    nb_epoch=5,
    validation_data=valid_batches,
    nb_val_samples=valid_batches.nb_sample
)
fit15_weights_path = SAVED_WEIGHTS_DIR + 'fit15.h5f'
model.save_weights(filepath=fit15_weights_path)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
def predict(gen):
    """Run the global `model` over every sample of `gen` and summarise the result.

    Args:
        gen: batch iterator; gen.nb_sample predictions are requested from it.

    Returns:
        A tuple (preds, confs, names) where
        preds -- array with the index of the highest-scoring class per sample,
        confs -- list with the score of that class per sample,
        names -- array of class names, looked up in the global `classes` list.
    """
    scores = model.predict_generator(gen, gen.nb_sample)
    preds = np.argmax(scores, axis=1)

    # Keep only the winning score of each row.
    confs = [sample_scores[best_idx] for sample_scores, best_idx in zip(scores, preds)]

    pred_names = np.array(classes)[preds]
    return preds, confs, pred_names
    
# p1 = np.argmax(preds, axis=1)
# conf = [preds[i][p1[i]] for i in range(len(p1))]
# np.array(classes)[p1], p1, classes

In [17]:
# Predict on the validation set: class indices, their scores and class names.
# NOTE(review): valid_batches is the same iterator object that was passed as
# validation data to fit_generator above -- confirm it is positioned at its first
# batch here, otherwise the predictions will not line up with valid_batches.filenames.
preds, confs, pred_classes = predict(valid_batches)

In [18]:
# Eyeball the first 10 predicted class names next to the first 10 validation
# file names (whose folder name is the true whale ID).
pred_classes[:10], valid_batches.filenames[:10]

(array(['whale_89615', 'whale_90911', 'whale_87604', 'whale_90911',
        'whale_86158', 'whale_90911', 'whale_89615', 'whale_89615',
        'whale_90911', 'whale_87604'], 
       dtype='|S11'),
 ['whale_00195/w_6326.jpg',
  'whale_00442/w_9183.jpg',
  'whale_02411/w_2577.jpg',
  'whale_02608/w_6600.jpg',
  'whale_02839/w_4678.jpg',
  'whale_03103/w_8706.jpg',
  'whale_03227/w_6695.jpg',
  'whale_03623/w_1617.jpg',
  'whale_03728/w_906.jpg',
  'whale_03935/w_3236.jpg'])

## Predict results

## Create a submission file

## Submit to Kaggle

---

### Thoughts for improving:

1. Crop the images
1. Use the face detection suggestion from Kaggle.
1. Add shuffle=True to the training examples for additional randomness
1. Retrain more than one layer (don't skip the step of finetuning the last layer first, since otherwise the random weights of the last layer will throw off the weights of the other layers).

### Other tips:

1. Check the model's summary
1. Check random correct/incorrect predictions, the most confident correct/incorrect predictions, and the most unsure predictions.
1. Load weights right after saving them in order to make sure they're saved correctly.