## 1. Import Datasets

In [44]:
import os
import numpy as np
import pandas as pd

In [2]:
DIR = os.path.join(os.getcwd(), 'data/')
train_df = pd.read_csv(DIR + 'train.csv')
test = pd.read_csv(DIR + 'test_easy.csv')

In [3]:
train_df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [4]:
images = train_df['Image']
ids = train_df['Id']
classes = train_df.Id.unique()
len(classes)

5005

In [5]:
#  Encode the class names
from collections import defaultdict
labels = defaultdict(int)
i = 0
for name in classes:
    labels[name] = i
    i += 1

In [54]:
# decode. map the name id to real name
i = 0
names = {}
for name in classes:
    names[i] = name
    i += 1

In [6]:
# map the whale number to its name id
nums = train_df.shape[0]
total_targets = np.zeros(nums, dtype=int)
for i in range(nums):
    total_targets[i] = labels[ids[i]]

In [7]:
# one hot the targets 
import keras
from keras.utils import np_utils

def whale_name2one_hot(name):
    id_ = labels[name]
    total_targets = np_utils.to_categorical(total_targets, nums)
    return total_targets

def multiProcess(function, data):
    p = Pool(6)
    return p.map(function, data)

Using TensorFlow backend.


In [8]:
from PIL import Image
W = 1050
H = 450
def resize_image(filename):
    image = Image.open(filename)
    image = image.astype("float32")/255
    image = image.resize((W, H), Image.ANTIALIAS)
    image.save(DIR + 'train_resized/' + os.path.basename(filename))

In [9]:
test_files = os.listdir(DIR + 'test/')  # 这里只显示后缀

In [10]:
## finished
from multiprocessing import Pool
# p = Pool(6)
# p.map(resize_image, [(DIR + 'train/' + image) for image in images])


In [11]:
from glob import glob
image_files = glob(DIR + "train_resized/*")

In [12]:
from sklearn.model_selection import StratifiedKFold
stratSplit = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = list(stratSplit.split(image_files, ids))



In [31]:
for i in range(10):
    train_id_i, val_id_i = folds[i]
    pd.DataFrame(train_id_i).to_csv(DIR + '/folds/fold_' + str(i) + '_train', header=False, index=False)
    pd.DataFrame(val_id_i).to_csv(DIR + '/folds/fold_' + str(i) + '_valid', header=False, index=False)

In [35]:
# train_idx, val_idx = folds[1]
train_idx = pd.read_csv(DIR + '/folds/fold_0_train', header=None)
val_idx = pd.read_csv(DIR + '/folds/fold_0_valid', header=None)

In [38]:
image_files = np.array(image_files)
total_targets = np.array(total_targets)

In [39]:
import keras
from keras.preprocessing import image

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, image_files, ids, targets, batch_size=32, dim=(1050, 450, 3),
                 n_channels=3, n_classes=10, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.image_files = image_files
        self.labels = targets
        self.ids = ids
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()
    
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.ids)) / self.batch_size)
    
    def on_epoch_end(self):
        'Updates indices after each epoch'
        self.indices = np.arange(len(self.ids))
        if self.shuffle:
            np.random.shuffle(self.indices)
    
    def __data_generation(self, ids_batch):
        'Generates data containing batch_size samples'
        # Initialization
        X = np.empty((self.batch_size, self.dim[0], self.dim[1], self.n_channels))
        y = np.empty((self.batch_size), dtype=int)
        
        # Generate data
        for i, ID in enumerate(ids_batch):
            # Store sample
            img = image.load_img(self.image_files[ID])
            if img.size != (1050, 450):
                print(img.size)
            X[i,] = image.img_to_array(img)
            
            # Store class
            y[i] = self.labels[ID]
            
        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)
        
        
    def __getitem__(self, index):
        'Generate one batch of data'
        indices = self.indices[index * self.batch_size: (index+1) * self.batch_size]
        
        # Find list of IDs
        ids_batch = [self.ids[k] for k in indices]
        
        # Generate data
        X, y = self.__data_generation(ids_batch)
        
        return X, y
    

In [40]:
train_targets = total_targets[train_idx]
valid_targets = total_targets[val_idx]

In [47]:
from keras.models import Sequential

# Parameters
params = {'dim': (W, H, 3),
        'batch_size': 64,
        'n_classes': 5005,
        'n_channels': 3,
        'shuffle': True}

# Generators
train_generator = DataGenerator(image_files, train_idx, total_targets, dim=(H, W),
                                batch_size=64, n_classes = len(classes))
valid_generator = DataGenerator(image_files, val_idx, total_targets, dim=(H, W), 
                                batch_size=64, n_classes = len(classes))


In [45]:
# resnet
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D
from keras.layers import Dense, Flatten, Activation, Input, add
from keras.layers.normalization import BatchNormalization
from keras.layers.noise import GaussianNoise
from keras.models import Model

def Conv2d_BN(x, nb_filter, kernel_size, strides=(1,1), 
              padding='same', name=None):
    if name is not None:
        bn_name = name + '_bn'
        conv_name = name + '_conv'
    else:
        bn_name = None
        conv_name = None
    
    x = Conv2D(nb_filter, kernel_size, padding=padding, 
               strides=strides, name=conv_name)(x)
    x = BatchNormalization(axis=3, name=bn_name)(x)
    x = Activation('relu')(x)
    return x
 

def identity_Block(inpt, nb_filter, kernel_size, strides=(1,1), 
                   with_conv_shortcut=False):
    
    x = Conv2d_BN(inpt, nb_filter=nb_filter, kernel_size=kernel_size, strides=strides, padding='same')
    x = Conv2d_BN(x, nb_filter=nb_filter, kernel_size=kernel_size, padding='same')

    if with_conv_shortcut:
        shortcut = Conv2d_BN(inpt, nb_filter=nb_filter, kernel_size=kernel_size, strides=strides)
        x = add([x, shortcut])
        return x
    else:
        x = add([x, inpt])
        return x
    
    
def resnet(width, height, channel, classes):
    inpt = Input(shape=(width, height, channel))
    
    #conv1
    x = Conv2d_BN(inpt, nb_filter=16, kernel_size=(3, 3), strides=(2, 2), padding='valid')
    x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)
    x = GaussianNoise(0.2)(x)
    
    #conv2_x
    x = identity_Block(x, nb_filter=16, kernel_size=(3, 3))
    x = identity_Block(x, nb_filter=16, kernel_size=(3, 3))   
    x = identity_Block(x, nb_filter=16, kernel_size=(3, 3))   

    #conv3_x
#     x = identity_Block(x, nb_filter=32, kernel_size=(3, 3), strides=(2, 2), with_conv_shortcut=True)
#     x = identity_Block(x, nb_filter=32, kernel_size=(3, 3))
#     x = identity_Block(x, nb_filter=32, kernel_size=(3, 3))

    #conv4_x
#     x = identity_Block(x, nb_filter=64, kernel_size=(3, 3), strides=(2, 2), with_conv_shortcut=True)
#     x = identity_Block(x, nb_filter=64, kernel_size=(3, 3))
#     x = identity_Block(x, nb_filter=64, kernel_size=(3, 3))
    
    #conv5_x
#     x = identity_Block(x, nb_filter=128, kernel_size=(3, 3), strides=(2, 2), with_conv_shortcut=True)    
#     x = identity_Block(x, nb_filter=128, kernel_size=(3, 3))
#     x = identity_Block(x, nb_filter=128, kernel_size=(3, 3))
    
    x = AveragePooling2D(pool_size=(7, 7))(x)
    x = Flatten()(x)
    x = Dense(classes, activation='softmax')(x)
    
    model = Model(inputs=inpt, outputs=x)
    return model
 

Resnet_model = resnet(H, W, 3, 5005)  
Resnet_model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 450, 1050, 3) 0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 224, 524, 16) 448         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 224, 524, 16) 64          conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 224, 524, 16) 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
max_poolin

In [27]:
import keras.optimizers as optimizer
from keras.callbacks import ModelCheckpoint

# Resnet_model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
Resnet_model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

epochs = 20
checkpointer = ModelCheckpoint(filepath='saved_models/resnet.hdf5')
Resnet_model.fit_generator(generator=train_generator, validation_data=valid_generator,
                           use_multiprocessing=True, workers=6)


Epoch 1/1
  2/330 [..............................] - ETA: 3:24:48 - loss: 7.7142 - acc: 0.2188

KeyboardInterrupt: 

Let's try Xception.

In [42]:
# Xception
import sys
from keras.layers import *
from keras.optimizers import *
from keras.applications import *
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import backend as K

In [43]:
# fix seed for reproducible results (only works on CPU, not GPU)
seed = 9
np.random.seed(seed=seed)
tf.set_random_seed(seed=seed)

# hyper parameters for model
# nb_classes = 5005
based_model_last_block_layer_number = 126  # value is based on based model selected.
batch_size = 16
epochs = 50
learning_rate = 1e-4
momentum = 0.9
transformation_ratio = 0.05

In [None]:
base_model = Xception(input_shape=(H, W, 3), 
                      weights='imagenet', include_top=False)
    
# Top Model Block
x = base_model.output
x = GlobalAveragePooling2D()(x)
predictions = Dense(classes, activation='softmax')(x)
    
# add top layer block to base model
Xception_model = Model(base_model.input, predictions)
Xception_model.summary()
    
# first: train only the top layers. freeze all layers of the based model that is already pre-trained.
for layer in base_model.layers:
    layer.trainable = False

Xception_model.compile(optimizer='sgd', 
                      loss='sparse_categorical_crossentropy', 
                      metrics=['accuracy'])

# save weights of best training epoch
top_weights_path = 'saved_models/top_xception.h5'
callbacks_list = [
    ModelCheckpoint(top_weights_path, monitor='val_acc', verbose=1, save_best_only=True),
    EarlyStopping(monitor='val_acc', patience=5, verbose=0)
]    

Xception_model.fit_generator(generator=train_generator, validation_data=valid_generator,
                           use_multiprocessing=True, workers=6, epochs=epochs/5, callbacks=callbacks_list)

print("\nStarting to Fine Tune Model\n")

Xception_model.load_weights(top_weights_path)

# based_model_last_block_layer_number points to the layer in the model
for layer in Xception_model[:based_model_last_block_layer_number]:
    layer.trainable = False
for layer in Xception_model[based_model_last_block_layer_number:]:
    layer.trainable = True

# compile the model with a sgd/momentum optimizer
# and a very slow learning rate.
Xception_model.compile(optimizer='sgd',
                      loss='sparse_catetorical_cross_entropy',
                      metrics=['accuracy'])
final_weights_path = 'saved_models/xception.h5'
callbacks_list = [
    ModelCheckpoint(final_weights_path, monitor='val_acc', verbose=1, save_best_only=True),
    EarlyStopping(monitor='val_loss', patience=5, verbose=0)
]
Xception_model.fit_generator(train_generator, epochs=epochs, 
                             validation_data=valid_generator, callbacks=callbacks_list)

# save model
model_json = model.to_json()
with open(DIR + 'model.json', 'w') as f:
    f.write(model_json)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 450, 1050, 3) 0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 224, 524, 32) 864         input_8[0][0]                    
__________________________________________________________________________________________________
block1_conv1_bn (BatchNormaliza (None, 224, 524, 32) 128         block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_conv1_act (Activation)   (None, 224, 524, 32) 0           block1_conv1_bn[0][0]            
__________________________________________________________________________________________________
block1_con

Epoch 1/50


Process PoolWorker-24:
Process PoolWorker-20:
Process PoolWorker-22:
Process PoolWorker-19:
Process PoolWorker-13:
Process PoolWorker-17:
Process PoolWorker-14:
Process PoolWorker-18:
Traceback (most recent call last):
Process PoolWorker-21:
Process PoolWorker-23:
Traceback (most recent call last):
Traceback (most recent call last):
Process PoolWorker-15:
Traceback (most recent call last):
Traceback (most recent call last):
Process PoolWorker-16:
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaco

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  Fil

Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 114, in run
Traceback (most recent call last

Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap

Exception in thread Thread-54:
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/pool.py", line 328, in _handle_workers
    pool._maintain_pool()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/pool.py", line 232, in _maintain_pool
    self._repopulate_pool()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/pool.py", line 225, in _repopulate_pool
    w.start()
  File "/Users/yangzhenxiong/anaconda3

Exception in thread Thread-55:
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/pool.py", line 328, in _handle_workers
    pool._maintain_pool()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/pool.py", line 232, in _maintain_pool
    self._repopulate_pool()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/pool.py", line 225, in _repopulate_pool
    w.start()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 130, in start
    self._popen = Popen(self)
  File "/Users/yangzhenxiong/anaconda3/envs/kaggl

Exception in thread Thread-55:
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/pool.py", line 328, in _handle_workers
    pool._maintain_pool()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/pool.py", line 232, in _maintain_pool
    self._repopulate_pool()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/pool.py", line 225, in _repopulate_pool
    w.start()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/process.py", line 130, in start
    self._popen = Popen(self)
  File "/Users/yangzhenxiong/anaconda3/envs/kaggl



  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/pool.py", line 102, in worker



    task = get()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/multiprocessing/queues.py", line 376, in get
    return recv()
KeyboardInterrupt
Error in sys.excepthook:
Error in sys.excepthook:
Error in sys.excepthook:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Error in sys.excepthook:
Error in sys.excepthook:
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 1736, in excepthook
Traceback (most recent call last):
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 1736, in excepthook
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/site-pac

    self.schedule(lambda : self._really_send(*args, **kwargs))
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/site-packages/ipykernel/iostream.py", line 205, in schedule
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/site-packages/jupyter_client/session.py", line 596, in sign
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/site-packages/ipykernel/iostream.py", line 205, in schedule
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/site-packages/ipykernel/iostream.py", line 205, in schedule
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/site-packages/ipykernel/iostream.py", line 205, in schedule
    return str_to_bytes(h.hexdigest())
    f()
    f()
    f()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/hmac.py", line 122, in hexdigest
    h = self.auth.copy()
  File "/Users/yangzhenxiong/anaconda3/envs/kaggle-rsna18/lib/python2.7/site-package

KeyboardInterrupt: 

KeyboardInterrupt: 

KeyboardInterrupt: 

KeyboardInterrupt: 

KeyboardInterrupt: 

KeyboardInterrupt: 

In [None]:
### Predictions
import warnings
from os.path import split
with open("sample_submission.csv", "w") as f:
    with warnings.catch_warnings():
        f.write("Image, Id\n")
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        for image in test_files:
            img = Image.open(image)
            img = img.astype("float32")/255
            img = img.resize((W, H), Image.ANTIALIAS)
            y = Resnet_model.predict(img)
            name_ids = np.argsort(y)[0][::-1][:5]
            ## todo 把 label 换成 name
            names = [names[i] for i in name_ids]
            image = split(image)[-1]
            names = " ".join(names)
            f.write("{},{}\n".format(image, names))

In [30]:
from sklearn.model_selection import train_test_split
# X_train, X_valid, y_train, y_valid = train_test_split(train_files, labels, test_size=0.2, random_state=42)

# print statistics about the dataset
print('There are %d total whale categories.' % train_df['Id'].nunique())
print('There are %d total whale images.\n' % len(train_df))
print('There are %d training whale images.' % len(X_train))
print('There are %d validation whale images.' % len(X_valid))
print('There are %d test whale images.'% len(test_files))


There are 5005 total whale categories.
There are 25361 total whale images.

There are 21175 training whale images.
There are 4186 validation whale images.
There are 7960 test whale images.
