In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [29]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
from os import listdir
from os.path import isfile, join
import random 
from tqdm import tqdm
import cv2
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [3]:
# Example: load a DSS dataset as a Pandas dataframe
mydataset = dataiku.Dataset("train_labels")
labels = mydataset.get_dataframe()
labels.head(5)

Unnamed: 0,name,invasive
0,1,0
1,2,0
2,3,1
3,4,0
4,5,1


## Basic Keras model in memory

### Loading the data

In [4]:
# size of image
smallimg_size = 100

In [5]:
# get images list
mypath = "/data/pgutierrez/invasive/train"
images = [f for f in listdir(mypath) if isfile(join(mypath, f))]
random.shuffle(images)
print len(images)

2295


In [6]:
x_train = []
y_train = []

for f in tqdm(images, miniters=100):
    im_number = int(f.split('.')[0])
    y_train.append(int(labels[labels['name']==im_number]['invasive']))   
    img = cv2.imread(mypath + "/"  + f.format(f))
    x_train.append(cv2.resize(img, (smallimg_size, smallimg_size)))
    
y_train = np.array(y_train)
x_train = np.array(x_train, np.float16) / 255.

print(x_train.shape)
print(y_train.shape)

split = 1836
x_train, x_valid, y_train, y_valid = x_train[:split], x_train[split:], y_train[:split], y_train[split:]

print(x_valid.shape)
print(y_valid.shape)

100%|██████████| 2295/2295 [00:50<00:00, 45.50it/s]


(2295, 100, 100, 3)
(2295,)
(459, 100, 100, 3)
(459,)


### Creating Keras model

In [35]:
import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers

model = Sequential()

model.add(Conv2D(32, kernel_size=(3, 3),activation='relu',
                 input_shape=(smallimg_size,smallimg_size, 3)))
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', 
              optimizer='adam',
              metrics=['accuracy'])


# save model weights for re init
model.save_weights('innitial.h5')


### Training model

In [41]:
model.load_weights('innitial.h5')

model.fit(x_train, y_train,
          batch_size=64,
          epochs=20,
          verbose=1,
          validation_data=(x_valid, y_valid))

Train on 1836 samples, validate on 459 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x10115710>

In [42]:
yprobas = model.predict_proba(x_valid)
ypred = [0 if x <0.5 else 1 for x in yprobas]
print "AUC: ", roc_auc_score(y_valid, yprobas, average='macro', sample_weight=None)
print classification_report(y_valid,ypred)

             precision    recall  f1-score   support

          0       0.84      0.89      0.87       171
          1       0.93      0.90      0.92       288

avg / total       0.90      0.90      0.90       459



### Early stopping

Not bad for a first model ! However we missed the best one. Let's add some early stopping using keras callbacks. 

In [46]:
# load innit model weights
model.load_weights('innitial.h5')

es = k.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')

model.fit(x_train, y_train,
          batch_size=64,
          epochs=10,
          verbose=1,
          validation_data=(x_valid, y_valid),
          callbacks = [es])

Train on 1836 samples, validate on 459 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0xe396f10>

In [47]:
yprobas = model.predict_proba(x_valid)
ypred = [0 if x <0.5 else 1 for x in yprobas]
print "AUC: ", roc_auc_score(y_valid, yprobas, average='macro', sample_weight=None)
print classification_report(y_valid,ypred)

             precision    recall  f1-score   support

          0       0.88      0.87      0.87       171
          1       0.92      0.93      0.92       288

avg / total       0.90      0.90      0.90       459



### Data augmentation

We may overfit quite fast because we have only a few lines. Let's try to augment the data ! 

In [17]:
# wow ! awesome functionality ! 
from keras.preprocessing.image import ImageDataGenerator


# This will do preprocessing and realtime data augmentation:
datagen = ImageDataGenerator(
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    rotation_range=180,  # randomly rotate images in the range (degrees, 0 to 180)
    width_shift_range=0.3,  # randomly shift images horizontally (fraction of total width)
    height_shift_range=0.3,  # randomly shift images vertically (fraction of total height)
    horizontal_flip=True,  # randomly flip images
    vertical_flip=True)  # randomly flip images


In [19]:
# load innit model weights
model.load_weights('innitial.h5') 

# early stopping
es = k.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

# fits the model on batches with real-time data augmentation:
model.fit_generator(datagen.flow(x_train, y_train, batch_size=64),
                    steps_per_epoch=len(x_train) / 64, epochs=30,validation_data=(x_valid, y_valid)
                    ,callbacks=[es])


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30


<keras.callbacks.History at 0xcf4a090>

In [34]:
yprobas = model.predict_proba(x_valid)
ypred = [0 if x <0.5 else 1 for x in yprobas]
print "AUC: ", roc_auc_score(y_valid, yprobas, average='macro', sample_weight=None)
print classification_report(y_valid,ypred)

             precision    recall  f1-score   support

          0       0.88      0.92      0.90       171
          1       0.95      0.93      0.94       288

avg / total       0.92      0.92      0.92       459



Wow ! Clear improovement ! 

Let's try to improove it with further data augmentation ! 

In [56]:
# wow ! awesome functionality ! 
from keras.preprocessing.image import ImageDataGenerator


# This will do preprocessing and realtime data augmentation:
datagen = ImageDataGenerator(

    
    zoom_range = 0.5,
    
    rotation_range=180,  # randomly rotate images in the range (degrees, 0 to 180)
    width_shift_range=0.3,  # randomly shift images horizontally (fraction of total width)
    height_shift_range=0.3,  # randomly shift images vertically (fraction of total height)
    horizontal_flip=True,  # randomly flip images
    vertical_flip=True)  # randomly flip images

#datagen.fit(x_train)

In [57]:
# load innit model weights
model.load_weights('innitial.h5') 

# early stopping
es = k.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

# fits the model on batches with real-time data augmentation:
model.fit_generator(datagen.flow(x_train, y_train, batch_size=64),
                    steps_per_epoch=len(x_train) / 64, epochs=30,validation_data=(x_valid, y_valid)
                    ,callbacks=[es])


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0xfb4c410>

In [58]:
yprobas = model.predict_proba(x_valid)
ypred = [0 if x <0.5 else 1 for x in yprobas]
print "AUC: ", roc_auc_score(y_valid, yprobas, average='macro', sample_weight=None)
print classification_report(y_valid,ypred)

             precision    recall  f1-score   support

          0       0.87      0.96      0.91       171
          1       0.98      0.91      0.94       288

avg / total       0.94      0.93      0.93       459



### Submit!

Let's do our first submission ! 

In [59]:
# get images list
mypath = "/data/pgutierrez/invasive/test"
images = [f for f in listdir(mypath) if isfile(join(mypath, f))]
random.shuffle(images)
print len(images)

x_test = []
names = []

for f in tqdm(images, miniters=100):
    im_number = int(f.split('.')[0])
    names.append(im_number)
    img = cv2.imread(mypath + "/"  + f.format(f))
    x_test.append(cv2.resize(img, (smallimg_size, smallimg_size)))
    
x_test = np.array(x_test, np.float16) / 255.

print(x_test.shape)


  0%|          | 0/1531 [00:00<?, ?it/s]

1531


100%|██████████| 1531/1531 [00:31<00:00, 49.29it/s]


(1531, 100, 100, 3)


In [73]:
# get the preds
yprobas = model.predict_proba(x_test)



In [78]:
sub = pd.DataFrame(np.stack([names,yprobas.reshape(1531,)],axis=1),columns = ["name","invasive"])
sub["name"]=sub["name"].astype(int)
sub 

Unnamed: 0,name,invasive
0,926,0.999907
1,1160,0.063761
2,500,0.119699
3,110,0.642412
4,1393,0.113904
5,696,1.000000
6,231,0.959795
7,639,0.234656
8,1256,0.129205
9,912,0.122278


### Can we go deeper ? 

Let's try in another notebook!