In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model, Sequential, load_model
from keras.layers import Lambda, Activation, Dropout, Flatten, Dense, GlobalMaxPooling2D, BatchNormalization, Input, Conv2D
from keras.applications.inception_v3 import InceptionV3
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
from keras import metrics
from keras.optimizers import Adam 
from keras import backend as K
from keras.layers import Activation, Dropout, Flatten, Dense, Input, Conv2D, MaxPooling2D, BatchNormalization, Concatenate, ReLU, LeakyReLU

In [None]:
from sklearn.model_selection import train_test_split

## Data downloading and preprocessing

In [None]:
# Both CSV files get the same lower-case column names.
# NOTE(review): the renamed `id` header is what later ends up in submission.csv;
# confirm the expected header casing for the submission file.
_column_renames = {'Id': 'id', 'Target': 'target'}

metadata = (
    pd.read_csv('../input/human-protein-atlas-image-classification/train.csv')
    .rename(columns=_column_renames)
)

# The submission template is kept entirely as strings.
submission = (
    pd.read_csv('../input/human-protein-atlas-image-classification/sample_submission.csv')
    .rename(columns=_column_renames)
    .astype(str)
)

In [None]:
# Class names listed in class-id order; the position in this list is the class id.
_label_names = [
    'Nucleoplasm',
    'Nuclear membrane',
    'Nucleoli',
    'Nucleoli fibrillar center',
    'Nuclear speckles',
    'Nuclear bodies',
    'Endoplasmic reticulum',
    'Golgi apparatus',
    'Peroxisomes',
    'Endosomes',
    'Lysosomes',
    'Intermediate filaments',
    'Actin filaments',
    'Focal adhesion sites',
    'Microtubules',
    'Microtubule ends',
    'Cytokinetic bridge',
    'Mitotic spindle',
    'Microtubule organizing center',
    'Centrosome',
    'Lipid droplets',
    'Plasma membrane',
    'Cell junctions',
    'Mitochondria',
    'Aggresome',
    'Cytosol',
    'Cytoplasmic bodies',
    'Rods & rings',
]

# Maps the class id, as a string ('0' ... '27'), to its human-readable name.
labels = {str(class_id): name for class_id, name in enumerate(_label_names)}

# All three lookups are keyed by the channel index along the last image axis.

# Name used as the x-axis label of each channel plot.
channels = dict(enumerate(['Microtubules', 'Nucleus', 'Protein', 'Endoplasmic reticulum']))

# Colour name used as the y-axis label of each channel plot.
channels_marker = dict(enumerate(['Red', 'Blue', 'Green', 'Yellow']))

# Matplotlib colormap used to draw each channel.
cmap_markers = dict(enumerate(['Reds', 'Blues', 'Greens', 'YlOrBr']))

In [None]:
# One-hot encode the space-separated class ids stored in `target`.
#
# pandas orders dummy columns lexicographically ('0', '1', '10', '11', ..., '2', ...),
# which would make column position i differ from class id i.  Code further down
# (labels_to_str, create_submission) turns column positions back into class ids,
# so the columns are reindexed to the key order of `labels` ('0' ... '27').
# The reindex also adds an all-zero column for any class id that never occurs
# and drops any token that is not a key of `labels`.
_target_one_hot = (
    metadata['target']
    .str.get_dummies(sep=' ')
    .reindex(columns=list(labels), fill_value=0)
)

# Replace the raw `target` column by the indicator columns, named after the classes.
metadata = pd.concat([metadata.drop(columns='target'), _target_one_hot], axis=1)
metadata = metadata.rename(columns=labels)

In [None]:
def download_data(names, folder='train'):
    """Load the four per-sample PNG files and stack them into one array.

    For every entry of ``names`` the files ``<name>_red.png``, ``<name>_blue.png``,
    ``<name>_green.png`` and ``<name>_yellow.png`` are read from
    ``<current working directory>/<folder>`` and stacked, in that order, along
    the last axis.

    Returns the per-sample stacks collected into a single ``np.ndarray``.
    """
    suffixes = ('_red.png', '_blue.png', '_green.png', '_yellow.png')
    stacked = []
    for name in names:
        prefix = os.path.join(os.getcwd(), folder, name)
        planes = [np.array(Image.open(prefix + suffix)) for suffix in suffixes]
        stacked.append(np.dstack(planes))
    return np.array(stacked)

def labels_to_str(labels_tensor, mapping=labels):
    """Render every row of a binary label matrix as text.

    labels_tensor: batch_size x n_classes matrix; entries equal to 1 mark the
        active classes of a row.
    mapping: dict from the column index (as a string) to the class name.

    Returns a list with one string per row: the names of the active classes,
    joined by newlines.
    """
    rendered = []
    for row in labels_tensor:
        active_keys = np.where(row == 1)[0].astype(str)
        rendered.append('\n'.join([mapping[key] for key in active_keys]))
    return rendered

def generate_batch(batch_size=3, metadata=metadata):
    """Yield ``(images, labels)`` pairs for consecutive slices of ``metadata``.

    batch_size: number of rows per batch; the last batch may be shorter.
    metadata: frame whose ``id`` column names the samples and whose columns
        after the first one hold the labels.
    """
    n_batches = np.ceil(metadata.shape[0] / batch_size).astype(int)
    for batch_idx in range(n_batches):
        start = batch_idx * batch_size
        chunk = metadata.iloc[start: start + batch_size]
        batch_labels = chunk.iloc[:, 1:].values
        yield download_data(chunk.id), batch_labels
        
def plot_samples(data):
    """Plot every channel of every sample plus a colour composite.

    data: ``(samples, labels)`` pair as yielded by ``generate_batch``;
        ``samples`` is indexed as [sample, row, column, channel].

    One figure row is drawn per sample: one panel per channel (coloured with
    ``cmap_markers``) followed by an 'RGB image' panel.
    """
    samples, labels = data
    labels_str = labels_to_str(labels)
    n_samples = samples.shape[0]
    n_channels = samples.shape[-1]
    # Channels are stored as red, blue, green, yellow (see `channels_marker`),
    # so imshow needs indices 0, 2, 1 to receive them in R, G, B order.
    rgb_order = [0, 2, 1]
    # squeeze=False keeps `axes` two-dimensional even when there is one sample.
    fig, axes = plt.subplots(n_samples, n_channels + 1, figsize=(16, 16), squeeze=False)
    for i in range(n_samples):
        for j in range(n_channels):
            axes[i, j].set_title(labels_str[i])
            axes[i, j].set(xlabel=channels[j], ylabel=channels_marker[j])
            axes[i, j].imshow(samples[i, :, :, j], cmap=cmap_markers[j])
        axes[i, n_channels].set_title('RGB image')
        axes[i, n_channels].imshow(samples[i][:, :, rgb_order])
        
def plot_rgb(data):
    """Plot the colour composite of every sample side by side.

    data: ``(samples, labels)`` pair as yielded by ``generate_batch``;
        ``samples`` is indexed as [sample, row, column, channel].
    """
    samples, labels = data
    labels_str = labels_to_str(labels)
    n_samples = samples.shape[0]
    # Channels are stored as red, blue, green, yellow (see `channels_marker`),
    # so imshow needs indices 0, 2, 1 to receive them in R, G, B order.
    rgb_order = [0, 2, 1]
    # squeeze=False always returns a 2-D array of axes, so a single sample
    # no longer yields a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(1, n_samples, figsize=(16, 16), squeeze=False)
    axes = axes[0]
    for j in range(n_samples):
        axes[j].set_title(labels_str[j])
        axes[j].set(xlabel='RGB')
        axes[j].imshow(samples[j][:, :, rgb_order])

## Data modeling

In [None]:
# Pre-built training images.
images = np.load('../input/proteins128/train128.npy')

# Every column after the first (`id`) is a class indicator.
# NOTE(review): this rebinds `labels`, which until here was the id -> name dict.
_label_columns = metadata.columns[1:]
labels = metadata[_label_columns].values

In [None]:
# some basic useless model
def _conv_relu(x, filters, kernel_size, padding='valid'):
    """Apply a Conv2D layer followed by a ReLU activation."""
    x = Conv2D(filters, kernel_size, padding=padding)(x)
    return ReLU()(x)


def _norm_pool_drop(x, drop_rate):
    """Apply batch normalisation, 2x2 max pooling and dropout, in that order."""
    x = BatchNormalization(axis=-1)(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    return Dropout(drop_rate)(x)


def create_model(input_shape, n_classes=28, drop_rate=0.4):
    """Build the convolutional multi-label classifier.

    input_shape: shape of one input image, e.g. ``(128, 128, 4)``.
    n_classes: number of sigmoid outputs (default 28).
    drop_rate: dropout rate used after every pooling stage (default 0.4).

    Returns an uncompiled ``keras.models.Model``.
    """
    init = Input(input_shape)

    # Divide the input by 255 and normalise it.  The normalisation must consume
    # the rescaled tensor `x`; feeding it `init` would drop the Lambda layer
    # from the graph.
    x = Lambda(lambda t: t / 255.0)(init)
    x = BatchNormalization(axis=-1)(x)

    # Stem: three unpadded 3x3 convolutions.
    for filters in (8, 8):
        x = _conv_relu(x, filters, (3, 3))
        x = BatchNormalization(axis=-1)(x)
    x = _conv_relu(x, 16, (3, 3))
    x = _norm_pool_drop(x, drop_rate)

    # Parallel 'same'-padded branches with different kernel sizes, concatenated.
    branches = [_conv_relu(x, 16, (k, k), padding='same') for k in (3, 5, 7, 1)]
    x = Concatenate()(branches)
    x = _norm_pool_drop(x, drop_rate)

    # Three conv / pool stages with a growing number of filters.
    for filters in (32, 64, 128):
        x = _conv_relu(x, filters, (3, 3))
        x = _norm_pool_drop(x, drop_rate)

    # Classification head.
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    x = Dense(28)(x)
    x = ReLU()(x)
    x = BatchNormalization(axis=-1)(x)
    x = Dropout(0.1)(x)
    x = Dense(n_classes)(x)
    x = Activation('sigmoid')(x)

    return Model(init, x)

In [None]:
# Build the network for 128x128 images with 4 channels.
model = create_model(input_shape=(128, 128, 4))

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
from tensorflow import where, is_nan, zeros_like, ones_like, equal
def f1(y_true, y_pred):
    """F1 score of a batch, averaged over the class axis.

    Predictions are rounded first.  True positives, false positives and false
    negatives are summed over axis 0, and the resulting per-column F1 values
    are averaged.

    y_true: 0/1 label tensor.
    y_pred: predicted probabilities, same shape as ``y_true``.
    """
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    # K.epsilon() keeps the divisions defined when a column has no positives.
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())

    score = 2 * precision * recall / (precision + recall + K.epsilon())
    # Replace any NaN entry by 0 before averaging.
    score = where(is_nan(score), zeros_like(score), score)
    return K.mean(score)

def focal_loss(gamma=2.0, alpha=0.75):
    """Build a focal loss function for 0/1 targets.

    gamma: exponent applied to the modulating factor.
    alpha: weight of the positive-label term; the negative-label term is
        weighted by ``1 - alpha``.

    Returns a ``loss(y_true, y_pred)`` callable.
    """
    def focal_loss_fixed(y_true, y_pred):
        eps = 1e-4
        # Clip so that neither logarithm below receives 0.
        clipped = K.clip(y_pred, eps, 1.0 - eps)
        # Positions that do not belong to a term are filled with the value
        # that makes that term vanish (1 for positives, 0 for negatives).
        pt_1 = where(equal(y_true, 1), clipped, ones_like(clipped))
        pt_0 = where(equal(y_true, 0), clipped, zeros_like(clipped))
        positive_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
        negative_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
        return -positive_term - negative_term
    return focal_loss_fixed

def conv_loss():
    """Build a loss mixing focal loss (weight 0.1) and binary cross-entropy (weight 0.9).

    The focal part is the module-level ``fl``, looked up when the returned
    function is called.
    """
    def f(y_true, y_pred):
        focal_part = fl(y_true, y_pred)
        crossentropy_part = K.binary_crossentropy(y_true, y_pred)
        return 0.1 * focal_part + 0.9 * crossentropy_part
    return f

# Focal loss with its default parameters spelled out; passed to compile() as a metric.
fl = focal_loss(gamma=2.0, alpha=0.75)
# Combined focal / cross-entropy loss.
fc = conv_loss()

In [None]:
# Save the weights (only) to weights.h5 whenever the validation loss improves.
checkpoint = ModelCheckpoint(
    'weights.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
    save_weights_only=True,
)

# Learning-rate reduction on plateau is not used.  To try it, append
# ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, mode='auto')
# to callbacks_list.

# Stop training once the validation loss has not improved for 10 epochs.
early = EarlyStopping(monitor='val_loss', mode='min', patience=10)

callbacks_list = [checkpoint, early]

In [None]:
# Training schedule: number of passes over the data and samples per gradient step.
epochs, batch_size = 20, 128

In [None]:
# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
_split = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=52,
)
images_train, images_test, labels_train, labels_test = _split

In [None]:
# Train on binary cross-entropy; the focal loss `fl` and accuracy are only reported.
_optimizer = Adam(lr=1e-4)
model.compile(
    optimizer=_optimizer,
    loss=['binary_crossentropy'],
    metrics=[fl, 'acc'],
)

In [None]:
# NOTE(review): the checkpoint callback stores the best weights in weights.h5, but no
# load_weights call is visible in this notebook, so later cells appear to use the
# weights of the last epoch - confirm this is intended.
model.fit(
    x=images_train,
    y=labels_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks_list,
    validation_data=(images_test, labels_test),
)

## Threshold selection and prediction

In [None]:
# Predicted probabilities on the training split, used below to pick thresholds.
pred_train = model.predict(x=images_train)

In [None]:
from sklearn.metrics import f1_score

def _best_thresholds(y_true, y_prob, candidates):
    """Pick, for every class column, the candidate threshold with the best F1.

    y_true: 0/1 label matrix (samples x classes).
    y_prob: predicted probabilities with the same shape as ``y_true``.
    candidates: iterable of thresholds to try, in order.

    Returns an array with one threshold per class.  Ties keep the first
    candidate that reached the best score; a class whose score never exceeds
    0 gets the threshold 0.0.
    """
    best = []
    # The number of classes is taken from the data instead of being hard-coded.
    for class_idx in range(y_true.shape[1]):
        best_threshold, best_score = 0.0, 0.0
        for candidate in candidates:
            score = f1_score(y_true[:, class_idx],
                             y_prob[:, class_idx] > candidate,
                             average='macro')
            if score > best_score:
                best_threshold, best_score = candidate, score
        best.append(best_threshold)
    return np.array(best)


thresholds = np.linspace(0, 1, 200)
# NOTE(review): the thresholds are tuned on predictions for the training split.
th = _best_thresholds(labels_train, pred_train, thresholds)

In [None]:
# Show the per-class thresholds selected above.
th

In [None]:
# Macro F1 on the held-out split; `th` broadcasts across the rows of the predictions.
pred_test = model.predict(images_test)
f1_score(labels_test, pred_test > th, average='macro')

In [None]:
# Load the pre-built array of test images.
test = np.load('../input/proteins128/test128.npy')

In [None]:
def create_submission(model, test_images, test_csv, thresholds):
    for i, image in enumerate(test_images):
        y_pred = model.predict(np.expand_dims(image, axis=0)) > thresholds
        test_csv.at[i, 'Predicted'] = ' '.join(np.where(y_pred[0])[0].astype(str))
    return test_csv

In [None]:
# Fill in the predictions, then write the file without the index column.
filled_submission = create_submission(model, test, submission, th)
filled_submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows instead of rendering the whole frame.
submission.head()