<h1>Sommaire<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-data" data-toc-modified-id="Import-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import data</a></span></li><li><span><a href="#Prepare-data" data-toc-modified-id="Prepare-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Prepare data</a></span><ul class="toc-item"><li><span><a href="#Prepare-functions" data-toc-modified-id="Prepare-functions-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Prepare functions</a></span></li><li><span><a href="#Pipeline" data-toc-modified-id="Pipeline-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Pipeline</a></span></li></ul></li><li><span><a href="#LightGBM" data-toc-modified-id="LightGBM-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>LightGBM</a></span><ul class="toc-item"><li><span><a href="#Metric" data-toc-modified-id="Metric-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Metric</a></span></li><li><span><a href="#Find-best-hyper-parameters-by-cross-validation-and-sub-sampling" data-toc-modified-id="Find-best-hyper-parameters-by-cross-validation-and-sub-sampling-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Find best hyper-parameters by cross-validation and sub-sampling</a></span></li><li><span><a href="#Train-LGBM,-cross-validation-evaluation-and-predict-on-test-set" data-toc-modified-id="Train-LGBM,-cross-validation-evaluation-and-predict-on-test-set-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Train LGBM, cross-validation evaluation and predict on test set</a></span></li></ul></li><li><span><a href="#Neural-network" data-toc-modified-id="Neural-network-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Neural network</a></span><ul class="toc-item"><li><span><a href="#Compute-MoA-weights-from-train-set" data-toc-modified-id="Compute-MoA-weights-from-train-set-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Compute MoA weights from train set</a></span></li><li><span><a 
href="#Weight-features-from-LGBM-features-importances" data-toc-modified-id="Weight-features-from-LGBM-features-importances-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Weight features from LGBM features importances</a></span></li><li><span><a href="#Define-model-and-weighted-loss" data-toc-modified-id="Define-model-and-weighted-loss-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Define model and weighted loss</a></span></li><li><span><a href="#Dataset-generator" data-toc-modified-id="Dataset-generator-4.4"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Dataset generator</a></span></li><li><span><a href="#Train-folds" data-toc-modified-id="Train-folds-4.5"><span class="toc-item-num">4.5&nbsp;&nbsp;</span>Train folds</a></span></li></ul></li><li><span><a href="#Stacking-:-LGBM-+-NN" data-toc-modified-id="Stacking-:-LGBM-+-NN-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Stacking : LGBM + NN</a></span><ul class="toc-item"><li><span><a href="#Train-dataset" data-toc-modified-id="Train-dataset-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Train dataset</a></span></li><li><span><a href="#Test-set" data-toc-modified-id="Test-set-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Test set</a></span></li><li><span><a href="#Check-LGBM-and-NN-performance-on-val-folds" data-toc-modified-id="Check-LGBM-and-NN-performance-on-val-folds-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Check LGBM and NN performance on val folds</a></span></li><li><span><a href="#Train-Logistic-Regression-on-LGBM+NN-features" data-toc-modified-id="Train-Logistic-Regression-on-LGBM+NN-features-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Train Logistic Regression on LGBM+NN features</a></span></li></ul></li><li><span><a href="#Postprocess-prediction" data-toc-modified-id="Postprocess-prediction-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Postprocess prediction</a></span></li><li><span><a href="#Plot-results" 
data-toc-modified-id="Plot-results-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Plot results</a></span></li></ul></div>

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.special import softmax
from sklearn.model_selection import KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

from lightgbm import LGBMClassifier

from joblib import dump, load

import tensorflow as tf

# Import data

In [None]:
# directory holding the competition CSV files
data_path = '../input/lish-moa/'

# train files : features, scored targets, non-scored targets
features_file, targets_file, no_targets_file = (
    'train_features.csv',
    'train_targets_scored.csv',
    'train_targets_nonscored.csv',
)

# test file : features only
test_file = 'test_features.csv'

In [None]:
# train data : features, scored targets and non-scored targets (one CSV each)
df_features, df_targets, df_no_targets = (
    pd.read_csv(os.path.join(data_path, file_name))
    for file_name in (features_file, targets_file, no_targets_file)
)

# test data : features only
df_test = pd.read_csv(os.path.join(data_path, test_file))

In [None]:
# keep columns names lists
# columns names = 'sig_id' + 'cp_type' + features_quali + features_quanti + scored_targets + no_scored_targets
#
# The lists are built with order-preserving comprehensions rather than set
# differences: the iteration order of a set of strings changes from one kernel
# start to the next, which would reorder model inputs / outputs between runs.
scored_targets = [col for col in df_targets.columns if col != 'sig_id']
no_scored_targets = [col for col in df_no_targets.columns if col != 'sig_id']
features_quali = ['cp_time', 'cp_dose']

# every column that is NOT a quantitative feature
not_quanti = (set(scored_targets)
              | set(no_scored_targets)
              | set(features_quali)
              | {'sig_id', 'cp_type'})
features_quanti = [col for col in df_features.columns if col not in not_quanti]

print('Scored targets count : {}'.format(len(scored_targets)))
print('No scored targets count : {}'.format(len(no_scored_targets)))
print('Features quali count : {}'.format(len(features_quali)))
print('Features quanti count : {}'.format(len(features_quanti)))

In [None]:
# split the quantitative features by name :
# names containing 'c-' are cell viability, names containing 'g-' are gene expression
cells = [name for name in features_quanti if 'c-' in name]
genes = [name for name in features_quanti if 'g-' in name]
print('Features genes count : {}'.format(len(genes)))
print('Features cells count : {}'.format(len(cells)))

In [None]:
# sanity check : no sig_id may appear twice in the features table
test = df_features.loc[:, 'sig_id'].is_unique
print('sig_id unique : {}'.format(test))

In [None]:
# sanity check : is there any missing value in the features table ?
test = df_features.isna().to_numpy().any()
print('Missing data : {}'.format(test))

# Prepare data

## Prepare functions

In [None]:
# merge features and targets
def merge_features_targets(df_features, df_targets, df_no_targets):
    '''
    Left-join the scored targets, then the non-scored targets, onto the
    features table using 'sig_id' as key.

    input : features dataframe, scored targets dataframe, non-scored targets
            dataframe (each with a 'sig_id' column)
    output : one dataframe with the features rows and all targets columns
    raises : pandas MergeError when a join is not one-to-one on 'sig_id'
    '''
    df_data = df_features
    for df_labels in (df_targets, df_no_targets):
        df_data = df_data.merge(
            df_labels, how='left', on='sig_id', validate='one_to_one')
    print('- Merge features and targets')
    print('   Data shape : {}'.format(df_data.shape))
    return df_data

In [None]:
# separate compound and control
def separate_compound_control(df_data):
    '''
    Split the rows according to the 'cp_type' column.

    input : dataframe with a 'cp_type' column
    output : (rows where cp_type is 'trt_cp', rows where cp_type is 'ctl_vehicle')
    '''
    cp_type = df_data['cp_type']
    is_compound = cp_type == 'trt_cp'
    is_control = cp_type == 'ctl_vehicle'
    df_compound, df_control = df_data[is_compound], df_data[is_control]
    print('- Separate compound and control')
    print('   Compound shape : {}'.format(df_compound.shape))
    print('   Control shape : {}'.format(df_control.shape))
    return df_compound, df_control

In [None]:
# onehot encoding qualitatives variables
def onehot(df_compound, features_quali, mean=None, std=None, train=True):
    '''
    One-hot encode 'cp_dose' and standardise 'cp_time'.

    input :
        df_compound : dataframe holding 'sig_id', the qualitative features and
                      the quantitative features (plus the targets when train=True)
        features_quali : qualitative columns to encode (must contain 'cp_time'
                         and 'cp_dose')
        mean, std : statistics used to standardise 'cp_time'; computed from
                    df_compound when left to None (pass the train values when
                    preparing the test set)
        train : when True the scored and non-scored targets columns are kept
    output : (prepared dataframe, mean, std, names of the encoded columns)

    Relies on the notebook-level lists features_quanti, scored_targets and
    no_scored_targets.
    '''
    onehot_data = pd.get_dummies(
        df_compound[features_quali], columns=['cp_dose'])
    features_onehot = list(onehot_data.columns)
    # standardisation 'cp_time'
    # ('is None' : identity test, '==' would be evaluated element-wise on arrays)
    if mean is None:
        mean = onehot_data['cp_time'].mean()
    if std is None:
        std = onehot_data['cp_time'].std()
    onehot_data['cp_time'] = (onehot_data['cp_time'] - mean) / std
    # add onehot : targets columns only exist in the train set
    kept_columns = ['sig_id'] + features_quanti
    if train:
        kept_columns = kept_columns + scored_targets + no_scored_targets
    df_compound = pd.concat(
        [onehot_data, df_compound[kept_columns]], axis=1)
    print('- Onehot encoding qualitatives variables')
    return df_compound, mean, std, features_onehot

## Pipeline

In [None]:
# train set : prepare data pipeline
df_data = merge_features_targets(df_features, df_targets, df_no_targets)
df_compound_train, df_control_train = separate_compound_control(df_data)
features = genes + cells
df_compound_train, mean, std, features_onehot = onehot(
    df_compound_train, features_quali)
print('Train set : compound shape : {}'.format(df_compound_train.shape))

In [None]:
# test set : prepare data pipeline
# same steps as the train set, without targets and reusing the train mean / std
df_compound_test, df_control_test = separate_compound_control(df_test)
df_compound_test, mean, std, features_onehot = onehot(
    df_compound_test, features_quali, mean=mean, std=std, train=False)
print('Compound shape : {}'.format(df_compound_test.shape))

In [None]:
# model inputs : encoded qualitative features, then genes, then cells
features = [*features_onehot, *genes, *cells]

In [None]:
# train set
# .copy() gives X_train / Y_train their own data : later cells assign columns
# into them, which raises SettingWithCopyWarning on a plain column selection
X_train = df_compound_train[['sig_id'] + features].copy()
print(X_train.shape)
Y_train = df_compound_train[scored_targets].copy()
print(Y_train.shape)

In [None]:
# test set
# .copy() gives X_test its own data : a later cell assigns columns into it,
# which raises SettingWithCopyWarning on a plain column selection
X_test = df_compound_test[['sig_id'] + features].copy()
print(X_test.shape)

In [None]:
# X_train = X_train.head(100)
# Y_train = Y_train.head(100)

# LightGBM

## Metric

In [None]:
def moa_metric(y_true, y_pred):
    '''
    Mean binary log loss over every element of the arrays.

    input : y_true (0/1 labels) and y_pred (probabilities), numpy arrays of
            the same shape
    output : scalar loss; predictions are clipped to [1e-15, 1 - 1e-15] so
             the logarithms stay finite
    '''
    eps = 1e-15
    y_true = y_true.astype('float64')
    y_pred = np.clip(y_pred.astype('float64'), eps, 1. - eps)
    positive_term = y_true * np.log(y_pred)
    negative_term = (1. - y_true) * np.log(1. - y_pred)
    return - np.mean(positive_term + negative_term)


# scikit scorer wrapping moa_metric for the grid search below
# - greater_is_better=False : moa_metric is a loss, so the scorer reports its
#   negated value (hence the negative 'Metric on val set' recorded further down)
# - needs_proba=True : the scorer is fed probabilities instead of hard labels
# NOTE(review): recent scikit-learn releases appear to replace `needs_proba`
# with `response_method` - confirm against the installed version
metric = make_scorer(moa_metric, greater_is_better=False, needs_proba=True)

## Find best hyper-parameters by cross-validation and sub-sampling

In [None]:
# hyper-parameters grid ; the 'estimator__' prefix addresses the
# LGBMClassifier wrapped inside OneVsRestClassifier
parameters = {
    'estimator__n_estimators': [100, 200, 500],
    'estimator__learning_rate': [0.01, 0.001, 0.0001],
    'estimator__max_depth': [4, 6, 8],
    'estimator__subsample': [0.5, 0.75, 1.],
    'estimator__colsample_bytree': [0.6, 0.8, 1.],
}

# LightGBM classifier : one binary model per target
clf_gb = OneVsRestClassifier(LGBMClassifier(), n_jobs=-1)

# exhaustive search, 5-fold cross-validation, scored with the MoA log loss
clf = GridSearchCV(estimator=clf_gb,
                   param_grid=parameters,
                   scoring=metric,
                   cv=5,
                   n_jobs=-1,
                   verbose=1,
                   return_train_score=True)

# the search itself is kept commented out (run on a 200 rows sub-sample)
# %time clf.fit(X_train[features].values[:200,...], Y_train[scored_targets].values[:200,...])
# pd.DataFrame(clf.cv_results_)

In [None]:
# print('Best hyper-parameters : {}'.format(clf.best_params_))
# print('Metric on val set : {}'.format(clf.best_score_))

# Best hyper-parameters : {'estimator__colsample_bytree': 0.6, 'estimator__learning_rate': 0.01, 'estimator__max_depth': 4, 'estimator__n_estimators': 100, 'estimator__subsample': 0.5}
# Metric on val set : -0.05989294552932979

## Train LGBM, cross-validation evaluation and predict on test set

In [None]:
# folds
n_splits = 5
skf = KFold(n_splits=n_splits, random_state=1, shuffle=True)

# per fold : [metric on train fold, metric on val fold,
#             predictions on val fold, predictions on test set]
lgbm_models = {}
# per fold : features importances averaged over the per-target estimators
features_importances = {}

# train each folds
for n_fold, (train_index, test_index) in enumerate(skf.split(X_train.values, Y_train.values)):
    X_train_fold = X_train[features].values[train_index]
    Y_train_fold = Y_train[scored_targets].values[train_index]
    X_val_fold = X_train[features].values[test_index]
    Y_val_fold = Y_train[scored_targets].values[test_index]
    # define model
    clf_lgbm = OneVsRestClassifier(LGBMClassifier(colsample_bytree=0.6, # best hyper-parameters
                                                  learning_rate=0.01,
                                                  max_depth=4,
                                                  n_estimators=100,
                                                  subsample=0.5), n_jobs=-1)
    # train on train set fold
    print('Train fold : {}'.format(n_fold + 1))
    clf_lgbm.fit(X=X_train_fold,
                 y=Y_train_fold)

    # get features importance for each estimators
    importances = np.zeros((X_train_fold.shape[1],))
    # the counter is initialised once, before the loop, so that the sum below
    # is divided by the number of estimators that actually contributed
    count_estimators = 0
    for estimator in clf_lgbm.estimators_:
        try:
            estimator_importances = estimator.feature_importances_
        except AttributeError:
            # a target that is constant in the fold gets a placeholder
            # predictor without feature importances : skip it
            continue
        importances = importances + estimator_importances
        count_estimators = count_estimators + 1
    if count_estimators > 0:
        importances = importances / count_estimators
    else:
        importances = None
    # save features importance for each fold
    features_importances['fold_{}'.format(n_fold + 1)] = importances

    # evaluation on train fold
    Y_pred_train_fold = clf_lgbm.predict_proba(X_train_fold)
    metric_train_fold = moa_metric(Y_train_fold, Y_pred_train_fold)
    print('Metric on train fold : {}'.format(metric_train_fold))

    # prediction and evaluation on val fold
    Y_pred_val_fold = clf_lgbm.predict_proba(X_val_fold)
    metric_val_fold = moa_metric(Y_val_fold, Y_pred_val_fold)
    Y_pred_val_fold = pd.DataFrame(Y_pred_val_fold, columns=scored_targets)
    Y_pred_val_fold['sig_id'] = X_train['sig_id'].values[test_index]
    print('Metric on validation fold : {}'.format(metric_val_fold))

    # prediction on test set
    Y_pred_test_fold = clf_lgbm.predict_proba(X_test[features])
    Y_pred_test_fold = pd.DataFrame(Y_pred_test_fold, columns=scored_targets)
    Y_pred_test_fold['sig_id'] = X_test['sig_id'].values

    # keep predictions and metric
    lgbm_models['fold_{}'.format(n_fold + 1)] = [metric_train_fold,
                                                 metric_val_fold,
                                                 Y_pred_val_fold[[
                                                     'sig_id'] + scored_targets],
                                                 Y_pred_test_fold[['sig_id'] + scored_targets]]

In [None]:
# get features importances (used by neural network model)
importances = np.array(
    [importance_fold for importance_fold in features_importances.values()])
importances = np.sum(importances, axis=0) / importances.shape[0]
importances = importances / np.max(importances)
importances.shape

In [None]:
# distribution of the normalised features importances
# (title and axis labels added so the figure can be read on its own ;
# the trailing ';' hides the textual repr of the last call)
fig, ax = plt.subplots()
ax.hist(importances)
ax.set_title('Distribution of features importances from LGBM')
ax.set_xlabel('Importance (max = 1)')
ax.set_ylabel('Features count');

In [None]:
# one row per model input, most important feature first
features = features_onehot + genes + cells
df_importances = (
    pd.DataFrame({'importance': importances}, index=features)
    .sort_values(by='importance', ascending=False)
)
# both ends of the ranking, 20 features each
df_plot_imp_max = df_importances.iloc[:20]
df_plot_imp_min = df_importances.iloc[-20:]

In [None]:
# bar plot of the top of the ranking ; the title states 20 features because
# df_plot_imp_max holds the first 20 rows of the sorted importances
ax = df_plot_imp_max.plot.bar()
ax.set_title('Features importance from LGBM\n(20 most important features)')
ax.set_ylabel('Importance')
ax.set_xlabel('Features')

In [None]:
# bar plot of the bottom of the ranking ; the title states 20 features because
# df_plot_imp_min holds the last 20 rows of the sorted importances
ax = df_plot_imp_min.plot.bar()
ax.set_title('Features importance from LGBM\n(20 least important features)')
ax.set_ylabel('Importance')
ax.set_xlabel('Features')

In [None]:
# concatenate prediction on train set folds (used as features during stacking)
df_lgbm_pred = pd.concat([model[2] for model in lgbm_models.values()])
df_lgbm_pred.head(5)

# Neural network

## Compute MoA weights from train set
(needed by the weighted loss used to train the neural network)

In [None]:
# MoA weights computed from the train set, one per scored target
# weight : in [1, 2] ; 1 for the most frequent target, close to 2 for the rarest ones
occurence = Y_train[scored_targets].values.sum(axis=0)
max_occurence = occurence.max()
weights = 1 + (max_occurence - occurence) / max_occurence
weights

## Weight features from LGBM features importances

In [None]:
# weight features
# The weighted values are recomputed from the unweighted prepared frames
# (df_compound_train / df_compound_test, same rows selected through the index)
# instead of from X_train / X_test themselves : running this cell again then
# gives the same result rather than multiplying by the importances twice.
X_train[features] = df_compound_train.loc[X_train.index, features] * importances
X_test[features] = df_compound_test.loc[X_test.index, features] * importances

## Define model and weighted loss

In [None]:
# nn architecture


def get_model(input_shape):
    '''
    Build the (uncompiled) dense network.

    input : input_shape, forwarded to tf.keras.Input
    output : tf.keras.Model with one sigmoid unit per scored target

    Architecture : BatchNormalization + Dropout(0.6) on the inputs, then two
    hidden blocks (Dense relu 2048 then 1024, each followed by
    BatchNormalization + Dropout(0.6)) and the sigmoid output layer.
    '''
    dropout_rate = 0.6
    inputs = tf.keras.Input(input_shape)

    # input block
    x = tf.keras.layers.BatchNormalization()(inputs)
    x = tf.keras.layers.Dropout(dropout_rate)(x)

    # hidden blocks
    for units in (2048, 1024):
        x = tf.keras.layers.Dense(units, activation="relu")(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(dropout_rate)(x)

    # one independent probability per scored target
    outputs = tf.keras.layers.Dense(
        len(scored_targets), activation="sigmoid")(x)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
def tf_moa_metric(y_true, y_pred):
    '''
    TensorFlow version of the MoA metric : mean binary log loss over every
    element, computed in float64 with predictions clipped to
    [1e-15, 1 - 1e-15].

    output : scalar tensor
    '''
    eps = 1e-15
    labels = tf.cast(y_true, dtype='float64')
    probas = tf.clip_by_value(
        tf.cast(y_pred, dtype='float64'), eps, 1. - eps)
    per_element = (labels * tf.math.log(probas)) + \
        ((1. - labels) * tf.math.log(1. - probas))
    return - tf.math.reduce_mean(per_element)

In [None]:
# custom loss = weighted loss
def tf_moa_loss(y_true, y_pred):
    y_true = tf.cast(y_true, dtype='float64')
    y_pred = tf.cast(y_pred, dtype='float64')
    y_pred = tf.maximum(tf.minimum(y_pred, 1. - 1e-15), 1e-15)
    log_loss = (y_true * tf.math.log(y_pred)) + \
        ((1. - y_true) * tf.math.log(1. - y_pred))
    log_loss_weighted = log_loss * weights
    return - tf.math.reduce_mean(log_loss_weighted)

## Dataset generator

In [None]:
def get_dataset(X_train, Y_train, X_test, Y_test, batch_size):
    '''
    Wrap the arrays into tf.data datasets.

    input : features / labels arrays for the train part and for the
            evaluation part, and the train batch size
    output : (ds_train, ds_test)
        ds_train : shuffled over all its rows, batched by batch_size, with prefetch
        ds_test : not shuffled, delivered as a single batch holding every row
    '''
    train_tensors = (X_train.astype(float), Y_train.astype(float))
    ds_train = (
        tf.data.Dataset.from_tensor_slices(train_tensors)
        .shuffle(X_train.shape[0])
        .batch(batch_size)
        .prefetch(batch_size * 2)
    )

    test_tensors = (X_test.astype(float), Y_test.astype(float))
    ds_test = (
        tf.data.Dataset.from_tensor_slices(test_tensors)
        .batch(X_test.shape[0])
    )

    return ds_train, ds_test

## Train folds

In [None]:
# Train one neural network per fold and keep, for each fold, its metrics and
# its predictions on the val fold and on the test set.
batch_size = 32
epochs = 75

# same KFold settings (n_splits, random_state, shuffle) as the LGBM cell
skf = KFold(n_splits=n_splits, random_state=1, shuffle=True)
# per fold : [metric on train fold, metric on val fold,
#             predictions on val fold, predictions on test set]
nn_models = {}

# train each folds
for n_fold, (train_index, test_index) in enumerate(skf.split(X_train.values, Y_train.values)):
    X_train_fold = X_train[features].values[train_index]
    Y_train_fold = Y_train[scored_targets].values[train_index]
    X_val_fold = X_train[features].values[test_index]
    Y_val_fold = Y_train[scored_targets].values[test_index]

    # get dataset
    ds_train, ds_val = get_dataset(
        X_train_fold, Y_train_fold, X_val_fold, Y_val_fold, batch_size)

    # get model (a fresh one for every fold)
    model = get_model(X_train_fold.shape[1])

    # optimizer : Adam starting at 0.01, trained on the weighted loss and
    # monitored with the unweighted MoA metric
    model.compile(optimizer=tf.keras.optimizers.Adam(0.01),
        loss=tf_moa_loss,
        metrics=[tf_moa_metric])

    # callback
    # - learning rate multiplied by 0.1 after 5 epochs without val_loss improvement
    # - weights saved to disk each time val_loss reaches a new minimum
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, mode='min',
                                                     patience=5, min_lr=0.00001, verbose=1)
    # note : the file name uses the 0-based fold index, the dict keys below are 1-based
    checkpoint_path = 'weights_fold_{}.hdf5'.format(n_fold)
    cb_checkpt = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=0, save_best_only=True,
                                                    save_weights_only=True, mode='min')

    # train
    print('Train fold : {}'.format(n_fold + 1))
    history = model.fit(x=ds_train, epochs=epochs,
                        validation_data=ds_val, callbacks=[reduce_lr, cb_checkpt])

    # load best weights (the checkpoint with the lowest val_loss, not the last epoch)
    model.load_weights(checkpoint_path)

    # evaluate on train fold
    # (tf_moa_metric returns a tensor, which is what gets stored and printed)
    Y_pred_train_fold = model.predict(X_train_fold)
    metric_train_fold = tf_moa_metric(Y_train_fold, Y_pred_train_fold)
    print('Metric on train fold : {}'.format(metric_train_fold))

    # predict and evaluate on val fold
    Y_pred_val_fold = model.predict(X_val_fold)
    metric_val_fold = tf_moa_metric(Y_val_fold, Y_pred_val_fold)
    Y_pred_val_fold = pd.DataFrame(Y_pred_val_fold, columns=scored_targets)
    Y_pred_val_fold['sig_id'] = X_train['sig_id'].values[test_index]
    print('Metric on validation fold : {}'.format(metric_val_fold))

    # predict on test set
    Y_pred_test_fold = model.predict(X_test[features])
    Y_pred_test_fold = pd.DataFrame(Y_pred_test_fold, columns=scored_targets)
    Y_pred_test_fold['sig_id'] = X_test['sig_id'].values

    # keep predictions and metric
    nn_models['fold_{}'.format(n_fold + 1)] = [metric_train_fold,
                                               metric_val_fold,
                                               Y_pred_val_fold[[
                                                   'sig_id'] + scored_targets],
                                               Y_pred_test_fold[['sig_id'] + scored_targets]]

In [None]:
# out-of-fold NN predictions : stack the val fold predictions of every fold
# (third item of each entry) ; they are the NN features of the stacking model
df_nn_pred = pd.concat(
    [pred_val for _, _, pred_val, _ in nn_models.values()])
df_nn_pred.head(5)

# Stacking : LGBM + NN

## Train dataset

In [None]:
# labels : attach sig_id to Y_train, then order the rows by sig_id
Y_train['sig_id'] = df_compound_train['sig_id'].copy()
Y_train_stack = Y_train.sort_values(by='sig_id')

# out-of-fold predictions of both models, ordered by sig_id
df_nn_pred = df_nn_pred.sort_values(by='sig_id')
df_lgbm_pred = df_lgbm_pred.sort_values(by='sig_id')

# stacked features : NN predictions joined with LGBM predictions on sig_id,
# ordered by sig_id like Y_train_stack
X_train_stack = (
    df_nn_pred
    .merge(df_lgbm_pred, how='left', on='sig_id', validate='one_to_one')
    .sort_values(by='sig_id')
)

In [None]:
# define features names
# (ordered comprehensions keep the column order stable from one run to the
# next, which a set difference does not)
features_stack = [col for col in X_train_stack.columns if col != 'sig_id']
print(len(features_stack))

# prediction columns of a single model ; stored under its own name so that the
# `features` list used by the LGBM / NN cells is not overwritten
features_lgbm = [col for col in df_lgbm_pred.columns if col != 'sig_id']
print(len(features_lgbm))

## Test set

In [None]:
# test set features of the stacking model : for each base model, average the
# test predictions of its folds (last item of each entry) per sig_id

# nn model
X_test_nn = (
    pd.concat([pred_test for *_, pred_test in nn_models.values()])
    .groupby('sig_id')
    .mean()
    .reset_index(col_fill='sig_id')
)
# lgbm model
X_test_lgbm = (
    pd.concat([pred_test for *_, pred_test in lgbm_models.values()])
    .groupby('sig_id')
    .mean()
    .reset_index(col_fill='sig_id')
)

# merge LGBM and NN features (NN on the left, as for the train stacking set)
X_test_stack = X_test_nn.merge(
    X_test_lgbm, how='left', on='sig_id', validate='one_to_one')

## Check LGBM and NN performance on val folds

In [None]:
# NN : validation metric averaged over the folds (second item of each entry)
np.mean([metric_val for _, metric_val, _, _ in nn_models.values()])

In [None]:
# LGBM : validation metric averaged over the folds (second item of each entry)
np.mean([metric_val for _, metric_val, _, _ in lgbm_models.values()])

## Train Logistic Regression on LGBM+NN features

In [None]:
# Train one logistic regression (one-vs-rest, one binary model per target) per
# fold on the stacked LGBM + NN predictions.
n_splits = 5
skf = KFold(n_splits=n_splits, random_state=1, shuffle=True)

# per fold : [metric on train fold, metric on val fold,
#             predictions on val fold, predictions on test set]
models_stack = {}

for n_fold, (train_index, test_index) in enumerate(skf.split(X_train_stack.values, Y_train_stack.values)):
    # Features, labels and ids must all come from the frames ordered by sig_id
    # (X_train_stack / Y_train_stack) : Y_train and X_train are still in their
    # original row order, so indexing them here would pair each row of
    # features with the labels / id of another sample.
    X_train_fold = X_train_stack[features_stack].values[train_index]
    Y_train_fold = Y_train_stack[scored_targets].values[train_index]
    X_val_fold = X_train_stack[features_stack].values[test_index]
    Y_val_fold = Y_train_stack[scored_targets].values[test_index]

    # get model
    clf_rl = OneVsRestClassifier(LogisticRegression(n_jobs=-1))

    # train
    print('Train fold : {}'.format(n_fold + 1))
    clf_rl.fit(X=X_train_fold,
               y=Y_train_fold)

    # evaluate on train fold
    Y_pred_train_fold = clf_rl.predict_proba(X_train_fold)
    metric_train_fold = moa_metric(Y_train_fold, Y_pred_train_fold)
    print('Metric on train fold : {}'.format(metric_train_fold))

    # predict and evaluate on val fold
    Y_pred_val_fold = clf_rl.predict_proba(X_val_fold)
    metric_val_fold = moa_metric(Y_val_fold, Y_pred_val_fold)
    Y_pred_val_fold = pd.DataFrame(Y_pred_val_fold, columns=scored_targets)
    Y_pred_val_fold['sig_id'] = X_train_stack['sig_id'].values[test_index]
    print('Metric on validation fold : {}'.format(metric_val_fold))

    # predict on test set
    Y_pred_test_fold = clf_rl.predict_proba(X_test_stack[features_stack])
    Y_pred_test_fold = pd.DataFrame(Y_pred_test_fold, columns=scored_targets)
    Y_pred_test_fold['sig_id'] = X_test_stack['sig_id'].values

    # keep predictions and metric
    models_stack['fold_{}'.format(n_fold + 1)] = [metric_train_fold,
                                                  metric_val_fold,
                                                  Y_pred_val_fold[[
                                                      'sig_id'] + scored_targets],
                                                  Y_pred_test_fold[['sig_id'] + scored_targets]]

In [None]:
# stacking : train metric averaged over the folds (first item of each entry)
np.mean([metric_train for metric_train, *_ in models_stack.values()])

# Postprocess prediction

In [None]:
# concatenate and mean prediction on test dataset
# (one test prediction per stacking fold, averaged per sig_id)
pred_compound = pd.concat([stack_model[3]
                           for stack_model in models_stack.values()])
pred_compound = pred_compound.groupby('sig_id').mean()
pred_compound = pred_compound.reset_index(col_fill='sig_id')

# add control prediction (equal 0 !) to compound prediction
Y_pred_control = np.zeros((df_control_test.shape[0], len(scored_targets)))

# The control frame is built from the numeric array and sig_id is inserted
# afterwards as the first column : concatenating the ids (strings) with the
# zeros in a single numpy array would turn every column into object dtype.
pred_control = pd.DataFrame(Y_pred_control, columns=scored_targets)
pred_control.insert(0, 'sig_id', df_control_test['sig_id'].values)

# merge compound pred and control pred
df_pred = pd.concat([pred_compound, pred_control], axis=0)

# write submission file
df_pred.to_csv('submission.csv', index=False)

# Plot results

In [None]:
# grouped bar chart : mean train / val metric over the folds for each model
models = ['LGBM', 'NN', 'Stacking']
# fold results in the same order as the labels above
folds_by_model = [lgbm_models, nn_models, models_stack]
# item 0 of a fold entry is the train metric, item 1 the val metric
train = [np.mean([fold[0] for fold in folds.values()])
         for folds in folds_by_model]
val = [np.mean([fold[1] for fold in folds.values()])
       for folds in folds_by_model]
# placeholder values : no test metric is computed in this notebook
test = [0, 0, 0]

x = np.arange(len(models))  # the label locations
width = 1 / len(train)  # the width of the bars

fig, ax = plt.subplots(figsize=(8, 5))
# three bars per model : train on the left, val centred, test on the right
rects1 = ax.bar(x - width, train, width, label='train')
rects2 = ax.bar(x, val, width, label='val')
rects3 = ax.bar(x + width, test, width, label='test')

ax.set_ylabel('loss')
ax.set_title('log loss for train, val and test dataset')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()