In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
pd.set_option('display.max_columns', 500)

In [66]:
os.listdir('Raw')

['CTNNB1.csv',
 'LEF1.csv',
 'Jnk.csv',
 'Erk.csv',
 'IRF4.csv',
 'MYC.csv',
 'Ikk2.csv']

In [3]:
from utils import load_fold
train, test, test_fold = load_fold('Raw', 5)

Test fold: MYC.csv
Train fold: CTNNB1.csv
Train fold: LEF1.csv
Train fold: Jnk.csv
Train fold: Erk.csv
Train fold: IRF4.csv
Train fold: Ikk2.csv


In [8]:
from utils import outer_product
from utils import group_data
X_train, y_train = group_data(outer_product(train))
X_test, y_test = group_data(outer_product(test))

In [6]:
X_train.shape, X_test.shape

((118263, 6, 81), (10667, 6, 81))

In [21]:
from utils import sklearn_train_test_split
X_tra, X_val, y_tra, y_val = sklearn_train_test_split(
                X_train, y_train, 0.1, random_state=42,
                shuffle=True, stratify=y_train)

In [22]:
X_tra.shape, X_val.shape, y_tra.shape, y_val.shape

((106436, 6, 81), (11827, 6, 81), (106436, 6), (11827, 6))

## CNN

In [26]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import GlobalAveragePooling2D
from keras.layers import MaxPooling2D
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import BatchNormalization
from keras.layers import Activation

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [27]:
def create_model():
    model = Sequential()

    model.add(Conv2D(16, (3, 3), padding='same', input_shape=(9, 9, 1)))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Conv2D(32, (3, 3), strides=2))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Conv2D(32, (1, 1)))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    
    model.add(Flatten())
    
    model.add(Dense(32))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
       
#     model.add(GlobalAveragePooling2D())

    model.add(Dense(1, activation='sigmoid'))
    
    adam = Adam()
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

In [28]:
create_model().summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 9, 9, 16)          160       
_________________________________________________________________
activation_1 (Activation)    (None, 9, 9, 16)          0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 9, 9, 16)          64        
_________________________________________________________________
dropout_1 (Dropout)          (None, 9, 9, 16)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 4, 4, 32)          4640      
_________________________________________________________________
activation_2 (Activation)    (None, 4, 4, 32)          0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 4, 4, 32)          128       
__________

In [29]:
model = KerasClassifier(create_model)

In [31]:
from keras.callbacks import ModelCheckpoint
checkpointer = ModelCheckpoint(filepath='_data/model.best.hdf5',
                               verbose=1, save_best_only=True)

In [30]:
X_tra.shape, X_val.shape, y_tra.shape, y_val.shape

((106436, 6, 81), (11827, 6, 81), (106436, 6), (11827, 6))

In [35]:
sum(y_tra), sum(y_val)

(array([1827, 1827, 1827, 1827, 1827, 1827]),
 array([203, 203, 203, 203, 203, 203]))

In [36]:
from utils import get_random_batch
from imblearn.under_sampling import RandomUnderSampler

In [42]:
rus = RandomUnderSampler(random_state=0)
X_train_batch, y_train_batch = get_random_batch(X_tra, y_tra, rus)
X_val_batch, y_val_batch = get_random_batch(X_val, y_val, rus)

In [44]:
X_train_batch.shape, X_tra.shape

((3654, 6, 81), (106436, 6, 81))

In [43]:
history = model.fit(X_train_batch.reshape(-1, 9, 9, 1), 
                    y_train_batch.reshape(-1), 
                     validation_data=(X_val_batch.reshape(-1, 9, 9, 1),
                                      y_val_batch.reshape(-1)),
                     batch_size=128, epochs=100, verbose=1, 
                     callbacks=[checkpointer], shuffle=True)

Train on 21924 samples, validate on 2436 samples
Epoch 1/100
Epoch 00001: val_loss did not improve
Epoch 2/100
Epoch 00002: val_loss did not improve
Epoch 3/100
Epoch 00003: val_loss did not improve
Epoch 4/100
Epoch 00004: val_loss did not improve
Epoch 5/100
Epoch 00005: val_loss did not improve
Epoch 6/100
Epoch 00006: val_loss did not improve
Epoch 7/100
Epoch 00007: val_loss did not improve
Epoch 8/100
Epoch 00008: val_loss did not improve
Epoch 9/100
Epoch 00009: val_loss did not improve
Epoch 10/100
Epoch 00010: val_loss did not improve
Epoch 11/100
Epoch 00011: val_loss did not improve
Epoch 12/100
Epoch 00012: val_loss did not improve
Epoch 13/100
Epoch 00013: val_loss did not improve
Epoch 14/100
Epoch 00014: val_loss did not improve
Epoch 15/100
Epoch 00015: val_loss did not improve
Epoch 16/100
Epoch 00016: val_loss did not improve
Epoch 17/100
Epoch 00017: val_loss did not improve
Epoch 18/100
Epoch 00018: val_loss did not improve
Epoch 19/100
Epoch 00019: val_loss did not

Epoch 00032: val_loss did not improve
Epoch 33/100
Epoch 00033: val_loss did not improve
Epoch 34/100
Epoch 00034: val_loss did not improve
Epoch 35/100
Epoch 00035: val_loss did not improve
Epoch 36/100
Epoch 00036: val_loss did not improve
Epoch 37/100
Epoch 00037: val_loss did not improve
Epoch 38/100
Epoch 00038: val_loss did not improve
Epoch 39/100
Epoch 00039: val_loss did not improve
Epoch 40/100
Epoch 00040: val_loss did not improve
Epoch 41/100
Epoch 00041: val_loss did not improve
Epoch 42/100
Epoch 00042: val_loss did not improve
Epoch 43/100
Epoch 00043: val_loss did not improve
Epoch 44/100
Epoch 00044: val_loss did not improve
Epoch 45/100
Epoch 00045: val_loss did not improve
Epoch 46/100
Epoch 00046: val_loss did not improve
Epoch 47/100
Epoch 00047: val_loss did not improve
Epoch 48/100
Epoch 00048: val_loss did not improve
Epoch 49/100
Epoch 00049: val_loss did not improve
Epoch 50/100
Epoch 00050: val_loss did not improve
Epoch 51/100
Epoch 00051: val_loss did not i

Epoch 64/100
Epoch 00064: val_loss did not improve
Epoch 65/100
Epoch 00065: val_loss did not improve
Epoch 66/100
Epoch 00066: val_loss did not improve
Epoch 67/100
Epoch 00067: val_loss did not improve
Epoch 68/100
Epoch 00068: val_loss did not improve
Epoch 69/100
Epoch 00069: val_loss did not improve
Epoch 70/100
Epoch 00070: val_loss did not improve
Epoch 71/100
Epoch 00071: val_loss did not improve
Epoch 72/100
Epoch 00072: val_loss did not improve
Epoch 73/100
Epoch 00073: val_loss did not improve
Epoch 74/100
Epoch 00074: val_loss did not improve
Epoch 75/100
Epoch 00075: val_loss did not improve
Epoch 76/100
Epoch 00076: val_loss did not improve
Epoch 77/100
Epoch 00077: val_loss did not improve
Epoch 78/100
Epoch 00078: val_loss did not improve
Epoch 79/100
Epoch 00079: val_loss did not improve
Epoch 80/100
Epoch 00080: val_loss did not improve
Epoch 81/100
Epoch 00081: val_loss did not improve
Epoch 82/100
Epoch 00082: val_loss did not improve
Epoch 83/100
Epoch 00083: val_l

Epoch 96/100
Epoch 00096: val_loss did not improve
Epoch 97/100
Epoch 00097: val_loss did not improve
Epoch 98/100
Epoch 00098: val_loss did not improve
Epoch 99/100
Epoch 00099: val_loss did not improve
Epoch 100/100
Epoch 00100: val_loss did not improve


In [54]:
y_pred = model.predict(X_val_batch.reshape(-1, 9, 9, 1))
y_pred_proba = model.predict_proba(X_val_batch.reshape(-1, 9, 9, 1))

In [55]:
from utils import evaluate_metrics

In [58]:
evaluate_metrics(y_val_batch.reshape(-1), y_pred, y_pred_proba[:,1])

Confusion matrix:
[[ 723  495]
 [  93 1125]]
  Model Fold   ROC-AUC    G-mean  F1-Score       TPR       TNR  Accuracy  \
0             0.837806  0.740454  0.792812  0.923645  0.593596  0.758621   

   Precision    Recall   Logloss Description  
0   0.694444  0.923645  0.490929              


Unnamed: 0,Model,Fold,ROC-AUC,G-mean,F1-Score,TPR,TNR,Accuracy,Precision,Recall,Logloss,Description
0,,,0.837806,0.740454,0.792812,0.923645,0.593596,0.758621,0.694444,0.923645,0.490929,


In [59]:
X_test.shape, y_test.shape

((10667, 6, 81), (10667, 6))

In [61]:
y_pred = model.predict(X_test.reshape(-1, 9, 9, 1))
y_pred_proba = model.predict_proba(X_test.reshape(-1, 9, 9, 1))

In [63]:
evaluate_metrics(y_test.reshape(-1), y_pred, y_pred_proba[:,1])

Confusion matrix:
[[16535 41869]
 [ 1135  4463]]
  Model Fold   ROC-AUC    G-mean  F1-Score       TPR       TNR  Accuracy  \
0             0.633002  0.475092  0.171885  0.797249  0.283114  0.328083   

   Precision    Recall   Logloss Description  
0   0.096327  0.797249  0.905357              


Unnamed: 0,Model,Fold,ROC-AUC,G-mean,F1-Score,TPR,TNR,Accuracy,Precision,Recall,Logloss,Description
0,,,0.633002,0.475092,0.171885,0.797249,0.283114,0.328083,0.096327,0.797249,0.905357,


In [64]:
test_fold

'MYC.csv'

## LEF1

In [67]:
from utils import load_fold
train, test, test_fold = load_fold('Raw', 1)

Test fold: LEF1.csv
Train fold: CTNNB1.csv
Train fold: Jnk.csv
Train fold: Erk.csv
Train fold: IRF4.csv
Train fold: MYC.csv
Train fold: Ikk2.csv


In [68]:
from utils import outer_product
from utils import group_data
X_train, y_train = group_data(outer_product(train))
X_test, y_test = group_data(outer_product(test))

In [69]:
from utils import sklearn_train_test_split
X_tra, X_val, y_tra, y_val = sklearn_train_test_split(
                X_train, y_train, 0.1, random_state=42,
                shuffle=True, stratify=y_train)

In [70]:
model = KerasClassifier(create_model)

In [71]:
from keras.callbacks import ModelCheckpoint
checkpointer = ModelCheckpoint(filepath='_data/model.best.hdf5',
                               verbose=1, save_best_only=True)

In [72]:
rus = RandomUnderSampler(random_state=0)
X_train_batch, y_train_batch = get_random_batch(X_tra, y_tra, rus)
X_val_batch, y_val_batch = get_random_batch(X_val, y_val, rus)

In [73]:
X_train_batch.shape, X_tra.shape

((5294, 6, 81), (105826, 6, 81))

In [75]:
history = model.fit(X_train_batch.reshape(-1, 9, 9, 1), 
                    y_train_batch.reshape(-1), 
                     validation_data=(X_val_batch.reshape(-1, 9, 9, 1),
                                      y_val_batch.reshape(-1)),
                     batch_size=128, epochs=100, verbose=1, 
                     callbacks=[checkpointer], shuffle=True)

Train on 31764 samples, validate on 3528 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.70187, saving model to _data/model.best.hdf5
Epoch 2/100
Epoch 00002: val_loss improved from 0.70187 to 0.69145, saving model to _data/model.best.hdf5
Epoch 3/100
Epoch 00003: val_loss improved from 0.69145 to 0.66952, saving model to _data/model.best.hdf5
Epoch 4/100
Epoch 00004: val_loss improved from 0.66952 to 0.63203, saving model to _data/model.best.hdf5
Epoch 5/100
Epoch 00005: val_loss improved from 0.63203 to 0.61211, saving model to _data/model.best.hdf5
Epoch 6/100
Epoch 00006: val_loss improved from 0.61211 to 0.57782, saving model to _data/model.best.hdf5
Epoch 7/100
Epoch 00007: val_loss improved from 0.57782 to 0.57282, saving model to _data/model.best.hdf5
Epoch 8/100
Epoch 00008: val_loss improved from 0.57282 to 0.57145, saving model to _data/model.best.hdf5
Epoch 9/100
Epoch 00009: val_loss improved from 0.57145 to 0.55621, saving model to _data/model.best.hdf5
E

Epoch 28/100
Epoch 00028: val_loss did not improve
Epoch 29/100
Epoch 00029: val_loss improved from 0.47947 to 0.47504, saving model to _data/model.best.hdf5
Epoch 30/100
Epoch 00030: val_loss improved from 0.47504 to 0.47380, saving model to _data/model.best.hdf5
Epoch 31/100
Epoch 00031: val_loss did not improve
Epoch 32/100
Epoch 00032: val_loss improved from 0.47380 to 0.47213, saving model to _data/model.best.hdf5
Epoch 33/100
Epoch 00033: val_loss did not improve
Epoch 34/100
Epoch 00034: val_loss did not improve
Epoch 35/100
Epoch 00035: val_loss did not improve
Epoch 36/100
Epoch 00036: val_loss did not improve
Epoch 37/100
Epoch 00037: val_loss improved from 0.47213 to 0.47051, saving model to _data/model.best.hdf5
Epoch 38/100
Epoch 00038: val_loss did not improve
Epoch 39/100
Epoch 00039: val_loss did not improve
Epoch 40/100
Epoch 00040: val_loss did not improve
Epoch 41/100
Epoch 00041: val_loss did not improve
Epoch 42/100
Epoch 00042: val_loss improved from 0.47051 to 0.

Epoch 58/100
Epoch 00058: val_loss did not improve
Epoch 59/100
Epoch 00059: val_loss did not improve
Epoch 60/100
Epoch 00060: val_loss did not improve
Epoch 61/100
Epoch 00061: val_loss did not improve
Epoch 62/100
Epoch 00062: val_loss did not improve
Epoch 63/100
Epoch 00063: val_loss did not improve
Epoch 64/100
Epoch 00064: val_loss improved from 0.46782 to 0.46707, saving model to _data/model.best.hdf5
Epoch 65/100
Epoch 00065: val_loss did not improve
Epoch 66/100
Epoch 00066: val_loss improved from 0.46707 to 0.46680, saving model to _data/model.best.hdf5
Epoch 67/100
Epoch 00067: val_loss did not improve
Epoch 68/100
Epoch 00068: val_loss did not improve
Epoch 69/100
Epoch 00069: val_loss did not improve
Epoch 70/100
Epoch 00070: val_loss improved from 0.46680 to 0.46651, saving model to _data/model.best.hdf5
Epoch 71/100
Epoch 00071: val_loss improved from 0.46651 to 0.46613, saving model to _data/model.best.hdf5
Epoch 72/100
Epoch 00072: val_loss did not improve
Epoch 73/10

Epoch 00088: val_loss did not improve
Epoch 89/100
Epoch 00089: val_loss did not improve
Epoch 90/100
Epoch 00090: val_loss did not improve
Epoch 91/100
Epoch 00091: val_loss did not improve
Epoch 92/100
Epoch 00092: val_loss did not improve
Epoch 93/100
Epoch 00093: val_loss did not improve
Epoch 94/100
Epoch 00094: val_loss did not improve
Epoch 95/100
Epoch 00095: val_loss improved from 0.46538 to 0.46442, saving model to _data/model.best.hdf5
Epoch 96/100
Epoch 00096: val_loss did not improve
Epoch 97/100
Epoch 00097: val_loss did not improve
Epoch 98/100
Epoch 00098: val_loss did not improve
Epoch 99/100
Epoch 00099: val_loss did not improve
Epoch 100/100
Epoch 00100: val_loss did not improve


In [76]:
y_pred = model.predict(X_val_batch.reshape(-1, 9, 9, 1))
y_pred_proba = model.predict_proba(X_val_batch.reshape(-1, 9, 9, 1))
evaluate_metrics(y_val_batch.reshape(-1), y_pred, y_pred_proba[:,1])

Confusion matrix:
[[1130  634]
 [ 169 1595]]
  Model Fold   ROC-AUC    G-mean  F1-Score       TPR      TNR  Accuracy  \
0             0.856242  0.761064  0.798898  0.904195  0.64059  0.772392   

   Precision    Recall   Logloss Description  
0   0.715568  0.904195  0.464701              


Unnamed: 0,Model,Fold,ROC-AUC,G-mean,F1-Score,TPR,TNR,Accuracy,Precision,Recall,Logloss,Description
0,,,0.856242,0.761064,0.798898,0.904195,0.64059,0.772392,0.715568,0.904195,0.464701,


In [77]:
y_pred = model.predict(X_test.reshape(-1, 9, 9, 1))
y_pred_proba = model.predict_proba(X_test.reshape(-1, 9, 9, 1))
evaluate_metrics(y_test.reshape(-1), y_pred, y_pred_proba[:,1])

Confusion matrix:
[[36872 31066]
 [   33    99]]
  Model Fold  ROC-AUC    G-mean  F1-Score   TPR      TNR  Accuracy  Precision  \
0             0.68041  0.638003  0.006326  0.75  0.54273  0.543132   0.003177   

   Recall   Logloss Description  
0    0.75  0.842144              


Unnamed: 0,Model,Fold,ROC-AUC,G-mean,F1-Score,TPR,TNR,Accuracy,Precision,Recall,Logloss,Description
0,,,0.68041,0.638003,0.006326,0.75,0.54273,0.543132,0.003177,0.75,0.842144,


In [None]:
test_fold