In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics

import random
random.seed(7)


def generator(X, Y, batch_size=32, train=True):
    while True:
        for offset in range(0, len(X), batch_size):
            X_batch = np.stack(X[offset:offset+batch_size], axis=0)
            Y_batch = np.stack(Y[offset:offset+batch_size], axis=0)

            Y_batch_ = np.empty((Y_batch.shape[0], 2,2,20))
            for m in range(Y_batch.shape[0]):
                for i in range(20):
                    Y_batch_[m, :,:,i] = [[np.sum(Y_batch[m, :10, :10, i]), np.sum(Y_batch[m, :10, 10:, i])], [np.sum(Y_batch[m, 10:, :10, i]), np.sum(Y_batch[m, 10:, 10:, i])]]
            Y_batch_[Y_batch_ > 0] = 1

            if train:
                Y_f = np.array([Y_batch_[e].flatten() for e in range(Y_batch_.shape[0])])
                yield (X_batch, Y_f)
            else:
                yield X_batch

structure_ids = []
for line in open('./structures lists/structures human.txt', 'r'):
    line = line.strip('\n')
    structure_ids.append(line)
# for line in open('./structures lists/structures ecoli.txt', 'r'):
#     line = line.strip('\n')
#     structure_ids.append(line)
structure_ids.remove('4pkd')
structure_ids.remove('1a9n')
structure_ids.remove('2adc')
random.shuffle(structure_ids)
print(len(structure_ids))

X_train = []
X_test = []
Y_train = []
Y_test = []
num_aa_train = 0
num_aa_test = 0
num_train = int(len(structure_ids)*0.7)
for i, structure_id in enumerate(structure_ids):
    protein = np.load('../data/voxelized data 20x20x20/' + structure_id + '_protein.npy', mmap_mode='r')
    rna = np.load('../data/voxelized data 20x20x20/' + structure_id + '_rna_3D.npy', mmap_mode='r')
    na = 0
    pos = 0
    while (np.sum(rna[na]) > 0) and (na < len(rna)-1):
        pos +=1
        na +=1
    

    if i <= num_train:
        if pos > len(rna)/2:
            X_train.extend(protein[:, :, :, :, :3])
            Y_train.extend(rna)
            num_aa_train +=len(rna)
        else:
            X_train.extend(protein[:pos, :, :, :, :3])
            X_train.extend(protein[-pos:, :, :, :, :3])
            Y_train.extend(rna[:pos])
            Y_train.extend(rna[-pos:])
            num_aa_train +=2*pos
    else:
        if pos > len(rna)/2:
            X_test.extend(protein[:, :, :, :, :3])
            Y_test.extend(rna)
            num_aa_test +=len(rna)
        else:
            X_test.extend(protein[:pos, :, :, :, :3])
            X_test.extend(protein[-pos:, :, :, :, :3])
            Y_test.extend(rna[:pos])
            Y_test.extend(rna[-pos:])
            num_aa_test +=2*pos

Y_test = np.stack(Y_test, axis=0)
Y_test_ = np.empty((len(Y_test), 2,2,20))
for m in range(len(Y_test)):
    for i in range(20):
        Y_test_[m, :,:,i] = [[np.sum(Y_test[m, :10, :10, i]), np.sum(Y_test[m, :10, 10:, i])], [np.sum(Y_test[m, 10:, :10, i]), np.sum(Y_test[m, 10:, 10:, i])]]
Y_test_[Y_test_ > 0] = 1
   
n_steps_train = int(num_aa_train/400) 
n_steps_test = int(num_aa_test/400)

print(num_aa_train, num_aa_test)

generator_train = generator(X_train, Y_train, 400, True)
generator_test = generator(X_test, Y_test, 400, False)

ins = tf.keras.layers.Input((20, 20, 20, 3))
con1 = tf.keras.layers.Conv3D(filters=64, kernel_size=(3, 3, 3), padding='same', activation='relu')(ins)
con2 = tf.keras.layers.Conv3D(filters=32, kernel_size=(3, 3, 3), padding='same', activation='relu')(con1)
con3 = tf.keras.layers.Conv3D(filters=32, kernel_size=(3, 3, 3), padding='same', activation='relu')(con2)
maxp1 = tf.keras.layers.MaxPool3D(pool_size=(2, 2, 2))(con3)
con4 = tf.keras.layers.Conv3D(filters=32, kernel_size=(3, 3, 3), padding='same', activation='relu')(maxp1)
con5 = tf.keras.layers.Conv3D(filters=16, kernel_size=(3, 3, 3), padding='same', activation='relu')(con4)
con6 = tf.keras.layers.Conv3D(filters=16, kernel_size=(3, 3, 3), padding='same', activation='relu')(con5)
maxp2 = tf.keras.layers.MaxPool3D(pool_size=(2, 2, 2))(con6)
con7 = tf.keras.layers.Conv3D(filters=16, kernel_size=(3, 3, 3), padding='same', activation='relu')(maxp2)
con8 = tf.keras.layers.Conv3D(filters=8, kernel_size=(3, 3, 3), padding='same', activation='relu')(con7)
con9 = tf.keras.layers.Conv3D(filters=4, kernel_size=(3, 3, 3), padding='same', activation='relu')(con8)
maxp3 = tf.keras.layers.MaxPool3D(pool_size=(2, 2, 2))(con9)
batch = tf.keras.layers.BatchNormalization()(maxp3)
flat = tf.keras.layers.Flatten()(batch)
dens2 = tf.keras.layers.Dense(units=256, activation='relu')(flat)
drop2 = tf.keras.layers.Dropout(0.6)(dens2)
outs = tf.keras.layers.Dense(units=80, activation='sigmoid')(drop2)
model = tf.keras.models.Model(inputs=ins, outputs=outs)
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(lr=0.00001), metrics=['accuracy'])

model.summary()

# checkpoint
# filepath="weights_best.hdf5"
# checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, mode='max')
# callbacks_list = [checkpoint]

# model.fit(X_train, Y_train_f, validation_split=0.33, epochs=1, batch_size=200, callbacks=callbacks_list, verbose=0)
model.fit_generator(generator_train, steps_per_epoch=n_steps_train, epochs=100, callbacks=None, verbose=1, max_queue_size=2)

# model_best = model
# model_best.load_weights("weights_best.hdf5")
# print(model.evaluate(X_test, Y_test, verbose=0, batch_size=100))
# model_best.save('model_cnn_15_2.h5')
# Y_pred = model_best.predict(X_test, batch_size=200)
Y_pred = model.predict_generator(generator_test, steps=n_steps_test)
print(Y_pred.shape)
Y_pred_ = np.array([Y_pred[i].reshape((2,2,20)) for i in range(Y_pred.shape[0])])

#CNN
Y_pred_[Y_pred_ >= 0.5] = 1
Y_pred_[Y_pred_ < 0.5] = 0

print(Y_pred_.shape)
print(Y_test_.shape)
Y_test_ = Y_test_[:Y_pred_.shape[0]]

print('CNN: \n')
for i in range(20):
    confusion_matrix = [sklearn.metrics.confusion_matrix(Y_test_[:,l , c, i], Y_pred_[:,l , c, i]) for l in range(2) for c in range(2)]
    accuracy = [np.sum(np.trace(cm))/np.sum(cm) for cm in confusion_matrix]
    auc = [sklearn.metrics.roc_auc_score(Y_test_[:,l , c, i], Y_pred_[:,l , c, i]) for l in range(2) for c in range(2)]

    print(f'level {i}')
    for q in range(len(confusion_matrix)):
        print(confusion_matrix[q], np.round(accuracy[q], 2), np.round(auc[q], 2))

# baseline model
# predict all zeros; at least 50% correct predictions because there are 1/2 of negative examples
Y_pred_base = np.zeros(Y_test_.shape)

# po = np.sum(Y_train, axis=0)/Y_train.shape[0]
# po[po >= 0.5] = 1
# po[po < 0.5] = 0
# Y_pred_base = np.tile(po, (Y_test.shape[0],1))

Y_pred_base[Y_pred_base >= 0.5] = 1
Y_pred_base[Y_pred_base < 0.5] = 0
print(Y_pred_base.shape)
print(f'\n BASELINE MODEL: \n')
for i in range(20):
    confusion_matrix_base = [sklearn.metrics.confusion_matrix(Y_test_[:,l, c, i], Y_pred_base[:,l, c, i]) for l in range(2) for c in range(2)]
    accuracy_base = [np.sum(np.trace(cm))/np.sum(cm) for cm in confusion_matrix_base]
    auc_base = [sklearn.metrics.roc_auc_score(Y_test_[:,l, c, i], Y_pred_base[:,l, c, i]) for l in range(2) for c in range(2)]
    
    print(f'level {i}')
    for q in range(len(confusion_matrix_base)):
        print(confusion_matrix_base[q], np.round(accuracy_base[q], 2), np.round(auc_base[q], 2))

#random model
Y_pred_random = np.random.random(Y_test_.shape)
Y_pred_random[Y_pred_random >= 0.5] = 1
Y_pred_random[Y_pred_random < 0.5] = 0

print(f'\n RANDOM MODEL: \n')
for i in range(20):
    confusion_matrix_random = [sklearn.metrics.confusion_matrix(Y_test_[:,l, c, i], Y_pred_random[:,l, c, i]) for l in range(2) for c in range(2)]
    accuracy_random = [np.sum(np.trace(cm))/np.sum(cm) for cm in confusion_matrix_random]
    auc_random = [sklearn.metrics.roc_auc_score(Y_test_[:,l, c, i], Y_pred_random[:,l, c, i]) for l in range(2) for c in range(2)]

    print(f'level {i}')
    for q in range(len(confusion_matrix_random)):
        print(confusion_matrix_random[q], np.round(accuracy_random[q], 2), np.round(auc_random[q], 2))

230
83862 44057
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 20, 20, 20, 3)     0         
_________________________________________________________________
conv3d_9 (Conv3D)            (None, 20, 20, 20, 64)    5248      
_________________________________________________________________
conv3d_10 (Conv3D)           (None, 20, 20, 20, 32)    55328     
_________________________________________________________________
conv3d_11 (Conv3D)           (None, 20, 20, 20, 32)    27680     
_________________________________________________________________
max_pooling3d_3 (MaxPooling3 (None, 10, 10, 10, 32)    0         
_________________________________________________________________
conv3d_12 (Conv3D)           (None, 10, 10, 10, 32)    27680     
_________________________________________________________________
conv3d_13 (Conv3D)           (None, 10, 10, 10, 16)    13840

Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
(44000, 80)
(44000, 2, 2, 20)
(44057, 2, 2, 20)
CNN: 

level 0
[[40273     0]
 [ 3727     0]] 0.92 0.5
[[39956     0]
 [ 4044     0]] 0.91 0.5
[[40296     0]
 [ 3704     0]] 0.92 0.5
[[40042     0]
 [ 3958     0]] 0.91 0.5
level 1
[[40006     0]
 [ 3994     0]] 0.91 0.5
[[39813     0]
 [ 4187     0]] 0.9 0.5
[[40134     0]
 [ 3866     0]] 0.91 0.5
[[39935     0]
 [ 4065     0]] 0.91 0.5
level 2
[[39877     0]
 

level 1
[[40006     0]
 [ 3994     0]] 0.91 0.5
[[39813     0]
 [ 4187     0]] 0.9 0.5
[[40134     0]
 [ 3866     0]] 0.91 0.5
[[39935     0]
 [ 4065     0]] 0.91 0.5
level 2
[[39877     0]
 [ 4123     0]] 0.91 0.5
[[39762     0]
 [ 4238     0]] 0.9 0.5
[[40074     0]
 [ 3926     0]] 0.91 0.5
[[39866     0]
 [ 4134     0]] 0.91 0.5
level 3
[[39721     0]
 [ 4279     0]] 0.9 0.5
[[39649     0]
 [ 4351     0]] 0.9 0.5
[[40036     0]
 [ 3964     0]] 0.91 0.5
[[39787     0]
 [ 4213     0]] 0.9 0.5
level 4
[[39577     0]
 [ 4423     0]] 0.9 0.5
[[39518     0]
 [ 4482     0]] 0.9 0.5
[[39916     0]
 [ 4084     0]] 0.91 0.5
[[39720     0]
 [ 4280     0]] 0.9 0.5
level 5
[[39537     0]
 [ 4463     0]] 0.9 0.5
[[39380     0]
 [ 4620     0]] 0.9 0.5
[[39719     0]
 [ 4281     0]] 0.9 0.5
[[39626     0]
 [ 4374     0]] 0.9 0.5
level 6
[[39312     0]
 [ 4688     0]] 0.89 0.5
[[39336     0]
 [ 4664     0]] 0.89 0.5
[[39603     0]
 [ 4397     0]] 0.9 0.5
[[39499     0]
 [ 4501     0]] 0.9 0.5
level 