In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics

import random
random.seed(7)


structure_ids = []
for line in open('./structures lists/structures human.txt', 'r'):
    line = line.strip('\n')
    structure_ids.append(line)
# for line in open('./structures lists/structures ecoli.txt', 'r'):
#     line = line.strip('\n')
#     structure_ids.append(line)
structure_ids.remove('4pkd')
structure_ids.remove('1a9n')
structure_ids.remove('2adc')
random.shuffle(structure_ids)
print(len(structure_ids))


X_train = []
X_test = []
Y_train = []
Y_test = []
num_train = int(len(structure_ids)*0.7)
for i, structure_id in enumerate(structure_ids):
    protein = np.load('../data/voxelized data 20x20x20/' + structure_id + '_protein.npy', mmap_mode='r')
    rna = np.load('../data/voxelized data 20x20x20/' + structure_id + '_rna_3D.npy', mmap_mode='r')
    if i <= num_train:
        X_train.append(protein[:20])
        X_train.append(protein[-20:])
        Y_train.append(rna[:20])
        Y_train.append(rna[-20:])
    else:
        X_test.append(protein[:20])
        X_test.append(protein[-20:])
        Y_test.append(rna[:20])
        Y_test.append(rna[-20:])

X_train = np.concatenate(X_train)
Y_train = np.concatenate(Y_train)

X_test = np.concatenate(X_test)
Y_test = np.concatenate(Y_test)

Y_train_ = np.empty((Y_train.shape[0], 2,2,20))
for m in range(Y_train.shape[0]):
    for i in range(20):
        Y_train_[m, :,:,i] = [[np.sum(Y_train[m, :10, :10, i]), np.sum(Y_train[m, :10, 10:, i])], [np.sum(Y_train[m, 10:, :10, i]), np.sum(Y_train[m, 10:, 10:, i])]]
Y_train_[Y_train_ > 0] = 1

Y_test_ = np.empty((Y_test.shape[0], 2,2,20))
for m in range(Y_test.shape[0]):
    for i in range(20):
        Y_test_[m, :,:,i] = [[np.sum(Y_test[m, :10, :10, i]), np.sum(Y_test[m, :10, 10:, i])], [np.sum(Y_test[m, 10:, :10, i]), np.sum(Y_test[m, 10:, 10:, i])]]
Y_test_[Y_test_ > 0] = 1

Y_train_f = np.array([Y_train_[e].flatten() for e in range(Y_train_.shape[0])])
#Y_test_f = np.array([Y_test_[e].flatten() for e in range(Y_test_.shape[0])])

print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)
print(Y_train_.shape, Y_test_.shape)
print(Y_train_f.shape)

ins = tf.keras.layers.Input((20, 20, 20, 4))
con1 = tf.keras.layers.Conv3D(filters=64, kernel_size=(3, 3, 3), padding='same', activation='relu')(ins)
con2 = tf.keras.layers.Conv3D(filters=32, kernel_size=(3, 3, 3), padding='same', activation='relu')(con1)
con3 = tf.keras.layers.Conv3D(filters=32, kernel_size=(3, 3, 3), padding='same', activation='relu')(con2)
maxp1 = tf.keras.layers.MaxPool3D(pool_size=(2, 2, 2))(con3)
con4 = tf.keras.layers.Conv3D(filters=32, kernel_size=(3, 3, 3), padding='same', activation='relu')(maxp1)
con5 = tf.keras.layers.Conv3D(filters=16, kernel_size=(3, 3, 3), padding='same', activation='relu')(con4)
con6 = tf.keras.layers.Conv3D(filters=16, kernel_size=(3, 3, 3), padding='same', activation='relu')(con5)
maxp2 = tf.keras.layers.MaxPool3D(pool_size=(2, 2, 2))(con6)
con7 = tf.keras.layers.Conv3D(filters=16, kernel_size=(3, 3, 3), padding='same', activation='relu')(maxp2)
con8 = tf.keras.layers.Conv3D(filters=8, kernel_size=(3, 3, 3), padding='same', activation='relu')(con7)
con9 = tf.keras.layers.Conv3D(filters=4, kernel_size=(3, 3, 3), padding='same', activation='relu')(con8)
maxp3 = tf.keras.layers.MaxPool3D(pool_size=(2, 2, 2))(con9)
batch = tf.keras.layers.BatchNormalization()(maxp3)
flat = tf.keras.layers.Flatten()(batch)
dens2 = tf.keras.layers.Dense(units=256, activation='relu')(flat)
drop2 = tf.keras.layers.Dropout(0.6)(dens2)
outs = tf.keras.layers.Dense(units=80, activation='sigmoid')(drop2)
model = tf.keras.models.Model(inputs=ins, outputs=outs)
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(lr=0.00001), metrics=['accuracy'])

model.summary()

# checkpoint
filepath="weights_best.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

model.fit(X_train, Y_train_f, validation_split=0.33, epochs=500, batch_size=200, callbacks=callbacks_list, verbose=0)

model_best = model
model_best.load_weights("weights_best.hdf5")
# print(model.evaluate(X_test, Y_test, verbose=0, batch_size=100))
model_best.save('model_cnn_15_2.h5')
Y_pred = model_best.predict(X_test, batch_size=200)

Y_pred_ = np.array([Y_pred[i].reshape((2,2,20)) for i in range(Y_pred.shape[0])])

#CNN
Y_pred_[Y_pred_ >= 0.5] = 1
Y_pred_[Y_pred_ < 0.5] = 0

print(Y_pred_.shape)

print('CNN: \n')
for i in range(20):
    confusion_matrix = [sklearn.metrics.confusion_matrix(Y_test_[:,l , c, i], Y_pred_[:,l , c, i]) for l in range(2) for c in range(2)]
    accuracy = [np.sum(np.trace(cm))/np.sum(cm) for cm in confusion_matrix]
    auc = [sklearn.metrics.roc_auc_score(Y_test_[:,l , c, i], Y_pred_[:,l , c, i]) for l in range(2) for c in range(2)]

    print(f'level {i}')
    for q in range(len(confusion_matrix)):
        print(confusion_matrix[q], np.round(accuracy[q], 2), np.round(auc[q], 2))

# baseline model
# predict all zeros; at least 50% correct predictions because there are 1/2 of negative examples
Y_pred_base = np.zeros(Y_test_.shape)

# po = np.sum(Y_train, axis=0)/Y_train.shape[0]
# po[po >= 0.5] = 1
# po[po < 0.5] = 0
# Y_pred_base = np.tile(po, (Y_test.shape[0],1))

Y_pred_base[Y_pred_base >= 0.5] = 1
Y_pred_base[Y_pred_base < 0.5] = 0
print(Y_pred_base.shape)
print(f'\n BASELINE MODEL: \n')
for i in range(20):
    confusion_matrix_base = [sklearn.metrics.confusion_matrix(Y_test_[:,l, c, i], Y_pred_base[:,l, c, i]) for l in range(2) for c in range(2)]
    accuracy_base = [np.sum(np.trace(cm))/np.sum(cm) for cm in confusion_matrix_base]
    auc_base = [sklearn.metrics.roc_auc_score(Y_test_[:,l, c, i], Y_pred_base[:,l, c, i]) for l in range(2) for c in range(2)]
    
    print(f'level {i}')
    for q in range(len(confusion_matrix_base)):
        print(confusion_matrix_base[q], np.round(accuracy_base[q], 2), np.round(auc_base[q], 2))

#random model
Y_pred_random = np.random.random(Y_test_.shape)
Y_pred_random[Y_pred_random >= 0.5] = 1
Y_pred_random[Y_pred_random < 0.5] = 0

print(f'\n RANDOM MODEL: \n')
for i in range(20):
    confusion_matrix_random = [sklearn.metrics.confusion_matrix(Y_test_[:,l, c, i], Y_pred_random[:,l, c, i]) for l in range(2) for c in range(2)]
    accuracy_random = [np.sum(np.trace(cm))/np.sum(cm) for cm in confusion_matrix_random]
    auc_random = [sklearn.metrics.roc_auc_score(Y_test_[:,l, c, i], Y_pred_random[:,l, c, i]) for l in range(2) for c in range(2)]

    print(f'level {i}')
    for q in range(len(confusion_matrix_random)):
        print(confusion_matrix_random[q], np.round(accuracy_random[q], 2), np.round(auc_random[q], 2))

230
(6480, 20, 20, 20, 4) (2720, 20, 20, 20, 4)
(6480, 20, 20, 20) (2720, 20, 20, 20)
(6480, 2, 2, 20) (2720, 2, 2, 20)
(6480, 80)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 20, 20, 20, 4)     0         
_________________________________________________________________
conv3d (Conv3D)              (None, 20, 20, 20, 64)    6976      
_________________________________________________________________
conv3d_1 (Conv3D)            (None, 20, 20, 20, 32)    55328     
_________________________________________________________________
conv3d_2 (Conv3D)            (None, 20, 20, 20, 32)    27680     
_________________________________________________________________
max_pooling3d (MaxPooling3D) (None, 10, 10, 10, 32)    0         
_________________________________________________________________
conv3d_3 (Conv3D)            (None, 10, 10, 10, 32)    27680     
___________

level 15
[[2069    0]
 [ 651    0]] 0.76 0.5
[[2045    0]
 [ 675    0]] 0.75 0.5
[[2098    0]
 [ 622    0]] 0.77 0.5
[[2114    0]
 [ 606    0]] 0.78 0.5
level 16
[[2097    0]
 [ 623    0]] 0.77 0.5
[[2071    0]
 [ 649    0]] 0.76 0.5
[[2133    0]
 [ 587    0]] 0.78 0.5
[[2128    0]
 [ 592    0]] 0.78 0.5
level 17
[[2096    0]
 [ 624    0]] 0.77 0.5
[[2106    0]
 [ 614    0]] 0.77 0.5
[[2150    0]
 [ 570    0]] 0.79 0.5
[[2145    0]
 [ 575    0]] 0.79 0.5
level 18
[[2115    0]
 [ 605    0]] 0.78 0.5
[[2115    0]
 [ 605    0]] 0.78 0.5
[[2195    0]
 [ 525    0]] 0.81 0.5
[[2154    0]
 [ 566    0]] 0.79 0.5
level 19
[[2154    0]
 [ 566    0]] 0.79 0.5
[[2141    0]
 [ 579    0]] 0.79 0.5
[[2223    0]
 [ 497    0]] 0.82 0.5
[[2202    0]
 [ 518    0]] 0.81 0.5

 RANDOM MODEL: 

level 0
[[1224 1156]
 [ 157  183]] 0.52 0.53
[[1129 1251]
 [ 174  166]] 0.48 0.48
[[1180 1225]
 [ 157  158]] 0.49 0.5
[[1263 1185]
 [ 136  136]] 0.51 0.51
level 1
[[1154 1215]
 [ 171  180]] 0.49 0.5
[[1155 1189]
 [ 19