In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics

import random
random.seed(7)

structure_ids = []
for line in open('./structures lists/structures human.txt', 'r'):
    line = line.strip('\n')
    structure_ids.append(line)
# for line in open('./structures lists/structures ecoli.txt', 'r'):
#     line = line.strip('\n')
#     structure_ids.append(line)
structure_ids.remove('1a9n')
structure_ids.remove('2adc')
random.shuffle(structure_ids)
print(len(structure_ids))

# X = []
# Y = []
# for structure_id in structure_ids:
#     protein = np.load('../data/voxelized data 10x10x10/' + structure_id + '_protein.npy', mmap_mode='r')
#     rna = np.load('../data/voxelized data 10x10x10/' + structure_id + '_rna.npy', mmap_mode='r')
#     X.append(protein[:20])
#     X.append(protein[-20:])
#     # rna = list(map(sum, rna))
#     Y.append(rna[:20])
#     Y.append(rna[-20:])

# X = np.concatenate(X)
# Y = np.concatenate(Y)
# Y[Y > 0] = 1

# num_train = int(X.shape[0]*0.7)
# X_train = X[:num_train]
# Y_train = Y[:num_train]
# X_test = X[num_train:]
# Y_test = Y[num_train:]
# print(X_train.shape, X_test.shape)

X_train = []
X_test = []
Y_train = []
Y_test = []
num_train = int(len(structure_ids)*0.7)
for i, structure_id in enumerate(structure_ids):
    protein = np.load('../data/voxelized data 5x5x5/' + structure_id + '_protein.npy', mmap_mode='r')
    rna = np.load('../data/voxelized data 5x5x5/' + structure_id + '_rna.npy', mmap_mode='r')
    if i <= num_train:
        X_train.append(protein[:20])
        X_train.append(protein[-20:])
        Y_train.append(rna[:20])
        Y_train.append(rna[-20:])
    else:
        X_test.append(protein[:20])
        X_test.append(protein[-20:])
        Y_test.append(rna[:20])
        Y_test.append(rna[-20:])

X_train = np.concatenate(X_train)
Y_train = np.concatenate(Y_train)
Y_train[Y_train > 0] = 1

X_test = np.concatenate(X_test)
Y_test = np.concatenate(Y_test)
Y_test[Y_test > 0] = 1

print(X_train.shape, X_test.shape)


ins = tf.keras.layers.Input((5, 5, 5, 3))
con1 = tf.keras.layers.Conv3D(filters=64, kernel_size=(3, 3, 3), padding='same', activation='relu')(ins)
con2 = tf.keras.layers.Conv3D(filters=32, kernel_size=(3, 3, 3), padding='same', activation='relu')(con1)
con3 = tf.keras.layers.Conv3D(filters=16, kernel_size=(3, 3, 3), padding='same', activation='relu')(con2)
maxp1 = tf.keras.layers.MaxPool3D(pool_size=(2, 2, 2))(con3)
con4 = tf.keras.layers.Conv3D(filters=16, kernel_size=(3, 3, 3), padding='same', activation='relu')(maxp1)
con5 = tf.keras.layers.Conv3D(filters=8, kernel_size=(3, 3, 3), padding='same', activation='relu')(con4)
con6 = tf.keras.layers.Conv3D(filters=4, kernel_size=(3, 3, 3), padding='same', activation='relu')(con5)
maxp2 = tf.keras.layers.MaxPool3D(pool_size=(2, 2, 2))(con6)
batch = tf.keras.layers.BatchNormalization()(maxp2)
flat = tf.keras.layers.Flatten()(batch)
dens2 = tf.keras.layers.Dense(units=1024, activation='relu')(flat)
drop2 = tf.keras.layers.Dropout(0.6)(dens2)
outs = tf.keras.layers.Dense(units=5, activation='sigmoid')(drop2)
model = tf.keras.models.Model(inputs=ins, outputs=outs)
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(lr=0.00001), metrics=['accuracy'])

model.summary()

# checkpoint
filepath="weights.best.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

model.fit(X_train, Y_train, validation_split=0.33, epochs=500, batch_size=200, callbacks=callbacks_list, verbose=0)

model_best = model
model_best.load_weights("weights.best.hdf5")
# print(model.evaluate(X_test, Y_test, verbose=0, batch_size=100))
Y_pred = model_best.predict(X_test, batch_size=200)

#CNN
Y_pred[Y_pred >= 0.5] = 1
Y_pred[Y_pred < 0.5] = 0

confusion_matrix = [sklearn.metrics.confusion_matrix(Y_test[:,i], Y_pred[:,i]) for i in range(5)]
accuracy = [np.sum(np.trace(cm))/np.sum(cm) for cm in confusion_matrix]
auc = [sklearn.metrics.roc_auc_score(Y_test[:,i], Y_pred[:,i]) for i in range(5)]

print('CNN: \n')
for i in range(len(confusion_matrix)):
    print(confusion_matrix[i], np.round(accuracy[i], 2), np.round(auc[i], 2))

# baseline model
# predict all zeros; at least 50% correct predictions because there are 1/2 of negative examples
# (Yi_true = [0, 0, 0, 0, 0])
# Y_pred_base = np.zeros(Y_test.shape)

po = np.sum(Y_train, axis=0)/Y_train.shape[0]
po[po >= 0.5] = 1
po[po < 0.5] = 0

Y_pred_base = np.tile(po, (Y_test.shape[0],1))
Y_pred_base[Y_pred_base >= 0.5] = 1
Y_pred_base[Y_pred_base < 0.5] = 0

confusion_matrix_base = [sklearn.metrics.confusion_matrix(Y_test[:,i], Y_pred_base[:,i]) for i in range(5)]
accuracy_base = [np.sum(np.trace(cm))/np.sum(cm) for cm in confusion_matrix_base]
auc_base = [sklearn.metrics.roc_auc_score(Y_test[:,i], Y_pred_base[:,i]) for i in range(5)]

print(f'\n BASELINE MODEL: \n')
for i in range(len(confusion_matrix_base)):
    print(confusion_matrix_base[i], np.round(accuracy_base[i], 2), np.round(auc_base[i], 2))

#random model
Y_pred_random = np.random.random(Y_test.shape)
Y_pred_random[Y_pred_random >= 0.5] = 1
Y_pred_random[Y_pred_random < 0.5] = 0

confusion_matrix_random = [sklearn.metrics.confusion_matrix(Y_test[:,i], Y_pred_random[:,i]) for i in range(5)]
accuracy_random = [np.sum(np.trace(cm))/np.sum(cm) for cm in confusion_matrix_random]
auc_random = [sklearn.metrics.roc_auc_score(Y_test[:,i], Y_pred_random[:,i]) for i in range(5)]

print(f'\n RANDOM MODEL: \n')
for i in range(len(confusion_matrix_random)):
    print(confusion_matrix_random[i], np.round(accuracy_random[i], 2), np.round(auc_random[i], 2))

231
(6480, 5, 5, 5, 3) (2760, 5, 5, 5, 3)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 5, 5, 5, 3)        0         
_________________________________________________________________
conv3d (Conv3D)              (None, 5, 5, 5, 64)       5248      
_________________________________________________________________
conv3d_1 (Conv3D)            (None, 5, 5, 5, 32)       55328     
_________________________________________________________________
conv3d_2 (Conv3D)            (None, 5, 5, 5, 16)       13840     
_________________________________________________________________
max_pooling3d (MaxPooling3D) (None, 2, 2, 2, 16)       0         
_________________________________________________________________
conv3d_3 (Conv3D)            (None, 2, 2, 2, 16)       6928      
_________________________________________________________________
conv3d_4 (Conv3D)            (None