# Model Development of EnChINN

This notebook contains the architectures of all deep learning models that were explored before the best-performing model, EnChINN, was identified.

Take note:
1. Default RNA_GE_window_size is 200 bp.
2. Default RNA_GE_bp_length is 20000 bp long.
3. Default DNA_feature_bp_length is 4000 bp.

4. There are two train-test split methods in the function "run_experiment_on_model": Chrom-split and Random-split.

5. All models are designed to take both RNA and DNA features as inputs, for ease of model development.

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, file_name)
    for root, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
import time
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, roc_auc_score, recall_score, ConfusionMatrixDisplay, average_precision_score, f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_validate, KFold
from scipy.stats import randint
from sklearn.utils import resample

import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
import tensorflow_io as tfio
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM, Concatenate, Bidirectional, Conv1D, LeakyReLU, MaxPooling1D, Flatten, Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback

In [None]:
# Size of one RNA gene-expression window, in bp.
RNA_GE_window_size = 200
# Total RNA gene-expression span used per anchor, in bp.
RNA_GE_bp_length = 20000    # adjust
# Number of windows (feature columns) per anchor.
RNA_GE_feature_length = RNA_GE_bp_length // RNA_GE_window_size

DNA_feature_bp_length = 4000

# Window columns are selected symmetrically around column index 100.
_RNA_GE_window_ids = range(
    100 - RNA_GE_feature_length // 2,
    100 + RNA_GE_feature_length // 2,
)
RNA_GE_left_columns_to_include = [f"L{window_id}" for window_id in _RNA_GE_window_ids]
RNA_GE_right_columns_to_include = [f"R{window_id}" for window_id in _RNA_GE_window_ids]

print(RNA_GE_left_columns_to_include)
print(RNA_GE_right_columns_to_include)

# Names accepted by load_dataset.
all_datasets = ["HelaS3", "K562", "IMR90","GM12878"]

def load_dataset(name:str):
    """Load one dataset and return RNA features and DNA anchor sequences side by side.

    The RNA CSV is read in full; from the DNA CSV only the two anchor sequence
    columns are parsed. The two frames are concatenated column-wise, and the
    positive-class fraction of the "label" column is printed as the baseline
    AUPRC.

    Args:
        name: One of "HelaS3", "K562", "IMR90", "GM12878".

    Returns:
        The combined DataFrame, or None (after printing a message) when
        `name` is not a known dataset.

    Raises:
        ValueError: If the RNA and DNA files have different numbers of rows,
            in which case a column-wise concat would misalign the samples.
    """
    dataset_paths = {
        "HelaS3": "/kaggle/input/fuying-data/fuying_HelaS3_out_fea_DNA.csv",
        "K562": "/kaggle/input/fuying-data/fuying_K562_out_fea_DNA.csv",
        "IMR90": "/kaggle/input/fuying-data/fuying_IMR90_out_fea_DNA.csv",
        "GM12878": "/kaggle/input/fuying-data/fuying_GM12878_out_fea_DNA.csv"
    }

    dataset_RNA_20kbp_paths = {
        "HelaS3": "/kaggle/input/fuying-data/fuying_HelaS3_out_fea_20kbp.csv",
        "K562": "/kaggle/input/fuying-data/fuying_K562_out_fea_20kbp.csv",
        "IMR90": "/kaggle/input/fuying-data/fuying_IMR90_out_fea_20kbp.csv",
        "GM12878": "/kaggle/input/fuying-data/fuying_GM12878_out_fea_20kbp.csv"
    }

    if name not in dataset_paths:
        print(f"Dataset {name} not found.")
        return None

    anchor_sequence_columns = ['left_anchor_sequence', 'right_anchor_sequence']
    # usecols avoids parsing every other column of the DNA file; the explicit
    # re-selection keeps the left/right column order, since usecols returns
    # columns in file order.
    DNA_feature_columns = pd.read_csv(
        dataset_paths[name], usecols=anchor_sequence_columns
    )[anchor_sequence_columns]
    RNA_dataframe = pd.read_csv(dataset_RNA_20kbp_paths[name])

    if len(RNA_dataframe) != len(DNA_feature_columns):
        raise ValueError(
            f"Row count mismatch for {name}: "
            f"{len(RNA_dataframe)} RNA rows vs {len(DNA_feature_columns)} DNA rows."
        )

    dataframe = pd.concat([RNA_dataframe, DNA_feature_columns], axis = 1)

    print(f"\n{name} loaded")
    # minlength=2 keeps the unpacking valid when only one class is present.
    neg, pos = np.bincount(dataframe["label"], minlength=2)
    total = neg + pos
    baseline_auprc = pos / total
    print(f"Baseline auprc: {baseline_auprc}")

    return dataframe

In [None]:
# Metrics passed to model.compile. run_experiment_on_model reads the
# evaluate() results positionally as [loss, accuracy, auroc, auprc], so the
# order of this list matters.
# NOTE(review): these metric objects are created once and shared by every
# model compiled with METRICS.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(threshold=0.5),
    tf.keras.metrics.AUC(name = 'auroc'),
    tf.keras.metrics.AUC(name = 'auprc', curve = 'PR'),
]

# Mini-batch size used for training and for the full-dataset evaluation.
BATCH_SIZE = 128

# Loss passed to model.compile.
LOSS = 'binary_crossentropy'

# Optimizer passed to model.compile.
OPTIMIZER = "adam"

class TimestampCallback(Callback):
    """Keras callback that records the wall-clock duration of each epoch.

    The duration (in seconds) is written into the epoch's `logs` dict under
    `metric_name`. Every tenth epoch seen by this callback is announced on
    stdout.

    Args:
        metric_name: Key under which the epoch duration is stored in `logs`.
    """

    def __init__(self, metric_name = "duration"):
        # Let the base Callback set up its own attributes.
        super().__init__()
        self.__epoch_start = None
        self.__metric_name = metric_name
        self.__number_epoch = 0

    def on_epoch_begin(self, epoch, logs=None):
        """Count the epoch, print progress every 10 epochs, and start the timer."""
        self.__number_epoch += 1
        if self.__number_epoch % 10 == 0:
            print(f"Epoch {self.__number_epoch}")
        # perf_counter is monotonic, so durations are immune to clock changes.
        self.__epoch_start = time.perf_counter()

    def on_epoch_end(self, epoch, logs=None):
        """Store the elapsed time of the epoch in `logs` (when a dict is given)."""
        if logs is None or self.__epoch_start is None:
            return
        logs[self.__metric_name] = time.perf_counter() - self.__epoch_start

In [None]:
# Integer code of each base; every other character maps to index 4.
_DNA_BASE_TO_INDEX = {"A": 0, "C": 1, "G": 2, "T": 3}


def DNA_sequence_to_indices(sequence, length = 4000):
    """Convert a DNA string into a list of integer base indices.

    The sequence is upper-cased and right-padded with "N" up to `length`.
    A, C, G and T map to 0-3; any other character (including the "N"
    padding) maps to 4. Sequences longer than `length` are returned in full,
    not truncated.

    Args:
        sequence: DNA sequence string (any case).
        length: Minimum length of the returned list.

    Returns:
        List of ints with max(len(sequence), length) entries.
    """
    padded = sequence.upper().ljust(length, "N")
    return [_DNA_BASE_TO_INDEX.get(base, 4) for base in padded]

import os
import shutil

def run_experiment_on_model(models, repeat = 1, split_method = "chrom"):
    """Train and evaluate every model in `models` on every dataset in `all_datasets`.

    For each dataset/model pair the model is built, compiled and trained
    `repeat` times. Per run, the checkpoint with the highest validation AUPRC
    is reloaded and scored on the held-out split and on the full dataset.
    The best checkpoint across runs is kept as
    /kaggle/working/<dataset>_<model>.hdf5 and the per-run histories and test
    scores are written to /kaggle/working/<dataset>_<model>.json.

    Args:
        models: Dict mapping a model name to
            {"function": builder, "arguments": kwargs for the builder}.
        repeat: Number of independent training runs per dataset/model pair.
        split_method: "chrom" holds out rows whose "left_anchor" value is
            chr4, chr7, chr8 or chr11; "random" holds out a random 20% of
            rows (random_state=42).

    Raises:
        ValueError: If `split_method` is neither "chrom" nor "random".
    """
    if split_method not in ("chrom", "random"):
        raise ValueError(f"Unknown split_method: {split_method!r}")

    temp_checkpoint_path = '/kaggle/working/temp.hdf5'

    for dataset_name in all_datasets:
        data = load_dataset(dataset_name)
        if data is None:
            # load_dataset already reported the problem; nothing to train on.
            continue

        n_rows = len(data)
        if split_method == "chrom":
            # Chrom-split
            is_test_row = data["left_anchor"].isin(["chr4", "chr7", "chr8", "chr11"]).to_numpy()
            test_idx = np.flatnonzero(is_test_row)
            train_idx = np.flatnonzero(~is_test_row)
        else:
            # Random-split
            train_idx, test_idx = train_test_split(np.arange(n_rows), test_size=0.2, random_state=42)

        print("Train: " + str(len(train_idx)) + " " + str(len(train_idx)/n_rows))
        print("Test: " + str(len(test_idx)) + " " + str(len(test_idx)/n_rows))

        # Encode every feature once for the full dataset, then slice by row
        # position; this avoids re-encoding the DNA sequences per split.
        X_RNA_left = np.array(data[RNA_GE_left_columns_to_include])
        X_RNA_right = np.array(data[RNA_GE_right_columns_to_include])
        X_DNA_left = np.array([DNA_sequence_to_indices(sequence) for sequence in data['left_anchor_sequence']])
        X_DNA_right = np.array([DNA_sequence_to_indices(sequence) for sequence in data['right_anchor_sequence']])
        y = data['label'].to_numpy(dtype=np.float32)

        # In order to ease development of models, each model takes DNA and RNA-Seq GE as inputs
        full_inputs = [X_RNA_left, X_RNA_right, X_DNA_left, X_DNA_right]
        train_inputs = [features[train_idx] for features in full_inputs]
        test_inputs = [features[test_idx] for features in full_inputs]
        y_train = y[train_idx]
        y_test = y[test_idx]

        for model_name, model_method in models.items():
            experiment_name = dataset_name + "_" + model_name
            print("\nExperiment :", experiment_name)

            max_auprc = 0
            json_record = {}

            for i in range(repeat):
                print(f"Run {i}")

                checkpoint = ModelCheckpoint(temp_checkpoint_path, monitor='val_auprc', mode = "max", save_best_only=True)
                early_stop = EarlyStopping(monitor='val_auprc', mode = "max", patience=30)
                timestampcallback = TimestampCallback()

                model = model_method["function"](**model_method["arguments"])
                model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=METRICS)

                # NOTE(review): the held-out split is also used as validation
                # data, so early stopping and checkpoint selection see it.
                history = model.fit(train_inputs, y_train,
                    batch_size = BATCH_SIZE,
                    shuffle = True,
                    epochs=200,
                    verbose = 0,
                    callbacks = [early_stop, checkpoint, timestampcallback],
                    validation_data = (test_inputs, y_test),
                   )

                run_best_auprc = max(history.history["val_auprc"])
                print(f"Current best auprc: {max_auprc}")
                print(f"This run best auprc: " + str(run_best_auprc))

                # Saving model with highest val auprc
                if run_best_auprc > max_auprc:
                    shutil.copy(temp_checkpoint_path, f"/kaggle/working/{experiment_name}.hdf5")
                    max_auprc = run_best_auprc
                    print("    Best Run!")

                # Testing (on the best checkpoint of this run)
                model = keras.models.load_model(temp_checkpoint_path)
                eval_scores = model.evaluate(test_inputs, y_test)
                test_scores = {
                    "test_loss": eval_scores[0],
                    "test_accuracy": eval_scores[1],
                    "test_auroc": eval_scores[2],
                    "test_auprc": eval_scores[3],
                }

                y_pred_prob = model.predict(test_inputs)
                # Compare plain integer label vectors; y_test itself is left
                # untouched so later runs and models reuse the same array.
                y_pred = (y_pred_prob >= 0.5).astype(int).ravel()
                test_scores["test_f1"] = float(f1_score(y_test.astype(int), y_pred))

                print(test_scores)

                # Evaluate on the full dataset
                print("Evaluate scores on full dataset (train + test)")
                model.evaluate(full_inputs, y, batch_size = BATCH_SIZE)

                # Save result
                history.history.update(test_scores)

                json_record[i] = history.history

                # Remove temp models
                os.remove(temp_checkpoint_path)

            with open(f"/kaggle/working/{experiment_name}.json", "w") as outfile:
                outfile.write(json.dumps(json_record, indent=4))


In [None]:
def consolidate_result(models):
    """Summarise the saved experiment JSON files into one DataFrame.

    Reads /kaggle/working/<dataset>_<model>.json for every dataset in
    `all_datasets` and every key of `models`, and produces one row per
    experiment with the best/average test scores and epoch timing statistics.

    Args:
        models: Dict whose keys are the model names used for the experiments.

    Returns:
        DataFrame with one row per dataset/model pair (empty when there are
        no experiments).
    """
    summary_rows = []
    for dataset_name in all_datasets:
        for model_name in models:
            experiment_name = dataset_name + '_' + model_name

            # One row per run after the transpose.
            with open(f"/kaggle/working/{experiment_name}.json", "r") as file:
                runs = pd.read_json(file).transpose()

            test_auprc = runs["test_auprc"]
            epoch_durations = runs["duration"]

            summary_rows.append({
                "experiment": experiment_name,
                "dataset": dataset_name,
                "len_RNA": RNA_GE_bp_length,
                "model": model_name,
                "best test accuracy": round(max(runs["test_accuracy"]), 4),
                "best test f1": round(max(runs["test_f1"]), 4),
                "best test auroc": round(max(runs["test_auroc"]), 4),
                "best test auprc": round(max(test_auprc), 4),
                "test auprc": [round(score, 4) for score in test_auprc],
                "test auprc range": [round(min(test_auprc), 4),
                                     round(max(test_auprc), 4)],
                "avg test auprc": round(sum(test_auprc)/len(test_auprc), 4),
                # Each "duration" cell holds the per-epoch durations of one run.
                "avg # epochs": epoch_durations.apply(lambda x:len(x)).mean(),
                "avg time per epochs": epoch_durations.apply(lambda x:sum(x)/len(x)).mean(),
            })

    if not summary_rows:
        return pd.DataFrame()
    return pd.DataFrame(summary_rows)

In [None]:
def DNA_embedding_layer():
    """Build a frozen Embedding layer that one-hot encodes DNA base indices.

    Indices 0-3 map to the four unit vectors; index 4 maps to the uniform
    vector [0.25, 0.25, 0.25, 0.25]. The layer is created non-trainable.

    Returns:
        A built Embedding layer whose weights are the fixed matrix above.
    """
    # Fixed lookup table: one row per index, one column per one-hot channel.
    emb_matrix = np.array(
        [
            [1, 0, 0, 0],
            [0, 1, 0, 0],
            [0, 0, 1, 0],
            [0, 0, 0, 1],
            [0.25, 0.25, 0.25, 0.25],
        ],
        dtype=np.float32,
    )
    vocab_size, emb_dim = emb_matrix.shape

    # Define Keras embedding layer with the correct input and output sizes
    embedding_layer = Embedding(vocab_size, emb_dim, trainable = False)

    # The layer must be built before its weights can be set.
    embedding_layer.build((None,))

    # Replace the randomly initialised weights with the fixed table.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

# https://keras.io/examples/nlp/text_classification_with_transformer/
@keras.saving.register_keras_serializable(package="MyLayers", name = "TransformerEncoderBlock")
class TransformerEncoderBlock(layers.Layer):
    """Transformer encoder block: self-attention and feed-forward, each with residual + layer norm.

    The feed-forward network projects back to `embed_dim`, so `embed_dim`
    should equal the last dimension of the inputs; otherwise the residual
    addition relies on broadcasting.

    Args:
        embed_dim: Output width of the feed-forward network and per-head
            key dimension of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: Standard Layer arguments (name, dtype, ...), needed so the
            layer can be rebuilt from its full config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.embed_dim)
        self.ffn = Sequential(
            [layers.Dense(self.ff_dim, activation="relu"), layers.Dense(self.embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(self.rate)
        self.dropout2 = layers.Dropout(self.rate)

    def call(self, inputs, training=None):
        """Apply self-attention and the feed-forward network to `inputs`."""
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments merged into the base Layer config."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config
    
@keras.saving.register_keras_serializable(package="MyLayers", name = "PositionEmbedding")
class PositionEmbedding(layers.Layer):
    """Add a learned position embedding to an already-embedded sequence.

    Args:
        maxlen: Number of distinct positions in the embedding table.
        embed_dim: Width of each position vector; it is added to the inputs,
            so it should match their last dimension.
        **kwargs: Standard Layer arguments (name, dtype, ...), needed so the
            layer can be rebuilt from its full config.
    """

    def __init__(self, maxlen, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.maxlen = maxlen
        self.embed_dim = embed_dim
        self.pos_emb = layers.Embedding(input_dim=self.maxlen, output_dim=self.embed_dim)

    def call(self, x):
        """Return `x` plus the embeddings of positions 0..seq_len-1."""
        # Sequence length is the second-to-last axis of x.
        seq_len = tf.shape(x)[-2]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        return x + positions

    def get_config(self):
        """Return the constructor arguments merged into the base Layer config."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "embed_dim": self.embed_dim,
        })
        return config

@keras.saving.register_keras_serializable(package="MyLayers", name = "TokenAndPositionEmbedding")
class TokenAndPositionEmbedding(layers.Layer):
    """Embed DNA base indices and add a learned position embedding.

    Tokens are embedded with the layer returned by DNA_embedding_layer();
    the position embedding is added on top, so `embed_dim` should match the
    width of that token embedding.

    Args:
        maxlen: Number of distinct positions in the embedding table.
        embed_dim: Width of each position vector.
        **kwargs: Standard Layer arguments (name, dtype, ...), needed so the
            layer can be rebuilt from its full config.
    """

    def __init__(self, maxlen, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_emb = DNA_embedding_layer()
        self.maxlen = maxlen
        self.embed_dim = embed_dim
        self.pos_emb = layers.Embedding(input_dim=self.maxlen, output_dim=self.embed_dim)

    def call(self, x):
        """Return token embeddings of `x` plus the matching position embeddings."""
        # x holds token indices, so its last axis is the sequence length.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments merged into the base Layer config."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "embed_dim": self.embed_dim,
        })
        return config

# https://keras.io/examples/nlp/text_classification_with_transformer/
# Inspired by EPIANN
@keras.saving.register_keras_serializable(package="MyLayers", name = "SoftAttention")
class SoftAttention(layers.Layer):
    """Cross-attend two sequences and return their bilinear similarity matrix.

    One shared MultiHeadAttention layer attends each input to the other; the
    two results are combined as `A1 @ kernel @ A2^T` with a trainable
    (embed_dim, embed_dim) kernel.

    Args:
        embed_dim: Per-head key dimension of the attention layer and side
            length of the square kernel; it must match the last dimension of
            the attention outputs for the matrix product to be valid.
        num_heads: Number of attention heads.
        rate: Accepted for interface compatibility; not used by the layer.
        **kwargs: Standard Layer arguments (name, dtype, ...), needed so the
            layer can be rebuilt from its full config.
    """

    def __init__(self, embed_dim, num_heads, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.rate = rate
        self.att = layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.embed_dim)
        # Pass the name by keyword: the position of `name` in add_weight
        # differs between Keras versions.
        self.kernel = self.add_weight(
            name="kernel",
            shape=[self.embed_dim, self.embed_dim],
            trainable=True,
        )

    def call(self, inputs_1, inputs_2, training=None):
        """Return the similarity matrix between the two cross-attended inputs."""
        attn_output_1 = self.att(inputs_1, inputs_2)
        attn_output_2 = self.att(inputs_2, inputs_1)
        similarity_matrix = tf.matmul(tf.matmul(attn_output_1, self.kernel), attn_output_2, transpose_b = True)
        return similarity_matrix

    def get_config(self):
        """Return the constructor arguments merged into the base Layer config."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "rate": self.rate,
        })
        return config

# RNA GE only

## Model 1 - Simple Feed Forward NN for RNA GE

In [None]:
def model_1(hidden_layer_units, dropout = 0.0, output_bias = None):
    """Feed-forward network over the flattened left/right RNA GE windows.

    The model declares the two DNA inputs as well, but they are not connected
    to the output.

    Args:
        hidden_layer_units: Unit counts of the successive Dense layers.
        dropout: Dropout rate after each Dense layer; falsy disables dropout.
        output_bias: Optional constant used to initialise the output bias.

    Returns:
        An uncompiled Keras model with a single sigmoid output.
    """
    bias_initializer = output_bias
    if bias_initializer is not None:
        bias_initializer = tf.keras.initializers.Constant(bias_initializer)

    # Inputs
    rna_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    rna_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    dna_left = Input((4000, ), dtype = 'int32')
    dna_right = Input((4000, ), dtype = 'int32')

    flat_left = Flatten()(rna_left)
    flat_right = Flatten()(rna_right)
    hidden = Concatenate()([flat_left, flat_right])

    for units in hidden_layer_units:
        hidden = Dense(units = units, activation = "relu")(hidden)
        if dropout:
            hidden = Dropout(rate = dropout)(hidden)

    prediction = Dense(1, activation = 'sigmoid', bias_initializer=bias_initializer)(hidden)

    return tf.keras.Model(inputs=[rna_left, rna_right, dna_left, dna_right], outputs=prediction)

In [None]:
# model = model_1(hidden_layer_units = [64, 32], dropout = 0.3)
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

## Model 2 - LSTM for RNA GE

In [None]:
## Model 2
def model_2(lstm_layer_units, hidden_layer_units, dropout = 0.0, recurrent_dropout = 0.0, output_bias = None):
    """Stacked bidirectional LSTM over the left/right RNA GE windows.

    Each LSTM layer is applied to both anchors, so the two anchors share its
    weights. The last LSTM returns only its final state; the two states are
    concatenated and passed through Dense layers. The DNA inputs are declared
    but not connected to the output.

    Args:
        lstm_layer_units: Unit counts of the LSTM layers (at least one).
        hidden_layer_units: Unit counts of the successive Dense layers.
        dropout: Dropout rate after each Dense layer; falsy disables dropout.
        recurrent_dropout: Recurrent dropout rate of every LSTM layer.
        output_bias: Optional constant used to initialise the output bias.

    Returns:
        An uncompiled Keras model with a single sigmoid output.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    # Inputs
    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_DNA_left = Input((4000, ), dtype = 'int32')
    input_DNA_right = Input((4000, ), dtype = 'int32')

    x_left = input_RNA_left
    x_right = input_RNA_right

    # All but the last LSTM keep the full sequence for the next layer.
    for units in lstm_layer_units[:-1]:
        lstm = Bidirectional(LSTM(units, return_sequences=True, recurrent_dropout=recurrent_dropout))
        x_left = lstm(x_left)
        x_right = lstm(x_right)

    lstm = Bidirectional(LSTM(lstm_layer_units[-1], return_sequences=False, recurrent_dropout=recurrent_dropout))
    x_left = lstm(x_left)
    x_right = lstm(x_right)

    x = Concatenate()([x_left, x_right])
    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)

    output = Dense(1, activation = 'sigmoid', bias_initializer=output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model

In [None]:
# model = model_2(lstm_layer_units = [16, 16], hidden_layer_units = [64, 32], dropout = 0.3)
# model.summary()
# # tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR", to_file='model.png')
# tf.keras.utils.plot_model(model, show_shapes=True, to_file='model.png')

## Model 3 - CNN for RNA GE

In [None]:
## Model 3

def model_3(filters, hidden_layer_units, dropout = 0.0, output_bias=None):
    """1-D CNN over the left/right RNA GE windows.

    Each stage (Conv1D, LeakyReLU, MaxPooling1D) is applied to both anchors,
    so the two anchors share the convolution weights. The DNA inputs are
    declared but not connected to the output.

    Args:
        filters: Filter counts of the successive convolution stages.
        hidden_layer_units: Unit counts of the successive Dense layers.
        dropout: Dropout rate after each Dense layer; falsy disables dropout.
        output_bias: Optional constant used to initialise the output bias.

    Returns:
        An uncompiled Keras model with a single sigmoid output.
    """
    bias_initializer = output_bias
    if bias_initializer is not None:
        bias_initializer = tf.keras.initializers.Constant(bias_initializer)

    rna_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    rna_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    dna_left = Input((4000, ), dtype = 'int32')
    dna_right = Input((4000, ), dtype = 'int32')

    left, right = rna_left, rna_right

    for n_filters in filters:
        conv = Conv1D(n_filters, kernel_size = 4)
        activation = LeakyReLU(0.128)
        pool = MaxPooling1D(pool_size=2)
        left = pool(activation(conv(left)))
        right = pool(activation(conv(right)))

    flat_left = Flatten()(left)
    flat_right = Flatten()(right)
    hidden = Concatenate()([flat_left, flat_right])

    for units in hidden_layer_units:
        hidden = Dense(units = units, activation = "relu")(hidden)
        if dropout:
            hidden = Dropout(rate = dropout)(hidden)
    prediction = Dense(1, activation = 'sigmoid', bias_initializer=bias_initializer)(hidden)

    return tf.keras.Model(inputs=[rna_left, rna_right, dna_left, dna_right], outputs=prediction)

In [None]:
# model = model_3(filters = [16, 32], hidden_layer_units = [1024, 512, 128, 32], dropout = 0.3)
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

## Model 7 - Transformer Encoder for RNA GE

In [None]:
## Model 7

def model_7(encoder_num_heads, encoder_ff_dim, hidden_layer_units, dropout = 0.0, output_bias=None):
    """Transformer encoder over the left/right RNA GE windows.

    A position embedding is added to each anchor, followed by one encoder
    block per entry of `encoder_num_heads`. Every layer is applied to both
    anchors, so the two anchors share weights. The DNA inputs are declared
    but not connected to the output.

    Args:
        encoder_num_heads: Number of attention heads of each encoder block.
        encoder_ff_dim: Feed-forward width of each encoder block, indexed in
            step with `encoder_num_heads`.
        hidden_layer_units: Unit counts of the successive Dense layers.
        dropout: Dropout rate after each Dense layer; falsy disables dropout.
        output_bias: Optional constant used to initialise the output bias.

    Returns:
        An uncompiled Keras model with a single sigmoid output.
    """
    bias_initializer = output_bias
    if bias_initializer is not None:
        bias_initializer = tf.keras.initializers.Constant(bias_initializer)

    rna_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    rna_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    dna_left = Input((4000, ), dtype = 'int32')
    dna_right = Input((4000, ), dtype = 'int32')

    add_positions = PositionEmbedding(maxlen = RNA_GE_feature_length, embed_dim = 1)
    left = add_positions(rna_left)
    right = add_positions(rna_right)

    for block_index, num_heads in enumerate(encoder_num_heads):
        encoder = TransformerEncoderBlock(embed_dim = 1, num_heads = num_heads, ff_dim = encoder_ff_dim[block_index])
        left = encoder(left)
        right = encoder(right)

    flat_left = Flatten()(left)
    flat_right = Flatten()(right)
    hidden = Concatenate()([flat_left, flat_right])

    for units in hidden_layer_units:
        hidden = Dense(units = units, activation = "relu")(hidden)
        if dropout:
            hidden = Dropout(rate = dropout)(hidden)
    prediction = Dense(1, activation = 'sigmoid', bias_initializer=bias_initializer)(hidden)

    return tf.keras.Model(inputs=[rna_left, rna_right, dna_left, dna_right], outputs=prediction)

In [None]:
# model = model_7(
#     encoder_num_heads = [2],
#     encoder_ff_dim = [8],
#     hidden_layer_units = [64, 32],
#     dropout = 0.3,
# )
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

## Model 10 - LSTM + Transformer Encoder for RNA GE

In [None]:
## Model 10
def model_10(lstm_layer_units, encoder_num_heads, encoder_ff_dim, hidden_layer_units, dropout = 0.0, recurrent_dropout = 0.0, output_bias = None):
    """Bidirectional LSTM followed by Transformer encoder blocks over RNA GE.

    Both anchors pass through the same stack: sequence-returning BiLSTMs, a
    position embedding, the encoder blocks, and a final BiLSTM that returns
    only its last state. The two states are concatenated and passed through
    Dense layers. The DNA inputs are declared but not connected to the
    output.

    The encoder blocks are built with the width of the BiLSTM output
    (2 * lstm_layer_units[-1]) so that their residual additions operate on
    matching shapes instead of broadcasting a width-1 projection.

    Args:
        lstm_layer_units: Unit counts of the sequence-returning LSTM layers
            (at least one); the final LSTM reuses lstm_layer_units[0].
        encoder_num_heads: Number of attention heads of each encoder block.
        encoder_ff_dim: Feed-forward width of each encoder block, indexed in
            step with `encoder_num_heads`.
        hidden_layer_units: Unit counts of the successive Dense layers.
        dropout: Dropout rate after each Dense layer; falsy disables dropout.
        recurrent_dropout: Recurrent dropout rate of every LSTM layer.
        output_bias: Optional constant used to initialise the output bias.

    Returns:
        An uncompiled Keras model with a single sigmoid output.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    # Inputs
    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_DNA_left = Input((4000, ), dtype = 'int32')
    input_DNA_right = Input((4000, ), dtype = 'int32')

    x_left = input_RNA_left
    x_right = input_RNA_right

    for units in lstm_layer_units:
        lstm = Bidirectional(LSTM(units, return_sequences=True, recurrent_dropout=recurrent_dropout))
        x_left = lstm(x_left)
        x_right = lstm(x_right)

    # A bidirectional LSTM concatenates both directions: width = 2 * units.
    feature_dim = lstm_layer_units[-1] * 2

    position_embedding_layer = PositionEmbedding(maxlen = RNA_GE_feature_length, embed_dim = feature_dim)
    x_left = position_embedding_layer(x_left)
    x_right = position_embedding_layer(x_right)

    for i, num_heads in enumerate(encoder_num_heads):
        transformer_encoder_block = TransformerEncoderBlock(embed_dim = feature_dim, num_heads = num_heads, ff_dim = encoder_ff_dim[i])
        x_left = transformer_encoder_block(x_left)
        x_right = transformer_encoder_block(x_right)

    lstm = Bidirectional(LSTM(lstm_layer_units[0], return_sequences=False, recurrent_dropout=recurrent_dropout))
    x_left = lstm(x_left)
    x_right = lstm(x_right)
    x = Concatenate()([x_left, x_right])

    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)

    output = Dense(1, activation = 'sigmoid', bias_initializer=output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model

In [None]:
# model = model_10(
#     lstm_layer_units = [8],
#     encoder_num_heads = [2],
#     encoder_ff_dim = [8],
#     hidden_layer_units = [64, 32],
#     dropout = 0.3,
# )
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

## Model 13 - LSTM + Soft Attention for RNA GE

In [None]:
def model_13(lstm_layer_units, encoder_num_heads, hidden_layer_units, dropout = 0.0, recurrent_dropout = 0.0, output_bias = None):
    """Bidirectional LSTM followed by a SoftAttention similarity matrix over RNA GE.

    Both anchors pass through the same sequence-returning BiLSTM stack; the
    SoftAttention layer then combines them into one similarity matrix that is
    flattened and passed through Dense layers. The DNA inputs are declared
    but not connected to the output.

    Args:
        lstm_layer_units: Unit counts of the LSTM layers (at least one).
        encoder_num_heads: Accepted for interface compatibility; the
            SoftAttention layer is always built with 2 heads.
        hidden_layer_units: Unit counts of the successive Dense layers.
        dropout: Dropout rate after each Dense layer; falsy disables dropout.
        recurrent_dropout: Recurrent dropout rate of every LSTM layer.
        output_bias: Optional constant used to initialise the output bias.

    Returns:
        An uncompiled Keras model with a single sigmoid output.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    # Inputs
    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_DNA_left = Input((4000, ), dtype = 'int32')
    input_DNA_right = Input((4000, ), dtype = 'int32')

    x_left = input_RNA_left
    x_right = input_RNA_right

    for units in lstm_layer_units:
        lstm = Bidirectional(LSTM(units, return_sequences=True, recurrent_dropout=recurrent_dropout))
        x_left = lstm(x_left)
        x_right = lstm(x_right)

    # A bidirectional LSTM concatenates both directions: width = 2 * units.
    soft_attention = SoftAttention(embed_dim = lstm_layer_units[-1] * 2, num_heads = 2)
    x = soft_attention(x_left, x_right)

    x = Flatten()(x)
    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)

    output = Dense(1, activation = 'sigmoid', bias_initializer=output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model

In [None]:
# model = model_13(
#     lstm_layer_units = [16, 16],
#     encoder_num_heads = [2],
#     hidden_layer_units = [256, 64, 32],
#     dropout = 0.3,
# )
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

## Model 14 - CNN + Soft Attention for RNA GE

In [None]:
def model_14(filters, encoder_num_heads, hidden_layer_units, dropout = 0.0, recurrent_dropout = 0.0, output_bias = None):
    """1-D CNN followed by a SoftAttention similarity matrix over RNA GE.

    Each convolution stage is applied to both anchors (shared weights); the
    SoftAttention layer then combines them into one similarity matrix that is
    flattened and passed through Dense layers. The DNA inputs are declared
    but not connected to the output.

    Args:
        filters: Filter counts of the successive convolution stages.
        encoder_num_heads: Not read by this function; the SoftAttention layer
            is always built with 2 heads.
        hidden_layer_units: Unit counts of the successive Dense layers.
        dropout: Dropout rate after each Dense layer; falsy disables dropout.
        recurrent_dropout: Not read by this function.
        output_bias: Optional constant used to initialise the output bias.

    Returns:
        An uncompiled Keras model with a single sigmoid output.
    """
    bias_initializer = output_bias
    if bias_initializer is not None:
        bias_initializer = tf.keras.initializers.Constant(bias_initializer)

    rna_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    rna_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    dna_left = Input((4000, ), dtype = 'int32')
    dna_right = Input((4000, ), dtype = 'int32')

    left, right = rna_left, rna_right

    for n_filters in filters:
        conv = Conv1D(n_filters, kernel_size = 4)
        activation = LeakyReLU(0.128)
        pool = MaxPooling1D(pool_size=2)
        left = pool(activation(conv(left)))
        right = pool(activation(conv(right)))

    # The attention width equals the channel count of the last conv stage.
    cross_attention = SoftAttention(embed_dim = filters[-1], num_heads = 2)
    hidden = Flatten()(cross_attention(left, right))

    for units in hidden_layer_units:
        hidden = Dense(units = units, activation = "relu")(hidden)
        if dropout:
            hidden = Dropout(rate = dropout)(hidden)

    prediction = Dense(1, activation = 'sigmoid', bias_initializer=bias_initializer)(hidden)

    return tf.keras.Model(inputs=[rna_left, rna_right, dna_left, dna_right], outputs=prediction)

In [None]:
# model = model_14(
#     filters = [16, 32],
#     encoder_num_heads = [2],
#     hidden_layer_units = [256, 64, 32],
#     dropout = 0.3,
# )
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

## Model 15 - CNN + Transformer Encoder for RNA GE

In [None]:
## Model 15
def model_15(filters, encoder_num_heads, encoder_ff_dim, hidden_layer_units, dropout = 0.0, recurrent_dropout = 0.0, output_bias = None):
    """1-D CNN followed by Transformer encoder blocks over RNA GE.

    Each convolution stage, the position embedding and every encoder block
    are applied to both anchors (shared weights). The encoded anchors are
    flattened, concatenated and passed through Dense layers. The DNA inputs
    are declared but not connected to the output.

    The encoder blocks are built with the channel count of the CNN output so
    that their residual additions operate on matching shapes instead of
    broadcasting a width-1 projection.

    Args:
        filters: Filter counts of the successive convolution stages.
        encoder_num_heads: Number of attention heads of each encoder block.
        encoder_ff_dim: Feed-forward width of each encoder block, indexed in
            step with `encoder_num_heads`.
        hidden_layer_units: Unit counts of the successive Dense layers.
        dropout: Dropout rate after each Dense layer; falsy disables dropout.
        recurrent_dropout: Not read by this function.
        output_bias: Optional constant used to initialise the output bias.

    Returns:
        A Keras model with a single sigmoid output, already compiled with
        binary cross-entropy, Adam and METRICS.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    # Inputs
    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_DNA_left = Input((4000, ), dtype = 'int32')
    input_DNA_right = Input((4000, ), dtype = 'int32')

    x_left = input_RNA_left
    x_right = input_RNA_right

    for n_filters in filters:
        conv = Conv1D(n_filters, kernel_size = 4)
        leakyrelu = LeakyReLU(0.128)
        maxpool = MaxPooling1D(pool_size=2)
        x_left = maxpool(leakyrelu(conv(x_left)))
        x_right = maxpool(leakyrelu(conv(x_right)))

    # Static (sequence length, channels) of the CNN output.
    sequence_length = x_right.shape[1]
    feature_dim = x_right.shape[2]

    position_embedding_layer = PositionEmbedding(maxlen = sequence_length, embed_dim = feature_dim)
    x_left = position_embedding_layer(x_left)
    x_right = position_embedding_layer(x_right)

    for i, num_heads in enumerate(encoder_num_heads):
        transformer_encoder_block = TransformerEncoderBlock(embed_dim = feature_dim, num_heads = num_heads, ff_dim = encoder_ff_dim[i])
        x_left = transformer_encoder_block(x_left)
        x_right = transformer_encoder_block(x_right)

    x_left_flattened = Flatten()(x_left)
    x_right_flattened = Flatten()(x_right)
    x = Concatenate()([x_left_flattened, x_right_flattened])

    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)

    output = Dense(1, activation = 'sigmoid', bias_initializer=output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)
    # Kept so that callers using the returned model directly still receive a
    # compiled model.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)

    return model

In [None]:
# model = model_15(
#     filters = [16, 32],
#     encoder_num_heads = [2],
#     encoder_ff_dim = [8],
#     hidden_layer_units = [1024, 512, 128, 32],
#     dropout = 0.3,
# )

# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

# DNA Only

Models developed in this part are not included in the report.

## Model 4 - LSTM for DNA Only

In [None]:
## Model 4

def model_4(lstm_layer_units, hidden_layer_units, dropout = 0.0, recurrent_dropout = 0.0, output_bias = None):
    """Build Model 4: a bidirectional-LSTM classifier that uses the DNA inputs only.

    The left and right DNA token sequences go through one shared embedding
    layer and one shared stack of bidirectional LSTMs. The two resulting
    vectors are concatenated and passed through a dense head ending in a
    single sigmoid unit. The two RNA inputs are declared and listed as model
    inputs, but nothing downstream consumes them.

    Args:
        lstm_layer_units: Non-empty sequence of LSTM unit counts. Every entry
            but the last yields a sequence-returning layer; the last entry
            yields the layer that returns only its final output.
        hidden_layer_units: Sequence of unit counts for the ReLU dense layers.
        dropout: Rate of the Dropout layer added after each dense hidden
            layer. A falsy value (0.0 by default) adds no Dropout layers.
        recurrent_dropout: Forwarded to every LSTM layer as its
            ``recurrent_dropout`` argument. The default of 0.0 matches the
            Keras default; per the Keras LSTM documentation a non-zero value
            rules out the cuDNN implementation.
        output_bias: Optional value wrapped in a ``Constant`` initializer for
            the output layer's bias. When None, None is passed through as the
            ``bias_initializer``.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are
        ``[RNA_left, RNA_right, DNA_left, DNA_right]``.

    Raises:
        IndexError: If ``lstm_layer_units`` is empty.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    # Inputs
    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    # Use the notebook-level constant rather than a literal so the input width
    # follows DNA_feature_bp_length if it is ever adjusted.
    input_DNA_left = Input((DNA_feature_bp_length, ), dtype = 'int32')
    input_DNA_right = Input((DNA_feature_bp_length, ), dtype = 'int32')

    # One embedding instance is applied to both sides, so its weights are shared.
    embedding_layer = DNA_embedding_layer()
    x_left = embedding_layer(input_DNA_left)
    x_right = embedding_layer(input_DNA_right)

    # Intermediate LSTMs keep the time axis so the next LSTM can consume it.
    for units in lstm_layer_units[:-1]:
        lstm = Bidirectional(LSTM(units, return_sequences = True, recurrent_dropout = recurrent_dropout))
        x_left = lstm(x_left)
        x_right = lstm(x_right)

    # The last LSTM collapses each sequence into a single vector.
    lstm = Bidirectional(LSTM(lstm_layer_units[-1], return_sequences = False, recurrent_dropout = recurrent_dropout))
    x_left = lstm(x_left)
    x_right = lstm(x_right)

    x = Concatenate()([x_left, x_right])
    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)

    output = Dense(1, activation = 'sigmoid', bias_initializer = output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model


In [None]:
# model = model_4(lstm_layer_units = [32], hidden_layer_units = [64, 32], dropout = 0.3)
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

## Model 5 - CNN for DNA Only

In [None]:
## Model 5

def model_5(filters, hidden_layer_units, dropout = 0.0, output_bias=None):
    """Build Model 5: a 1-D CNN classifier that uses the DNA inputs only.

    Both DNA token sequences go through one shared embedding layer and one
    shared stack of Conv1D -> LeakyReLU -> MaxPooling1D stages. Each side is
    flattened, the two are concatenated, and a dense head ending in a single
    sigmoid unit produces the prediction. The two RNA inputs are declared and
    listed as model inputs, but nothing downstream consumes them.

    Args:
        filters: Sequence of filter counts, one per convolution stage.
        hidden_layer_units: Sequence of unit counts for the ReLU dense layers.
        dropout: Rate of the Dropout layer added after each dense hidden
            layer. A falsy value (0.0 by default) adds no Dropout layers.
        output_bias: Optional value wrapped in a ``Constant`` initializer for
            the output layer's bias. When None, None is passed through as the
            ``bias_initializer``.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are
        ``[RNA_left, RNA_right, DNA_left, DNA_right]``.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    # Use the notebook-level constant rather than a literal so the input width
    # follows DNA_feature_bp_length if it is ever adjusted.
    input_DNA_left = Input((DNA_feature_bp_length, ), dtype = 'int32')
    input_DNA_right = Input((DNA_feature_bp_length, ), dtype = 'int32')

    # One embedding instance is applied to both sides, so its weights are shared.
    embedding_layer = DNA_embedding_layer()
    x_left = embedding_layer(input_DNA_left)
    x_right = embedding_layer(input_DNA_right)

    # Each stage's layers are instantiated once and applied to both sides.
    for num_filters in filters:
        conv = Conv1D(num_filters, kernel_size = 8)
        leakyrelu = LeakyReLU(0.128)
        maxpool = MaxPooling1D(pool_size = 8, strides = 4)
        x_left = maxpool(leakyrelu(conv(x_left)))
        x_right = maxpool(leakyrelu(conv(x_right)))

    x_left_flattened = Flatten()(x_left)
    x_right_flattened = Flatten()(x_right)

    x = Concatenate()([x_left_flattened, x_right_flattened])
    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)
    output = Dense(1, activation = 'sigmoid', bias_initializer = output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model

In [None]:
# model = model_5(filters = [64, 128, 64, 32], hidden_layer_units = [512, 256, 32], dropout = 0.3)
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

## Model 8 - Transformer Encoder for DNA

In [None]:
## Model 8

def model_8(encoder_num_heads, encoder_ff_dim, hidden_layer_units, dropout = 0.0, output_bias=None):
    """Build Model 8: a Transformer-encoder classifier that uses the DNA inputs only.

    Both DNA token sequences go through one shared token-and-position
    embedding, are average-pooled along the sequence axis (pool size 10),
    and then pass through a shared stack of transformer encoder blocks.
    Each side is reduced with global average pooling, the two vectors are
    concatenated, and a dense head ending in a single sigmoid unit produces
    the prediction. The two RNA inputs are declared and listed as model
    inputs, but nothing downstream consumes them.

    Args:
        encoder_num_heads: Sequence with the number of attention heads for
            each encoder block; its length sets the number of blocks.
        encoder_ff_dim: Sequence with the ``ff_dim`` for each encoder block,
            indexed in step with ``encoder_num_heads``.
        hidden_layer_units: Sequence of unit counts for the ReLU dense layers.
        dropout: Rate of the Dropout layer added after each dense hidden
            layer. A falsy value (0.0 by default) adds no Dropout layers.
        output_bias: Optional value wrapped in a ``Constant`` initializer for
            the output layer's bias. When None, None is passed through as the
            ``bias_initializer``.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are
        ``[RNA_left, RNA_right, DNA_left, DNA_right]``.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    # Use the notebook-level constant rather than a literal so the input width
    # and the positional table stay in step with DNA_feature_bp_length.
    input_DNA_left = Input((DNA_feature_bp_length, ), dtype = 'int32')
    input_DNA_right = Input((DNA_feature_bp_length, ), dtype = 'int32')

    # One embedding instance is applied to both sides, so its weights are shared.
    token_and_position_embedding_layer = TokenAndPositionEmbedding(maxlen = DNA_feature_bp_length, embed_dim = 4)
    x_left = token_and_position_embedding_layer(input_DNA_left)
    x_right = token_and_position_embedding_layer(input_DNA_right)

    # Shorten the sequence before attention.
    x_left = layers.AveragePooling1D(pool_size = 10)(x_left)
    x_right = layers.AveragePooling1D(pool_size = 10)(x_right)

    # NOTE(review): embed_dim = 1 is passed although the embedding above is
    # built with embed_dim = 4 -- TransformerEncoderBlock is defined elsewhere;
    # confirm how it uses embed_dim.
    for i, num_heads in enumerate(encoder_num_heads):
        transformer_encoder_block = TransformerEncoderBlock(embed_dim = 1, num_heads = num_heads, ff_dim = encoder_ff_dim[i])
        x_left = transformer_encoder_block(x_left)
        x_right = transformer_encoder_block(x_right)

    # Global average pooling already yields a (batch, features) tensor, so the
    # Flatten layers that used to follow it were no-ops and have been dropped.
    x_left = layers.GlobalAveragePooling1D()(x_left)
    x_right = layers.GlobalAveragePooling1D()(x_right)

    x = Concatenate()([x_left, x_right])
    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)
    output = Dense(1, activation = 'sigmoid', bias_initializer = output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model

In [None]:
# model = model_8(encoder_num_heads = [2], encoder_ff_dim = [8], hidden_layer_units = [64, 32], dropout = 0.3)
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

## Model 11 - CNN + Transformer Encoder for DNA

In [None]:
## Model 11

def model_11(filters, encoder_num_heads, encoder_ff_dim, hidden_layer_units, dropout = 0.0, output_bias=None):
    """Build Model 11: a CNN + Transformer-encoder classifier that uses the DNA inputs only.

    Both DNA token sequences go through one shared token-and-position
    embedding, a shared stack of strided Conv1D -> LeakyReLU stages
    (kernel size 8, stride 8), and a shared stack of transformer encoder
    blocks. Each side is reduced with global average pooling, the two vectors
    are concatenated, and a dense head ending in a single sigmoid unit
    produces the prediction. The two RNA inputs are declared and listed as
    model inputs, but nothing downstream consumes them.

    Args:
        filters: Sequence of filter counts, one per convolution stage.
        encoder_num_heads: Sequence with the number of attention heads for
            each encoder block; its length sets the number of blocks.
        encoder_ff_dim: Sequence with the ``ff_dim`` for each encoder block,
            indexed in step with ``encoder_num_heads``.
        hidden_layer_units: Sequence of unit counts for the ReLU dense layers.
        dropout: Rate of the Dropout layer added after each dense hidden
            layer. A falsy value (0.0 by default) adds no Dropout layers.
        output_bias: Optional value wrapped in a ``Constant`` initializer for
            the output layer's bias. When None, None is passed through as the
            ``bias_initializer``.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are
        ``[RNA_left, RNA_right, DNA_left, DNA_right]``.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    # Use the notebook-level constant rather than a literal so the input width
    # and the positional table stay in step with DNA_feature_bp_length.
    input_DNA_left = Input((DNA_feature_bp_length, ), dtype = 'int32')
    input_DNA_right = Input((DNA_feature_bp_length, ), dtype = 'int32')

    # One embedding instance is applied to both sides, so its weights are shared.
    token_and_position_embedding_layer = TokenAndPositionEmbedding(maxlen = DNA_feature_bp_length, embed_dim = 4)
    x_left = token_and_position_embedding_layer(input_DNA_left)
    x_right = token_and_position_embedding_layer(input_DNA_right)

    # Stride equal to the kernel size: each convolution also downsamples the
    # sequence, so no separate pooling layer is used here.
    for num_filters in filters:
        conv = Conv1D(num_filters, kernel_size = 8, strides = 8)
        leakyrelu = LeakyReLU(0.128)
        x_left = leakyrelu(conv(x_left))
        x_right = leakyrelu(conv(x_right))

    # NOTE(review): embed_dim = 1 is passed regardless of the channel count
    # produced by the convolutions above -- TransformerEncoderBlock is defined
    # elsewhere; confirm how it uses embed_dim.
    for i, num_heads in enumerate(encoder_num_heads):
        transformer_encoder_block = TransformerEncoderBlock(embed_dim = 1, num_heads = num_heads, ff_dim = encoder_ff_dim[i])
        x_left = transformer_encoder_block(x_left)
        x_right = transformer_encoder_block(x_right)

    # Global average pooling already yields a (batch, features) tensor, so the
    # Flatten layers that used to follow it were no-ops and have been dropped.
    x_left = layers.GlobalAveragePooling1D()(x_left)
    x_right = layers.GlobalAveragePooling1D()(x_right)

    x = Concatenate()([x_left, x_right])
    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)
    output = Dense(1, activation = 'sigmoid', bias_initializer = output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model

In [None]:
# model = model_11(filters = [64], encoder_num_heads = [2], encoder_ff_dim = [8], hidden_layer_units = [64, 32], dropout = 0.3)
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

# Combined

Models developed in this part are not included in the report.

## Model 6: RNA GE - LSTM + DNA - CNN

In [None]:
## Model 6

def model_6(hidden_layer_units, lstm_layer_units, DNA_hidden_layer_units, DNA_conv_filters, dropout = 0.0, output_bias=None):
    """Build Model 6: RNA branch (bidirectional LSTM) combined with DNA branch (CNN).

    RNA: the left and right RNA inputs go through one shared stack of
    bidirectional LSTMs; the last LSTM returns a single vector per side.

    DNA: the left and right token sequences go through one shared embedding
    and one shared stack of Conv1D -> LeakyReLU -> MaxPooling1D stages, are
    flattened, and are then reduced by a dense stack. The dense stacks are
    built separately for each side, so their weights are not shared.

    The four branch outputs are concatenated and fed to a dense head ending
    in a single sigmoid unit.

    Args:
        hidden_layer_units: Sequence of unit counts for the ReLU dense layers
            of the combined head.
        lstm_layer_units: Non-empty sequence of LSTM unit counts. Every entry
            but the last yields a sequence-returning layer.
        DNA_hidden_layer_units: Sequence of unit counts for the per-side DNA
            dense stacks.
        DNA_conv_filters: Sequence of filter counts, one per DNA convolution
            stage.
        dropout: Rate of the Dropout layer added after every dense hidden
            layer (DNA stacks and combined head). A falsy value (0.0 by
            default) adds no Dropout layers.
        output_bias: Optional value wrapped in a ``Constant`` initializer for
            the output layer's bias. When None, None is passed through as the
            ``bias_initializer``.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are
        ``[RNA_left, RNA_right, DNA_left, DNA_right]``.

    Raises:
        IndexError: If ``lstm_layer_units`` is empty.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    # Use the notebook-level constant rather than a literal so the input width
    # follows DNA_feature_bp_length if it is ever adjusted.
    input_DNA_left = Input((DNA_feature_bp_length, ), dtype = 'int32')
    input_DNA_right = Input((DNA_feature_bp_length, ), dtype = 'int32')

    # RNA
    x_RNA_left = input_RNA_left
    x_RNA_right = input_RNA_right

    # Intermediate LSTMs keep the time axis so the next LSTM can consume it.
    for units in lstm_layer_units[:-1]:
        lstm = Bidirectional(LSTM(units, return_sequences = True))
        x_RNA_left = lstm(x_RNA_left)
        x_RNA_right = lstm(x_RNA_right)

    # The last LSTM collapses each sequence into a single vector.
    lstm = Bidirectional(LSTM(lstm_layer_units[-1], return_sequences = False))
    x_RNA_left = lstm(x_RNA_left)
    x_RNA_right = lstm(x_RNA_right)

    # DNA
    # One embedding instance is applied to both sides, so its weights are shared.
    embedding_layer = DNA_embedding_layer()
    x_DNA_left = embedding_layer(input_DNA_left)
    x_DNA_right = embedding_layer(input_DNA_right)

    # Each stage's layers are instantiated once and applied to both sides.
    for num_filters in DNA_conv_filters:
        conv = Conv1D(num_filters, kernel_size = 8)
        leakyrelu = LeakyReLU(0.128)
        maxpool = MaxPooling1D(pool_size = 8, strides = 4)
        x_DNA_left = maxpool(leakyrelu(conv(x_DNA_left)))
        x_DNA_right = maxpool(leakyrelu(conv(x_DNA_right)))

    x_DNA_left = Flatten()(x_DNA_left)
    x_DNA_right = Flatten()(x_DNA_right)

    # Build an independent dense stack per side (left first, then right);
    # new Dense layers are created for each side, so no weights are shared.
    dna_branches = []
    for branch in (x_DNA_left, x_DNA_right):
        for units in DNA_hidden_layer_units:
            branch = Dense(units = units, activation = "relu")(branch)
            if dropout:
                branch = Dropout(rate = dropout)(branch)
        dna_branches.append(branch)
    x_DNA_left, x_DNA_right = dna_branches

    x = Concatenate()([x_RNA_left, x_RNA_right, x_DNA_left, x_DNA_right])
    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)
    output = Dense(1, activation = 'sigmoid', bias_initializer = output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model

In [None]:
# model = model_6(
#     hidden_layer_units = [256, 64, 32], 
#     lstm_layer_units = [8, 8], 
#     DNA_hidden_layer_units = [256, 64, 16], 
#     DNA_conv_filters = [64, 128, 64, 32],
#     dropout = 0.3,
# )
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")
# # tf.keras.utils.plot_model(model, show_shapes=True)


## Model 9: RNA GE - FNN + DNA - CNN

In [None]:
## Model 9

def model_9(hidden_layer_units, DNA_hidden_layer_units, DNA_conv_filters, dropout = 0.0, output_bias=None):
    """Build Model 9: RNA branch (flattened, feed-forward) combined with DNA branch (CNN).

    RNA: the left and right RNA inputs are only flattened before joining the
    combined head.

    DNA: the left and right token sequences go through one shared embedding
    and one shared stack of Conv1D -> LeakyReLU -> MaxPooling1D stages, are
    flattened, and are then reduced by a dense stack. The dense stacks are
    built separately for each side, so their weights are not shared.

    The four branch outputs are concatenated and fed to a dense head ending
    in a single sigmoid unit.

    Args:
        hidden_layer_units: Sequence of unit counts for the ReLU dense layers
            of the combined head.
        DNA_hidden_layer_units: Sequence of unit counts for the per-side DNA
            dense stacks.
        DNA_conv_filters: Sequence of filter counts, one per DNA convolution
            stage.
        dropout: Rate of the Dropout layer added after every dense hidden
            layer (DNA stacks and combined head). A falsy value (0.0 by
            default) adds no Dropout layers.
        output_bias: Optional value wrapped in a ``Constant`` initializer for
            the output layer's bias. When None, None is passed through as the
            ``bias_initializer``.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are
        ``[RNA_left, RNA_right, DNA_left, DNA_right]``.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    # Use the notebook-level constant rather than a literal so the input width
    # follows DNA_feature_bp_length if it is ever adjusted.
    input_DNA_left = Input((DNA_feature_bp_length, ), dtype = 'int32')
    input_DNA_right = Input((DNA_feature_bp_length, ), dtype = 'int32')

    # RNA
    x_RNA_left = Flatten()(input_RNA_left)
    x_RNA_right = Flatten()(input_RNA_right)

    # DNA
    # One embedding instance is applied to both sides, so its weights are shared.
    embedding_layer = DNA_embedding_layer()
    x_DNA_left = embedding_layer(input_DNA_left)
    x_DNA_right = embedding_layer(input_DNA_right)

    # Each stage's layers are instantiated once and applied to both sides.
    for num_filters in DNA_conv_filters:
        conv = Conv1D(num_filters, kernel_size = 8)
        leakyrelu = LeakyReLU(0.128)
        maxpool = MaxPooling1D(pool_size = 8, strides = 4)
        x_DNA_left = maxpool(leakyrelu(conv(x_DNA_left)))
        x_DNA_right = maxpool(leakyrelu(conv(x_DNA_right)))

    x_DNA_left = Flatten()(x_DNA_left)
    x_DNA_right = Flatten()(x_DNA_right)

    # Build an independent dense stack per side (left first, then right);
    # new Dense layers are created for each side, so no weights are shared.
    dna_branches = []
    for branch in (x_DNA_left, x_DNA_right):
        for units in DNA_hidden_layer_units:
            branch = Dense(units = units, activation = "relu")(branch)
            if dropout:
                branch = Dropout(rate = dropout)(branch)
        dna_branches.append(branch)
    x_DNA_left, x_DNA_right = dna_branches

    x = Concatenate()([x_RNA_left, x_RNA_right, x_DNA_left, x_DNA_right])
    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)
    output = Dense(1, activation = 'sigmoid', bias_initializer = output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model

In [None]:
# model = model_9(
#     hidden_layer_units = [256, 64, 32], 
#     DNA_hidden_layer_units = [256, 64, 16], 
#     DNA_conv_filters = [64, 128, 64, 32],
#     dropout = 0.3,
# )
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")
# # tf.keras.utils.plot_model(model, show_shapes=True)


## Model 12: RNA GE - LSTM, Encoder + DNA - CNN

In [None]:
## Model 12

def model_12(hidden_layer_units, lstm_layer_units, encoder_num_heads, encoder_ff_dim, DNA_hidden_layer_units, DNA_conv_filters, dropout = 0.0, output_bias=None):
    """Build Model 12: RNA branch (LSTM + Transformer encoder) combined with DNA branch (CNN).

    RNA: the left and right RNA inputs go through a shared stack of
    sequence-returning bidirectional LSTMs (one per entry of
    ``lstm_layer_units`` except the last), a shared position embedding, a
    shared stack of transformer encoder blocks, and a final shared
    bidirectional LSTM that returns a single vector per side.

    DNA: the left and right token sequences go through one shared embedding
    and one shared stack of Conv1D -> LeakyReLU -> MaxPooling1D stages, are
    flattened, and are then reduced by a dense stack. The dense stacks are
    built separately for each side, so their weights are not shared.

    The four branch outputs are concatenated and fed to a dense head ending
    in a single sigmoid unit.

    Args:
        hidden_layer_units: Sequence of unit counts for the ReLU dense layers
            of the combined head.
        lstm_layer_units: Non-empty sequence of LSTM unit counts (see the
            review notes in the body for how each entry is used).
        encoder_num_heads: Sequence with the number of attention heads for
            each encoder block; its length sets the number of blocks.
        encoder_ff_dim: Sequence with the ``ff_dim`` for each encoder block,
            indexed in step with ``encoder_num_heads``.
        DNA_hidden_layer_units: Sequence of unit counts for the per-side DNA
            dense stacks.
        DNA_conv_filters: Sequence of filter counts, one per DNA convolution
            stage.
        dropout: Rate of the Dropout layer added after every dense hidden
            layer (DNA stacks and combined head). A falsy value (0.0 by
            default) adds no Dropout layers.
        output_bias: Optional value wrapped in a ``Constant`` initializer for
            the output layer's bias. When None, None is passed through as the
            ``bias_initializer``.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are
        ``[RNA_left, RNA_right, DNA_left, DNA_right]``.

    Raises:
        IndexError: If ``lstm_layer_units`` is empty.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    # Use the notebook-level constant rather than a literal so the input width
    # follows DNA_feature_bp_length if it is ever adjusted.
    input_DNA_left = Input((DNA_feature_bp_length, ), dtype = 'int32')
    input_DNA_right = Input((DNA_feature_bp_length, ), dtype = 'int32')

    # RNA
    x_RNA_left = input_RNA_left
    x_RNA_right = input_RNA_right

    # Sequence-returning LSTMs for every entry except the last one.
    for units in lstm_layer_units[:-1]:
        lstm = Bidirectional(LSTM(units, return_sequences = True))
        x_RNA_left = lstm(x_RNA_left)
        x_RNA_right = lstm(x_RNA_right)

    # NOTE(review): embed_dim is derived from lstm_layer_units[-1] * 2, but the
    # tensor entering this layer comes from the LSTM built with
    # lstm_layer_units[-2] (or is the raw 1-channel input when only one unit
    # count is given). The two widths agree only when the last two entries are
    # equal -- confirm against PositionEmbedding, which is defined elsewhere.
    position_embedding_layer = PositionEmbedding(maxlen = RNA_GE_feature_length, embed_dim = lstm_layer_units[-1] * 2)
    x_RNA_left = position_embedding_layer(x_RNA_left)
    x_RNA_right = position_embedding_layer(x_RNA_right)

    # NOTE(review): embed_dim = 1 is passed regardless of the channel count of
    # the incoming tensor -- TransformerEncoderBlock is defined elsewhere;
    # confirm how it uses embed_dim.
    for i, num_heads in enumerate(encoder_num_heads):
        transformer_encoder_block = TransformerEncoderBlock(embed_dim = 1, num_heads = num_heads, ff_dim = encoder_ff_dim[i])
        x_RNA_left = transformer_encoder_block(x_RNA_left)
        x_RNA_right = transformer_encoder_block(x_RNA_right)

    # NOTE(review): the closing LSTM is sized by lstm_layer_units[0], whereas
    # model_6 sizes its closing LSTM by lstm_layer_units[-1]; confirm this
    # difference is intended.
    lstm = Bidirectional(LSTM(lstm_layer_units[0], return_sequences = False))
    x_RNA_left = lstm(x_RNA_left)
    x_RNA_right = lstm(x_RNA_right)

    # DNA
    # One embedding instance is applied to both sides, so its weights are shared.
    embedding_layer = DNA_embedding_layer()
    x_DNA_left = embedding_layer(input_DNA_left)
    x_DNA_right = embedding_layer(input_DNA_right)

    # Each stage's layers are instantiated once and applied to both sides.
    for num_filters in DNA_conv_filters:
        conv = Conv1D(num_filters, kernel_size = 8)
        leakyrelu = LeakyReLU(0.128)
        maxpool = MaxPooling1D(pool_size = 8, strides = 4)
        x_DNA_left = maxpool(leakyrelu(conv(x_DNA_left)))
        x_DNA_right = maxpool(leakyrelu(conv(x_DNA_right)))

    x_DNA_left = Flatten()(x_DNA_left)
    x_DNA_right = Flatten()(x_DNA_right)

    # Build an independent dense stack per side (left first, then right);
    # new Dense layers are created for each side, so no weights are shared.
    dna_branches = []
    for branch in (x_DNA_left, x_DNA_right):
        for units in DNA_hidden_layer_units:
            branch = Dense(units = units, activation = "relu")(branch)
            if dropout:
                branch = Dropout(rate = dropout)(branch)
        dna_branches.append(branch)
    x_DNA_left, x_DNA_right = dna_branches

    x = Concatenate()([x_RNA_left, x_RNA_right, x_DNA_left, x_DNA_right])
    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)
    output = Dense(1, activation = 'sigmoid', bias_initializer = output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model

In [None]:
# model = model_12(
#     lstm_layer_units = [8], 
#     encoder_num_heads = [2],
#     encoder_ff_dim = [8],
    
#     DNA_hidden_layer_units = [256, 64, 16], 
#     DNA_conv_filters = [64, 128, 64, 32],
    
#     hidden_layer_units = [256, 64, 32], 
#     dropout = 0.3,
# )
# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")
# # tf.keras.utils.plot_model(model, show_shapes=True)


## Model 16 - RNA (CNN+Encoder) + DNA (CNN)

In [None]:
## Model 16
def model_16(filters, encoder_num_heads, encoder_ff_dim, DNA_conv_filters, hidden_layer_units, dropout = 0.0, recurrent_dropout = 0.0, output_bias = None):
    """Build Model 16: RNA branch (CNN + Transformer encoder) combined with DNA branch (CNN).

    RNA: the left and right RNA inputs go through a shared stack of
    Conv1D (kernel size 4) -> LeakyReLU -> MaxPooling1D (pool size 2) stages,
    a shared position embedding sized from the convolution output, and a
    shared stack of transformer encoder blocks; each side is then flattened.

    DNA: the left and right token sequences go through one shared embedding
    and one shared stack of Conv1D (kernel size 8) -> LeakyReLU ->
    MaxPooling1D (pool size 8, stride 4) stages; each side is then flattened.

    The four flattened branches are concatenated and fed to a dense head
    ending in a single sigmoid unit.

    Args:
        filters: Sequence of filter counts, one per RNA convolution stage.
        encoder_num_heads: Sequence with the number of attention heads for
            each encoder block; its length sets the number of blocks.
        encoder_ff_dim: Sequence with the ``ff_dim`` for each encoder block,
            indexed in step with ``encoder_num_heads``.
        DNA_conv_filters: Sequence of filter counts, one per DNA convolution
            stage.
        hidden_layer_units: Sequence of unit counts for the ReLU dense layers
            of the combined head.
        dropout: Rate of the Dropout layer added after each dense hidden
            layer. A falsy value (0.0 by default) adds no Dropout layers.
        recurrent_dropout: Unused; this model has no recurrent layers. It is
            kept so that existing callers passing it keep working.
        output_bias: Optional value wrapped in a ``Constant`` initializer for
            the output layer's bias. When None, None is passed through as the
            ``bias_initializer``.

    Returns:
        An uncompiled ``tf.keras.Model`` whose inputs are
        ``[RNA_left, RNA_right, DNA_left, DNA_right]``.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    # Inputs
    input_RNA_left = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_RNA_right = Input((RNA_GE_feature_length, 1), dtype = 'float64')
    input_DNA_left = Input((DNA_feature_bp_length, ), dtype = 'int32')
    input_DNA_right = Input((DNA_feature_bp_length, ), dtype = 'int32')

    # RNA
    x_RNA_left = input_RNA_left
    x_RNA_right = input_RNA_right

    # Each stage's layers are instantiated once and applied to both sides.
    # (A stride-1 pooling layer used to be created here and was overwritten
    # before use; only the pooling layer that was actually applied remains.)
    for num_filters in filters:
        conv = Conv1D(num_filters, kernel_size = 4)
        leakyrelu = LeakyReLU(0.128)
        maxpool = MaxPooling1D(pool_size = 2)
        x_RNA_left = maxpool(leakyrelu(conv(x_RNA_left)))
        x_RNA_right = maxpool(leakyrelu(conv(x_RNA_right)))

    # Size the position embedding from the tensor it will be applied to.
    position_embedding_layer = PositionEmbedding(maxlen = x_RNA_right.shape[1], embed_dim = x_RNA_right.shape[2])
    x_RNA_left = position_embedding_layer(x_RNA_left)
    x_RNA_right = position_embedding_layer(x_RNA_right)

    # NOTE(review): embed_dim = 1 is passed regardless of the channel count of
    # the incoming tensor -- TransformerEncoderBlock is defined elsewhere;
    # confirm how it uses embed_dim.
    for i, num_heads in enumerate(encoder_num_heads):
        transformer_encoder_block = TransformerEncoderBlock(embed_dim = 1, num_heads = num_heads, ff_dim = encoder_ff_dim[i])
        x_RNA_left = transformer_encoder_block(x_RNA_left)
        x_RNA_right = transformer_encoder_block(x_RNA_right)

    x_RNA_left = Flatten()(x_RNA_left)
    x_RNA_right = Flatten()(x_RNA_right)

    # DNA
    # One embedding instance is applied to both sides, so its weights are shared.
    embedding_layer = DNA_embedding_layer()
    x_DNA_left = embedding_layer(input_DNA_left)
    x_DNA_right = embedding_layer(input_DNA_right)

    for num_filters in DNA_conv_filters:
        conv = Conv1D(num_filters, kernel_size = 8)
        leakyrelu = LeakyReLU(0.128)
        maxpool = MaxPooling1D(pool_size = 8, strides = 4)
        x_DNA_left = maxpool(leakyrelu(conv(x_DNA_left)))
        x_DNA_right = maxpool(leakyrelu(conv(x_DNA_right)))

    x_DNA_left = Flatten()(x_DNA_left)
    x_DNA_right = Flatten()(x_DNA_right)

    x = Concatenate()([x_RNA_left, x_RNA_right, x_DNA_left, x_DNA_right])

    for units in hidden_layer_units:
        x = Dense(units = units, activation = "relu")(x)
        if dropout:
            x = Dropout(rate = dropout)(x)

    output = Dense(1, activation = 'sigmoid', bias_initializer = output_bias)(x)

    model = tf.keras.Model(inputs=[input_RNA_left, input_RNA_right, input_DNA_left, input_DNA_right], outputs=output)

    return model

In [None]:
# model = model_16(
#     filters = [16, 32],
#     encoder_num_heads = [2],
#     encoder_ff_dim = [8],
    
#     DNA_conv_filters = [64, 128, 64, 32],
    
#     hidden_layer_units = [1024, 512, 128, 32],
#     dropout = 0.3,
# )

# model.summary()
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")
# # tf.keras.utils.plot_model(model, show_shapes=True)

# Code Running

In [None]:
models_to_train = {
#     "model_1_1_layer_FFN":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [256],
#         }
#     },
#     "model_1_1_layer_FFN_dropout_0.3":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [32],
#             "dropout" : 0.3,
#         }
#     },
#     "model_1_1_layer_FFN_dropout_0.5":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [32],
#             "dropout" : 0.5,
#         }
#     },
#     "model_1_1_layer_FFN_dropout_0.7":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [32],
#             "dropout" : 0.7,
#         }
#     },
#     "model_1_2_layer_FFN":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [256, 128],
#         }
#     },
#     "model_1_2_layer_FFN_dropout_0.3":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [64, 32],
#             "dropout" : 0.3
#         }
#     },
#     "model_1_2_layer_FFN_dropout_0.5":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [64, 32],
#             "dropout" : 0.5
#         }
#     },
#     "model_1_2_layer_FFN_dropout_0.7":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [64, 32],
#             "dropout" : 0.7
#         }
#     },
#     "model_1_3_layer_FFN":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [256, 128, 64],
#         }
#     },
#     "model_1_3_layer_FFN_dropout_0.3":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [128, 64, 32],
#             "dropout" : 0.3
#         }
#     },
#     "model_1_3_layer_FFN_dropout_0.5":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [128, 64, 32],
#             "dropout" : 0.5
#         }
#     },
#     "model_1_3_layer_FFN_dropout_0.7":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [128, 64, 32],
#             "dropout" : 0.7
#         }
#     },
#     "model_1_4_layer_FFN":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [256, 512, 128, 32],
#         }
#     },
#     "model_1_4_layer_FFN_dropout_0.3":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [256, 512, 128, 32],
#             "dropout": 0.3
#         }
#     },
#     "model_1_4_layer_FFN_dropout_0.5":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [256, 512, 128, 32],
#             "dropout": 0.5
#         }
#     },
#     "model_1_4_layer_FFN_dropout_0.7":{
#         "function" : model_1,
#         "arguments": {
#             "hidden_layer_units" : [256, 512, 128, 32],
#             "dropout": 0.7
#         }
#     },
    
#     "model_2_1_layer_8_LSTM_4_layer_FFN":{
#         "function" : model_2,
#         "arguments": {
#             "lstm_layer_units": [8],
#             "hidden_layer_units" : [64, 128, 64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_2_1_layer_16_LSTM_4_layer_FFN":{
#         "function" : model_2,
#         "arguments": {
#             "lstm_layer_units": [16],
#             "hidden_layer_units" : [64, 128, 64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_2_2_layer_8_LSTM_4_layer_FFN":{
#         "function" : model_2,
#         "arguments": {
#             "lstm_layer_units": [8, 8],
#             "hidden_layer_units" : [64, 128, 64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_2_2_layer_16_LSTM_4_layer_FFN":{
#         "function" : model_2,
#         "arguments": {
#             "lstm_layer_units": [16, 16],
#             "hidden_layer_units" : [64, 128, 64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_2_2_layer_16_LSTM_2_layer_FFN":{
#         "function" : model_2,
#         "arguments": {
#             "lstm_layer_units": [16, 16],
#             "hidden_layer_units" : [64, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_3_1_layer_CNN_2_layer_FFN":{
#         "function" : model_3,
#         "arguments": {
#             "filters": [16],
#             "hidden_layer_units" : [128, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_3_2_layer_CNN_2_layer_FFN":{
#         "function" : model_3,
#         "arguments": {
#             "filters": [8, 16],
#             "hidden_layer_units" : [128, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_3_2_layer_16_32_CNN_2_layer_FFN":{
#         "function" : model_3,
#         "arguments": {
#             "filters": [16, 32],
#             "hidden_layer_units" : [128, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_3_2_layer_64_128_CNN_4_layer_FFN":{
#         "function" : model_3,
#         "arguments": {
#             "filters": [64, 128],
#             "hidden_layer_units" : [2048, 1024, 256, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_3_2_layer_32_64_CNN_4_layer_FFN":{
#         "function" : model_3,
#         "arguments": {
#             "filters": [32, 64],
#             "hidden_layer_units" : [1024, 512, 128, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_3_2_layer_16_32_CNN_4_layer_FFN":{
#         "function" : model_3,
#         "arguments": {
#             "filters": [16, 32],
#             "hidden_layer_units" : [1024, 512, 128, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_7_1_layer_transformer_encoder_2_layer_FFN":{
#         "function" : model_7,
#         "arguments": {
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_7_2_layer_transformer_encoder_2_layer_FFN":{
#         "function" : model_7,
#         "arguments": {
#             "encoder_num_heads" : [2, 2],
#             "encoder_ff_dim" : [8, 8],
#             "hidden_layer_units" : [64, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_10_1_layer_8_LSTM_1_layer_transformer_encoder_3_layer_FFN":{
#         "function" : model_10,
#         "arguments": {
#             "lstm_layer_units": [8],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [512, 256, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_10_1_layer_16_LSTM_1_layer_transformer_encoder_3_layer_FFN":{
#         "function" : model_10,
#         "arguments": {
#             "lstm_layer_units": [16],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [512, 256, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_10_2_layer_8_LSTM_1_layer_transformer_encoder_3_layer_FFN":{
#         "function" : model_10,
#         "arguments": {
#             "lstm_layer_units": [8, 8],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [512, 256, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_10_2_layer_16_LSTM_1_layer_transformer_encoder_3_layer_FFN":{
#         "function" : model_10,
#         "arguments": {
#             "lstm_layer_units": [16, 16],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [512, 256, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_10_1_layer_8_LSTM_1_layer_transformer_encoder_1_layer_8_LSTM_3_layer_FFN":{
#         "function" : model_10,
#         "arguments": {
#             "lstm_layer_units": [8],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [512, 256, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_10_1_layer_16_LSTM_1_layer_transformer_encoder_1_layer_16_LSTM_3_layer_FFN":{
#         "function" : model_10,
#         "arguments": {
#             "lstm_layer_units": [16],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [512, 256, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_13":{
#         "function" : model_13,
#         "arguments": {
#             "lstm_layer_units": [16, 16],
#             "encoder_num_heads" : [2],
#             "hidden_layer_units" : [2048, 512, 128, 32],
#             "dropout": 0.3,
#         }
#     },

#     "model_14_2_layer_64_128_CNN_SA_3_layer_FFN":{
#         "function" : model_14,
#         "arguments": {
#             "filters": [64, 128],
#             "encoder_num_heads": [2],
#             "hidden_layer_units" : [256, 64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_14_2_layer_32_64_CNN_SA_3_layer_FFN":{
#         "function" : model_14,
#         "arguments": {
#             "filters": [32, 64],
#             "encoder_num_heads": [2],
#             "hidden_layer_units" : [256, 64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_14_2_layer_16_32_CNN_SA_3_layer_FFN":{
#         "function" : model_14,
#         "arguments": {
#             "filters": [16, 32],
#             "encoder_num_heads": [2],
#             "hidden_layer_units" : [256, 64, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_15_2_layer_64_128_CNN_1_layer_transformer_encoder_4_layer_FFN":{
#         "function" : model_15,
#         "arguments": {
#             "filters": [64, 128],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [1024, 512, 128, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_15_2_layer_64_128_CNN_1_layer_transformer_encoder_8heads_4_layer_FFN":{
#         "function" : model_15,
#         "arguments": {
#             "filters": [64, 128],
#             "encoder_num_heads" : [8],
#             "encoder_ff_dim" : [16],
#             "hidden_layer_units" : [1024, 512, 128, 32],
#             "dropout": 0.3,
#         }
#     },
    
#         "model_15_2_layer_64_128_CNN_2_layer_transformer_encoder_4_layer_FFN":{
#         "function" : model_15,
#         "arguments": {
#             "filters": [64, 128],
#             "encoder_num_heads" : [2, 2],
#             "encoder_ff_dim" : [8, 8],
#             "hidden_layer_units" : [1024, 512, 128, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_15_2_layer_32_64_CNN_1_layer_transformer_encoder_4_layer_FFN":{
#         "function" : model_15,
#         "arguments": {
#             "filters": [32, 64],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [1024, 512, 128, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_15_2_layer_16_32_CNN_1_layer_transformer_encoder_4_layer_FFN":{
#         "function" : model_15,
#         "arguments": {
#             "filters": [16, 32],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [1024, 512, 128, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_17":{
#         "function": model_17,
#         "arguments":{
#             "filters": [64, 128], 
#             "lstm_layer_units": [16, 16], 
#             "encoder_num_heads": [2], 
#             "encoder_ff_dim": [8],
#             "hidden_layer_units": [1024, 512, 128, 32],
#             "dropout": 0.3,
#         }
#     }
    
#     "model_5_4_layer_CNN_3_layer_FFN":{
#         "function" : model_5,
#         "arguments": {
#             "filters" : [64, 128, 64, 32],
#             "hidden_layer_units" : [512, 256, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_4_1_layer_LSTM_2_layer_FFN":{
#         "function" : model_4,
#         "arguments": {
#             "lstm_layer_units": [32],
#             "hidden_layer_units" : [64, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_8_1_layer_transformer_encoder_2_layer_FFN":{
#         "function" : model_8,
#         "arguments": {
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_8_2_layer_transformer_encoder_2_layer_FFN":{
#         "function" : model_8,
#         "arguments": {
#             "encoder_num_heads" : [2, 2],
#             "encoder_ff_dim" : [8, 8],
#             "hidden_layer_units" : [64, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_11_1_layer_CNN_1_layer_transformer_encoder_2_layer_FFN":{
#         "function" : model_11,
#         "arguments": {
#             "filters": [64],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "hidden_layer_units" : [64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_11_1_layer_CNN_layer_transformer_encoder_2_layer_FFN":{
#         "function" : model_11,
#         "arguments": {
#             "filters": [64],
#             "encoder_num_heads" : [2, 2],
#             "encoder_ff_dim" : [8, 8],
#             "hidden_layer_units" : [64, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_6_RNA_1_layer_LSTM_DNA_4_layer_CNN_16d_3_layer_FFN":{
#         "function" : model_6,
#         "arguments": {
#             "lstm_layer_units": [8],
#             "DNA_hidden_layer_units" : [256, 64, 16],
#             "DNA_conv_filters" : [64, 128, 64, 32],
#             "hidden_layer_units" : [256, 64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_6_RNA_2_layer_LSTM_DNA_4_layer_CNN_16d_3_layer_FFN":{
#         "function" : model_6,
#         "arguments": {
#             "lstm_layer_units": [8, 8],
#             "DNA_hidden_layer_units" : [256, 64, 16],
#             "DNA_conv_filters" : [64, 128, 64, 32],
#             "hidden_layer_units" : [256, 64, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_9_RNA_DNA_4_layer_CNN_8d_3_layer_FFN":{
#         "function" : model_9,
#         "arguments": {
#             "DNA_hidden_layer_units" : [256, 64, 8],
#             "DNA_conv_filters" : [64, 128, 64, 32],
#             "hidden_layer_units" : [256, 64, 32],
#             "dropout": 0.3,
#         }
#     },
#     "model_9_RNA_DNA_4_layer_CNN_16d_3_layer_FFN":{
#         "function" : model_9,
#         "arguments": {
#             "DNA_hidden_layer_units" : [256, 64, 16],
#             "DNA_conv_filters" : [64, 128, 64, 32],
#             "hidden_layer_units" : [256, 64, 32],
#             "dropout": 0.3,
#         }
#     },
#    "model_9_RNA_DNA_4_layer_CNN_32d_3_layer_FFN":{
#         "function" : model_9,
#         "arguments": {
#             "DNA_hidden_layer_units" : [256, 64, 32],
#             "DNA_conv_filters" : [64, 128, 64, 32],
#             "hidden_layer_units" : [256, 64, 32],
#             "dropout": 0.3,
#         }
#     },
    
#     "model_12_RNA_1_layer_8_LSTM_1_layer_transformer_encoder_1_layer_8_LSTM_DNA_4_layer_CNN_16d_3_layer_FFN":{
#         "function" : model_12,
#         "arguments": {
#             "lstm_layer_units": [8], 
#             "encoder_num_heads": [2],
#             "encoder_ff_dim": [8],

#             "DNA_hidden_layer_units": [256, 64, 16], 
#             "DNA_conv_filters": [64, 128, 64, 32],

#             "hidden_layer_units": [256, 64, 32], 
#             "dropout": 0.3,
#         }
#     },
    
#     "model_16_RNA_2_layer_64_128_CNN_1_layer_transformer_encoder_DNA_4_layer_CNN_4_layer_FFN":{
#         "function" : model_16,
#         "arguments": {
#             "filters": [64, 128],
#             "encoder_num_heads" : [2],
#             "encoder_ff_dim" : [8],
#             "DNA_conv_filters": [64, 128, 64, 32],
#             "hidden_layer_units" : [1024, 512, 128, 32],
#             "dropout": 0.3,
#         }
#     },
}


In [None]:
# Echo the experiment configuration as the cell output, so the notebook
# shows which model entries are currently enabled (not commented out).
models_to_train

In [None]:
# Run every configuration enabled in `models_to_train`.
# NOTE(review): `repeat=10` presumably runs each configuration ten times --
# confirm against the definition of `run_experiment_on_model`.
run_experiment_on_model(models_to_train, repeat=10)

# Gather the results of those runs into a single table.
result_df = consolidate_result(models_to_train)

# Save a copy under a timestamped name so successive runs do not overwrite
# each other; strftime without an explicit time tuple uses the local time.
run_stamp = time.strftime("%Y%m%d_%H%M%S")
output_csv_path = "/kaggle/working/output_" + run_stamp + ".csv"
result_df.to_csv(output_csv_path)

# Leave the table as the last expression so the notebook displays it.
result_df