In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.utils import resample

from google.colab import drive
drive.mount('/content/drive')

from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.models import Sequential, Model

def DNA_embedding_layer():
    """Build a frozen Keras embedding layer with a fixed 5 x 4 lookup table.

    Rows 0-3 of the table are the four one-hot vectors and row 4 is the
    uniform vector ``[0.25, 0.25, 0.25, 0.25]``. This lines up with the index
    scheme produced by ``DNA_sequence_to_indices`` in this notebook
    (A=0, C=1, G=2, T=3, anything else=4).

    Returns:
        A built ``layers.Embedding`` (input dim 5, output dim 4) created with
        ``trainable=False`` and whose weights are set to the table above.
    """
    vocab_size = 5
    emb_dim = 4

    # Fixed lookup table: one-hot rows for indices 0-3, uniform row for index 4.
    # A plain ndarray is used instead of np.matrix, which NumPy no longer recommends.
    emb_matrix = np.array(
        [
            [1, 0, 0, 0],
            [0, 1, 0, 0],
            [0, 0, 1, 0],
            [0, 0, 0, 1],
            [0.25, 0.25, 0.25, 0.25],
        ]
    )

    # Define Keras embedding layer with the correct input and output sizes
    embedding_layer = layers.Embedding(vocab_size, emb_dim, trainable = False)

    # Build the embedding layer, it is required before setting the weights of the embedding layer.
    embedding_layer.build((None,))

    # Set the weights of the embedding layer to the fixed lookup table.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

# https://keras.io/examples/nlp/text_classification_with_transformer/
@keras.saving.register_keras_serializable(package="MyLayers", name = "TransformerEncoderBlock")
class TransformerEncoderBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition and
    layer normalization.

    Args:
        embed_dim: Passed as ``key_dim`` to the attention layer and used as the
            output width of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) layer of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Standard ``Layer`` arguments (e.g. ``name``, ``dtype``,
            ``trainable``), forwarded to the base class so that configs
            produced by ``get_config`` can be passed back to the constructor.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.embed_dim)
        self.ffn = Sequential(
            [layers.Dense(self.ff_dim, activation="relu"), layers.Dense(self.embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(self.rate)
        self.dropout2 = layers.Dropout(self.rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs`` and return a tensor from the final layer norm.

        ``training`` is forwarded to both dropout layers; it defaults to
        ``None`` so the layer can also be called without the argument.
        """
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments merged into the base layer config."""
        # Start from the base config so the base-layer settings survive a save/load round trip.
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate
        })
        return config


@keras.saving.register_keras_serializable(package="MyLayers", name = "PositionEmbedding")
class PositionEmbedding(layers.Layer):
    """Add a learned position embedding to an input tensor.

    Args:
        maxlen: Size of the position table (``input_dim`` of the embedding).
        embed_dim: Width of each position vector (``output_dim`` of the
            embedding).
        **kwargs: Standard ``Layer`` arguments (e.g. ``name``, ``dtype``,
            ``trainable``), forwarded to the base class so that configs
            produced by ``get_config`` can be passed back to the constructor.
    """

    def __init__(self, maxlen, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.maxlen = maxlen
        self.embed_dim = embed_dim
        self.pos_emb = layers.Embedding(input_dim=self.maxlen, output_dim=self.embed_dim)

    def call(self, x):
        """Return ``x`` plus the embeddings of positions ``0 .. len-1``.

        The length is read from the second-to-last axis of ``x``.
        """
        # NOTE(review): assumes the second-to-last axis is the sequence axis and
        # that its size does not exceed self.maxlen - confirm against callers.
        maxlen = tf.shape(x)[-2]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        return x + positions

    def get_config(self):
        """Return the constructor arguments merged into the base layer config."""
        # Start from the base config so the base-layer settings survive a save/load round trip.
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "embed_dim": self.embed_dim
        })
        return config

@keras.saving.register_keras_serializable(package="MyLayers", name = "TokenAndPositionEmbedding")
class TokenAndPositionEmbedding(layers.Layer):
    """Embed DNA base indices and add a learned position embedding.

    Tokens are embedded with the frozen layer returned by
    ``DNA_embedding_layer()``; positions use a separate ``layers.Embedding``.

    Args:
        maxlen: Size of the position table (``input_dim`` of the position
            embedding).
        embed_dim: Width of each position vector. The token embedding built by
            ``DNA_embedding_layer()`` has width 4, so the addition in ``call``
            relies on ``embed_dim`` being broadcast-compatible with 4.
        **kwargs: Standard ``Layer`` arguments (e.g. ``name``, ``dtype``,
            ``trainable``), forwarded to the base class so that configs
            produced by ``get_config`` can be passed back to the constructor.
    """

    def __init__(self, maxlen, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_emb = DNA_embedding_layer()
        self.maxlen = maxlen
        self.embed_dim = embed_dim
        self.pos_emb = layers.Embedding(input_dim=self.maxlen, output_dim=self.embed_dim)

    def call(self, x):
        """Return token embeddings of ``x`` plus embeddings of positions ``0 .. len-1``.

        The length is read from the last axis of ``x`` (the index tensor).
        """
        # NOTE(review): assumes the last axis of x is the sequence axis and that
        # its size does not exceed self.maxlen - confirm against callers.
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments merged into the base layer config."""
        # Start from the base config so the base-layer settings survive a save/load round trip.
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "embed_dim": self.embed_dim
        })
        return config


Mounted at /content/drive


In [None]:
# --- Feature-window configuration -------------------------------------------
RNA_GE_window_size = 200
RNA_GE_bp_length = 20000    # adjust
# Number of feature columns selected per anchor (bp length divided by window size).
RNA_GE_feature_length = RNA_GE_bp_length // RNA_GE_window_size

DNA_feature_bp_length = 4000

# Columns are selected symmetrically around column index 100.
_first_bin = 100 - RNA_GE_feature_length // 2
_end_bin = 100 + RNA_GE_feature_length // 2   # exclusive upper bound
RNA_GE_left_columns_to_include = [f"L{bin_idx}" for bin_idx in range(_first_bin, _end_bin)]
RNA_GE_right_columns_to_include = [f"R{bin_idx}" for bin_idx in range(_first_bin, _end_bin)]

print(RNA_GE_left_columns_to_include)
print(RNA_GE_right_columns_to_include)

# Names accepted by load_dataset.
all_datasets = ["HelaS3", "K562", "IMR90","GM12878"]

def load_dataset(name:str):
  """Load one dataset's RNA feature table with its anchor sequences attached.

  Reads the 20 kbp RNA feature CSV and the DNA CSV for ``name``, appends the
  ``left_anchor_sequence`` and ``right_anchor_sequence`` columns of the DNA CSV
  to the RNA table, and prints the positive-label fraction as the baseline AuPRC.

  Args:
    name: One of "HelaS3", "K562", "IMR90", "GM12878".

  Returns:
    The combined ``pandas.DataFrame``, or ``None`` (after printing a message)
    when ``name`` is not a known dataset.
  """
  # For fuying dataset
  dataset_paths = {
      "HelaS3": "/content/drive/MyDrive/FYP/data_fuying/fuying_HelaS3_out_fea_DNA.csv",
      "K562": "/content/drive/MyDrive/FYP/data_fuying/fuying_K562_out_fea_DNA.csv",
      "IMR90": "/content/drive/MyDrive/FYP/data_fuying/fuying_IMR90_out_fea_DNA.csv",
      "GM12878": "/content/drive/MyDrive/FYP/data_fuying/fuying_GM12878_out_fea_DNA.csv"
  }

  dataset_RNA_20kbp_paths = {
      "HelaS3": "/content/drive/MyDrive/FYP/data_fuying/fuying_HelaS3_out_fea_20kbp.csv",
      "K562": "/content/drive/MyDrive/FYP/data_fuying/fuying_K562_out_fea_20kbp.csv",
      "IMR90": "/content/drive/MyDrive/FYP/data_fuying/fuying_IMR90_out_fea_20kbp.csv",
      "GM12878": "/content/drive/MyDrive/FYP/data_fuying/fuying_GM12878_out_fea_20kbp.csv"
  }

  if name not in dataset_paths:
    print(f"Dataset {name} not found.")
    return None

  sequence_columns = ['left_anchor_sequence', 'right_anchor_sequence']
  # Parse only the two sequence columns. usecols does not guarantee column
  # order, so re-select to keep left before right.
  DNA_feature_columns = pd.read_csv(dataset_paths[name], usecols=sequence_columns)[sequence_columns]
  RNA_dataframe = pd.read_csv(dataset_RNA_20kbp_paths[name])
  # NOTE(review): concat pairs rows by index, so this assumes both CSVs list
  # the same samples in the same order - confirm against the data export.
  dataframe = pd.concat([RNA_dataframe, DNA_feature_columns], axis = 1)

  print(f"\n{name} loaded")
  # minlength=2 keeps both class counts defined even when one class is absent.
  label_counts = np.bincount(dataframe["label"], minlength=2)
  pos = label_counts[1]
  total = label_counts.sum()
  baseline_auprc = pos / total
  print(f"Baseline auprc: {baseline_auprc}")

  return dataframe

def DNA_sequence_to_indices(sequence, length=4000):
    """Convert a DNA string into a list of integer base indices.

    The sequence is upper-cased, right-padded with "N" up to ``length``
    characters, and mapped with A=0, C=1, G=2, T=3; every other character
    (including the "N" padding) maps to 4. Sequences longer than ``length``
    are returned at their full length, not truncated.

    Args:
        sequence: The DNA sequence as a ``str``.
        length: Minimum output length to pad to. Defaults to 4000.

    Returns:
        A list of ints, one per character of the padded sequence.

    Raises:
        TypeError: If ``sequence`` is not a string (e.g. a NaN from a missing
            CSV cell).
    """
    # Fail loudly with the offending value rather than printing it and
    # crashing later on an unrelated operation.
    if not isinstance(sequence, str):
        raise TypeError(
            f"DNA sequence must be a str, got {type(sequence).__name__}: {sequence!r}"
        )

    mapping = {"A": 0, "C": 1, "G": 2, "T": 3}
    # ljust pads on the right and leaves longer sequences untouched.
    padded = sequence.upper().ljust(length, "N")
    return [mapping.get(base, 4) for base in padded]

def evaluate_model_on_other_datasets(model_data_name, dataset_names = ("HelaS3", "K562", "IMR90","GM12878")):
  """Evaluate the saved model for one cell line on each listed dataset.

  Args:
    model_data_name: Name inserted into the saved-model file path.
    dataset_names: Iterable of dataset names understood by ``load_dataset``.
      The default is an immutable tuple so it cannot be modified between calls.

  Returns:
    A dict mapping each dataset name to the last value returned by
    ``model.evaluate`` for that dataset.

  Raises:
    ValueError: If ``load_dataset`` returns ``None`` for one of the names.
  """
  model = tf.keras.models.load_model(f"/content/drive/MyDrive/FYP/models/chromsplit/{model_data_name}_model_15_2_layer_64_128_CNN_1_layer_transformer_encoder_4_layer_FFN.hdf5")

  auprc_scores = {}

  for dataset_name in dataset_names:
    data = load_dataset(dataset_name)
    if data is None:
      # load_dataset signals an unknown name with None; stop here with a clear message.
      raise ValueError(f"Cannot evaluate on unknown dataset {dataset_name!r}")

    X_RNA_left = np.array(data[RNA_GE_left_columns_to_include])
    X_RNA_right = np.array(data[RNA_GE_right_columns_to_include])

    X_DNA_left = np.array([DNA_sequence_to_indices(sequence) for sequence in data['left_anchor_sequence']])
    X_DNA_right = np.array([DNA_sequence_to_indices(sequence) for sequence in data['right_anchor_sequence']])

    y = data['label'].astype(np.float32)

    result = model.evaluate([[X_RNA_left, X_RNA_right, X_DNA_left, X_DNA_right]], y, batch_size = 32)

    # NOTE(review): presumably the last compiled metric is AuPRC - confirm
    # against the training notebook that saved the model.
    auprc_scores[dataset_name] = result[-1]

  return auprc_scores

['L50', 'L51', 'L52', 'L53', 'L54', 'L55', 'L56', 'L57', 'L58', 'L59', 'L60', 'L61', 'L62', 'L63', 'L64', 'L65', 'L66', 'L67', 'L68', 'L69', 'L70', 'L71', 'L72', 'L73', 'L74', 'L75', 'L76', 'L77', 'L78', 'L79', 'L80', 'L81', 'L82', 'L83', 'L84', 'L85', 'L86', 'L87', 'L88', 'L89', 'L90', 'L91', 'L92', 'L93', 'L94', 'L95', 'L96', 'L97', 'L98', 'L99', 'L100', 'L101', 'L102', 'L103', 'L104', 'L105', 'L106', 'L107', 'L108', 'L109', 'L110', 'L111', 'L112', 'L113', 'L114', 'L115', 'L116', 'L117', 'L118', 'L119', 'L120', 'L121', 'L122', 'L123', 'L124', 'L125', 'L126', 'L127', 'L128', 'L129', 'L130', 'L131', 'L132', 'L133', 'L134', 'L135', 'L136', 'L137', 'L138', 'L139', 'L140', 'L141', 'L142', 'L143', 'L144', 'L145', 'L146', 'L147', 'L148', 'L149']
['R50', 'R51', 'R52', 'R53', 'R54', 'R55', 'R56', 'R57', 'R58', 'R59', 'R60', 'R61', 'R62', 'R63', 'R64', 'R65', 'R66', 'R67', 'R68', 'R69', 'R70', 'R71', 'R72', 'R73', 'R74', 'R75', 'R76', 'R77', 'R78', 'R79', 'R80', 'R81', 'R82', 'R83', 'R84', 'R8

In [None]:
# Evaluate every saved model on every dataset; each entry of all_results holds
# one model's per-dataset scores.
all_results = {}
# Named model_names / model_name so this cell does not rebind `models`, which
# the first cell imports from tensorflow.keras.
model_names = ["HelaS3", "K562", "IMR90","GM12878"]
for model_name in model_names:
  print(f"\nModel: {model_name}")
  all_results[model_name + ' model'] = evaluate_model_on_other_datasets(model_name)

pd.DataFrame(all_results)


Model: HelaS3

HelaS3 loaded
Baseline auprc: 0.16666666666666666

K562 loaded
Baseline auprc: 0.16689905186837703

IMR90 loaded
Baseline auprc: 0.1670324846356453

GM12878 loaded
Baseline auprc: 0.16666666666666666

Model: K562

HelaS3 loaded
Baseline auprc: 0.16666666666666666

K562 loaded
Baseline auprc: 0.16689905186837703

IMR90 loaded
Baseline auprc: 0.1670324846356453

GM12878 loaded
Baseline auprc: 0.16666666666666666

Model: IMR90

HelaS3 loaded
Baseline auprc: 0.16666666666666666

K562 loaded
Baseline auprc: 0.16689905186837703

IMR90 loaded
Baseline auprc: 0.1670324846356453

GM12878 loaded
Baseline auprc: 0.16666666666666666

Model: GM12878

HelaS3 loaded
Baseline auprc: 0.16666666666666666

K562 loaded
Baseline auprc: 0.16689905186837703

IMR90 loaded
Baseline auprc: 0.1670324846356453

GM12878 loaded
Baseline auprc: 0.16666666666666666


Unnamed: 0,HelaS3 model,K562 model,IMR90 model,GM12878 model
HelaS3,0.558173,0.558117,0.537455,0.525788
K562,0.512441,0.599052,0.527675,0.510423
IMR90,0.546815,0.599787,0.78598,0.637872
GM12878,0.54609,0.553657,0.584476,0.745384


In [None]:
# Tabulate the collected scores: one column per model, one row per dataset.
df = pd.DataFrame(data=all_results)
df

Unnamed: 0,HelaS3 model,K562 model,IMR90 model,GM12878 model
HelaS3,0.558173,0.558117,0.537455,0.525788
K562,0.512441,0.599052,0.527675,0.510423
IMR90,0.546815,0.599787,0.78598,0.637872
GM12878,0.54609,0.553657,0.584476,0.745384


In [None]:
!pip install -U kaleido
import plotly.express as px
import plotly.graph_objects as go

fig = px.imshow(
    df.round(4),
    text_auto=True,
    labels=dict(x="Model", y="Dataset", color="AuPRC")
    )
fig.show()
fig.write_image(f"Model 15 cross sample validation heatmap.png")