In [11]:
# !wget https://raw.githubusercontent.com/kundajelab/deeplift/d95d41d/examples/genomics/grab_model_and_data.sh
# !chmod a+x grab_model_and_data.sh
# !./grab_model_and_data.sh

In [3]:
from __future__ import print_function
import tensorflow
print("Tensorflow version:", tensorflow.__version__)
import keras
print("Keras version:", keras.__version__)
import numpy as np
print("Numpy version:", np.__version__)
import simdna

Tensorflow version: 1.15.0
Keras version: 2.2.4
Numpy version: 1.21.2


In [4]:
def one_hot_encode_along_channel_axis(sequence):
    """One-hot encode a sequence for 1d convolutions.

    Examples have dimensions (len, num_channels); the channel axis
    (axis 1) is the one-hot axis.

    Args:
        sequence: sized iterable of characters; the accepted alphabet is
            whatever ``seq_to_one_hot_fill_in_array`` accepts.

    Returns:
        An ``np.int8`` array of shape ``(len(sequence), 4)``.
    """
    seq_len = len(sequence)
    encoded = np.zeros((seq_len, 4), dtype=np.int8)
    # The helper fills `encoded` in place.
    seq_to_one_hot_fill_in_array(
        zeros_array=encoded,
        sequence=sequence,
        one_hot_axis=1,
    )
    return encoded

def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Write the one-hot encoding of ``sequence`` into ``zeros_array`` in place.

    Args:
        zeros_array: 2-d array that is mutated. Its non-one-hot axis must have
            length ``len(sequence)``. Only 1s are written, so the caller is
            expected to pass an all-zero array.
        sequence: iterable of characters. A/C/G/T (either case) select
            channel 0/1/2/3; N/n leaves the position untouched.
        one_hot_axis: 0 to index as ``[channel, position]``, 1 to index as
            ``[position, channel]``.

    Raises:
        AssertionError: if ``one_hot_axis`` is not 0 or 1, or the array length
            along the sequence axis does not match ``len(sequence)``.
        RuntimeError: on any character outside A/C/G/T/N (case-insensitive).
    """
    channel_first = (one_hot_axis == 0)
    channel_last = (one_hot_axis == 1)
    assert channel_first or channel_last
    if channel_first:
        assert zeros_array.shape[1] == len(sequence)
    elif channel_last:
        assert zeros_array.shape[0] == len(sequence)

    # None marks characters that are accepted but leave the position all 0's.
    channel_of = {
        "A": 0, "a": 0,
        "C": 1, "c": 1,
        "G": 2, "g": 2,
        "T": 3, "t": 3,
        "N": None, "n": None,
    }

    # Mutates zeros_array.
    for position, base in enumerate(sequence):
        if base not in channel_of:
            raise RuntimeError("Unsupported character: "+str(base))
        channel = channel_of[base]
        if channel is None:
            continue
        if channel_first:
            zeros_array[channel, position] = 1
        elif channel_last:
            zeros_array[position, channel] = 1

In [5]:
def check(motif):
    """Build a predicate that tests whether ``motif`` appears in any embedding.

    Args:
        motif: value tested with ``in`` against each embedding's
            ``what.getDescription()`` result.

    Returns:
        A function taking an iterable of embeddings and returning True as soon
        as one description contains ``motif``, otherwise False.
    """
    def contains_motif(embeddings):
        return any(
            motif in candidate.what.getDescription()
            for candidate in embeddings
        )
    return contains_motif

def check_and(m1, m2):
    """Build a predicate that is true when both motifs are found.

    Args:
        m1: first motif, passed to ``check``.
        m2: second motif, passed to ``check``; only evaluated when the first
            one is found.

    Returns:
        A function taking embeddings and returning the conjunction of the two
        ``check`` predicates.
    """
    def both_present(embeddings):
        if not check(m1)(embeddings):
            return False
        return check(m2)(embeddings)
    return both_present

def check_or(m1, m2):
    """Build a predicate that is true when at least one motif is found.

    Args:
        m1: first motif, passed to ``check``.
        m2: second motif, passed to ``check``; only evaluated when the first
            one is not found.

    Returns:
        A function taking embeddings and returning the disjunction of the two
        ``check`` predicates.
    """
    def either_present(embeddings):
        if check(m1)(embeddings):
            return True
        return check(m2)(embeddings)
    return either_present

In [6]:
def load_data(positive_fn, file_path):
    """Load saved records and derive labels and one-hot inputs from them.

    Args:
        positive_fn: callable applied to each record's ``embeddings``; a truthy
            result labels that example ``[1, 1, 1]``, otherwise ``[0, 0, 0]``.
        file_path: path to a file readable by ``np.load``. Each loaded element
            must expose ``.embeddings`` and ``.seq`` attributes.

    Returns:
        The object built by ``simdna.util.enum`` with the fields
        ``embeddings``, ``sequences``, ``labels`` and ``onehot``.
    """
    with open(file_path, 'rb') as f:
        # SECURITY: allow_pickle=True unpickles arbitrary Python objects, which
        # can execute code. Only load files from a trusted source.
        # The flag must be the boolean True: a string such as 'True' (or even
        # 'False') is merely truthy and hides the intent.
        data_out = np.load(f, allow_pickle=True)

    embeddings = [record.embeddings for record in data_out]
    sequences = [record.seq for record in data_out]
    # Every example gets the same label repeated three times.
    # NOTE(review): presumably one entry per model output task -- confirm.
    labels = np.array([[1, 1, 1] if positive_fn(seq_embeddings) else [0, 0, 0]
                       for seq_embeddings in embeddings])
    onehot = np.array([one_hot_encode_along_channel_axis(seq) for seq in sequences])

    data = simdna.util.enum(embeddings=embeddings, sequences=sequences, labels=labels, onehot=onehot)

    return data

In [7]:
def reset_weights(model):
    """Re-run the initializers of every layer's kernel and bias variables.

    For each layer in ``model.layers`` that declares a ``kernel_initializer``
    and/or ``bias_initializer``, the matching variable's ``initializer`` is run
    in the session returned by ``keras.backend.get_session()``.
    NOTE(review): this relies on TF1-style variables whose ``initializer`` has
    a ``run(session=...)`` method -- confirm before using with another backend.

    Layers that declare an initializer but have no corresponding variable
    (for example ``bias`` is ``None``) are skipped instead of raising
    ``AttributeError``.

    Args:
        model: object exposing an iterable ``layers`` attribute.

    Returns:
        None. The model's variables are modified in place.
    """
    import keras.backend as K
    session = K.get_session()
    for layer in model.layers:
        for weight_name in ('kernel', 'bias'):
            if not hasattr(layer, weight_name + '_initializer'):
                continue
            # The initializer config can exist while the variable itself is
            # absent or None, so look the variable up defensively.
            variable = getattr(layer, weight_name, None)
            if variable is None:
                continue
            variable.initializer.run(session=session)

In [8]:
def load_trained_model(model_weights_path, model_json_path):
    """Rebuild a Keras model from its JSON description and load its weights.

    Args:
        model_weights_path: path passed to the model's ``load_weights``.
        model_json_path: path to a text file whose contents are passed to
            ``keras.models.model_from_json``.

    Returns:
        The Keras model with the saved weights loaded.
    """
    from keras.models import model_from_json

    # Use a context manager so the JSON file handle is closed promptly rather
    # than left for the garbage collector.
    with open(model_json_path) as json_file:
        keras_model = model_from_json(json_file.read())
    keras_model.load_weights(model_weights_path)

    # To run with re-initialized weights instead of the trained ones, enable:
    # reset_weights(keras_model)
    return keras_model

In [9]:
# Motif name constants; each value is identical to the constant's own name.

# Defined but not part of `all_motifs` below.
# NOTE(review): confirm that leaving these two out is intentional.
SIX5_disc1 = 'SIX5_disc1'
MYC_disc1 = 'MYC_disc1'

# Motifs collected in `all_motifs`.
SRF_disc1 = 'SRF_disc1'
AP1_disc1 = 'AP1_disc1'
GATA_disc1 = 'GATA_disc1'
TAL1_known1 = 'TAL1_known1'
IRF_known1 = 'IRF_known1'

all_motifs = [
    SRF_disc1,
    AP1_disc1,
    GATA_disc1,
    TAL1_known1,
    IRF_known1,
]