In [None]:
import warnings
# Install a blanket "ignore" filter: with no message/category/module arguments
# this suppresses every warning raised after this cell has run.
# NOTE(review): this also hides warnings that may signal real problems
# (e.g. deprecations in the libraries imported below) - consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Makes print(...) a function call on Python 2 as well as Python 3.
from __future__ import print_function
# Each library is imported and its version printed immediately afterwards, so
# the environment used for a run is recorded in the cell output.
import tensorflow
print("Tensorflow version:", tensorflow.__version__)
import keras
print("Keras version:", keras.__version__)
import numpy as np
print("Numpy version:", np.__version__)
# util.enum is used by the data-loading functions below to bundle their results.
from simdna.simdnautil import util

In [None]:
def one_hot_encode_along_channel_axis(sequence):
    """One-hot encode a nucleotide sequence for 1d convolutions.

    Examples are laid out as (len, num_channels): each row is a sequence
    position and the four columns form the one-hot (channel) axis.

    Returns a new int8 array of shape (len(sequence), 4).
    """
    num_positions = len(sequence)
    encoded = np.zeros((num_positions, 4), dtype=np.int8)
    # Axis 1 is the channel axis; the helper fills `encoded` in place.
    seq_to_one_hot_fill_in_array(
        zeros_array=encoded, sequence=sequence, one_hot_axis=1)
    return encoded

def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Write the one-hot encoding of `sequence` into `zeros_array` in place.

    Parameters:
        zeros_array: 2-D array to fill; its non-one-hot axis must have
            length len(sequence). It is mutated, nothing is returned.
        sequence: iterable of characters. A/C/G/T (either case) map to
            channel indices 0/1/2/3; N/n leaves the position untouched.
        one_hot_axis: 0 if channels run along rows, 1 if along columns.

    Raises:
        AssertionError: if one_hot_axis is not 0 or 1, or the array shape
            does not match the sequence length.
        RuntimeError: on any character other than A, C, G, T, N (any case).
    """
    assert one_hot_axis in (0, 1)
    if one_hot_axis == 0:
        assert zeros_array.shape[1] == len(sequence)
    else:
        assert zeros_array.shape[0] == len(sequence)

    base_to_channel = {
        "A": 0, "a": 0,
        "C": 1, "c": 1,
        "G": 2, "g": 2,
        "T": 3, "t": 3,
    }
    for position, base in enumerate(sequence):
        if base in ("N", "n"):
            # unknown base: keep this position as all zeros
            continue
        channel = base_to_channel.get(base)
        if channel is None:
            raise RuntimeError("Unsupported character: "+str(base))
        if one_hot_axis == 0:
            zeros_array[channel, position] = 1
        elif one_hot_axis == 1:
            zeros_array[position, channel] = 1

In [None]:
def contain_gata(embeddings):
    """Return True if any embedding is described as a 'GATA_disc1' motif.

    Parameters:
        embeddings: iterable of objects exposing `what.getDescription()`,
            which must return a string.

    Returns:
        bool: True when at least one description contains 'GATA_disc1';
        False otherwise, including for an empty iterable.
    """
    # Every embedding is inspected: a sequence may carry several motifs and
    # the GATA one is not guaranteed to come first.
    return any('GATA_disc1' in embedding.what.getDescription()
               for embedding in embeddings)

In [None]:
def contain_tal1(embeddings):
    """Return True if any embedding is described as a 'TAL1_known1' motif.

    Parameters:
        embeddings: iterable of objects exposing `what.getDescription()`,
            which must return a string.

    Returns:
        bool: True when at least one description contains 'TAL1_known1';
        False otherwise, including for an empty iterable.
    """
    # Every embedding is inspected: a sequence may carry several motifs and
    # the TAL1 one is not guaranteed to come first.
    return any('TAL1_known1' in embedding.what.getDescription()
               for embedding in embeddings)

In [None]:
def load_training_data(path='training.npy'):
    """Load the pickled training examples and derive labels and one-hot inputs.

    Parameters:
        path: location of the .npy file holding the pickled examples.
            Defaults to 'training.npy' in the working directory. Each
            example must expose `.embeddings` and `.seq`.

    Returns:
        The object produced by util.enum(...) from four keyword arguments:
        embeddings (list, one entry per example), sequences (list of the
        examples' `.seq` values), labels (array with one row per example,
        [1, 1, 1] when contain_gata(...) is true for it, else [0, 0, 0])
        and onehot (array of the per-sequence one-hot encodings).
    """
    # SECURITY: allow_pickle=True unpickles arbitrary Python objects, which can
    # execute code on load. Only point `path` at files from a trusted source.
    with open(path, 'rb') as f:
        data_out = np.load(f, allow_pickle=True)

    embeddings = [example.embeddings for example in data_out]
    sequences = [example.seq for example in data_out]
    # NOTE(review): all three label columns are driven by contain_gata alone;
    # contain_tal1 is not consulted here - confirm this is intended.
    labels = np.array([[1, 1, 1] if contain_gata(example_embeddings) else [0, 0, 0]
                       for example_embeddings in embeddings])
    onehot = np.array([one_hot_encode_along_channel_axis(seq) for seq in sequences])

    return util.enum(embeddings=embeddings, sequences=sequences,
                     labels=labels, onehot=onehot)

In [None]:
def load_testing_data(path='testing.npy'):
    """Load the pickled testing examples and derive labels and one-hot inputs.

    Parameters:
        path: location of the .npy file holding the pickled examples.
            Defaults to 'testing.npy' in the working directory. Each
            example must expose `.embeddings` and `.seq`.

    Returns:
        The object produced by util.enum(...) from four keyword arguments:
        embeddings (list, one entry per example), sequences (list of the
        examples' `.seq` values), labels (array with one row per example,
        [1, 1, 1] when contain_gata(...) is true for it, else [0, 0, 0])
        and onehot (array of the per-sequence one-hot encodings).
    """
    # SECURITY: allow_pickle=True unpickles arbitrary Python objects, which can
    # execute code on load. Only point `path` at files from a trusted source.
    with open(path, 'rb') as f:
        data_out = np.load(f, allow_pickle=True)

    embeddings = [example.embeddings for example in data_out]
    sequences = [example.seq for example in data_out]
    # NOTE(review): all three label columns are driven by contain_gata alone;
    # contain_tal1 is not consulted here - confirm this is intended.
    labels = np.array([[1, 1, 1] if contain_gata(example_embeddings) else [0, 0, 0]
                       for example_embeddings in embeddings])
    onehot = np.array([one_hot_encode_along_channel_axis(seq) for seq in sequences])

    return util.enum(embeddings=embeddings, sequences=sequences,
                     labels=labels, onehot=onehot)

In [None]:
def load_trained_model(keras_model_weights="model.h5",
                       keras_model_json="model.json"):
    """Rebuild a Keras model from its JSON architecture and load its weights.

    Parameters:
        keras_model_weights: path to the weights file passed to
            `load_weights`. Defaults to "model.h5".
        keras_model_json: path to the JSON architecture file passed to
            `model_from_json`. Defaults to "model.json".

    Returns:
        tuple: (keras_model, keras_model_weights, keras_model_json) - the
        loaded model followed by the two paths it was loaded from.
    """
    from keras.models import model_from_json

    # Read the architecture inside a context manager so the file handle is
    # closed deterministically instead of being left to the garbage collector.
    with open(keras_model_json) as json_file:
        architecture = json_file.read()

    keras_model = model_from_json(architecture)
    keras_model.load_weights(keras_model_weights)

    return keras_model, keras_model_weights, keras_model_json