In [2]:
%run common.ipynb

Tensorflow version: 1.15.0
Keras version: 2.2.4
Numpy version: 1.21.2


Using TensorFlow backend.


In [3]:
import simdna
from simdna import synthetic as sn
import numpy as np
from collections import defaultdict

In [4]:
# Size parameters for the synthetic data set.
SEQUENCE_LENGTH = 200
SAMPLE_COUNT = 10000

# Per-base probabilities passed to the zero-order background generator.
BACKGROUND_DISTRIBUTION = dict(A=0.3, C=0.2, G=0.2, T=0.3)

# Background generator, built from SEQUENCE_LENGTH and the distribution above.
background = sn.ZeroOrderBackgroundGenerator(
    SEQUENCE_LENGTH,
    discreteDistribution=BACKGROUND_DISTRIBUTION,
)

# Position generator handed to every SubstringEmbedder built in data_gen.
position_generator = sn.UniformPositionGenerator()

In [5]:
# Motif identifiers; each variable holds the string of the same name.
SIX5_disc1 = 'SIX5_disc1'
MYC_disc1 = 'MYC_disc1'
SRF_disc1 = 'SRF_disc1'
AP1_disc1 = 'AP1_disc1'
GATA_disc1 = 'GATA_disc1'
TAL1_known1 = 'TAL1_known1'
IRF_known1 = 'IRF_known1'

# Active motif set. SIX5_disc1 and MYC_disc1 are defined above but are
# currently left out of the list.
all_motifs = [
    SRF_disc1,
    AP1_disc1,
    GATA_disc1,
    TAL1_known1,
    IRF_known1,
]

In [6]:
# One bundle per unordered motif pair (first, second), in the same order the
# nested index loops would visit them. Each bundle lists four motif sets:
# no motif, first only, second only, and both together.
data_bundles = [
    [[], [first], [second], [first, second]]
    for position, first in enumerate(all_motifs)
    for second in all_motifs[position + 1:]
]

In [7]:
def get_quantity_generator(count):
    """Return the repeat-count generator for a motif set of size ``count``.

    For ``count`` equal to 1, 2 or 3 the result is an ``sn.MinMaxWrapper``
    (``theMin=1``, ``theMax=3``) around an ``sn.PoissonQuantityGenerator``
    whose argument is 3, 1.5 or 1 respectively, so ``count`` times that
    argument is 3 in every case. Any other ``count`` returns the integer 0.
    """
    # (motif-set size, Poisson argument) pairs, checked in this order.
    poisson_arg_by_size = ((1, 3), (2, 1.5), (3, 1))
    for set_size, poisson_arg in poisson_arg_by_size:
        if count == set_size:
            return sn.MinMaxWrapper(
                sn.PoissonQuantityGenerator(poisson_arg),
                theMin=1,
                theMax=3,
            )
    # NOTE(review): this fallback is a plain int, not a generator object;
    # presumably it is never reached by data_gen, which only calls this
    # function once per motif in a non-empty set -- confirm before relying on it.
    return 0

In [8]:
# Motif collection read from the path that simdna exposes as ENCODE_MOTIFS_PATH;
# data_gen looks motifs up in it by name.
loaded_motifs = sn.LoadedEncodeMotifs(
    simdna.ENCODE_MOTIFS_PATH,
    pseudocountProb=0.001,
)

In [9]:
def data_gen(sample_count, data_bundle):
    """Generate sequences for every motif set in ``data_bundle``.

    For each motif set, one ``sn.RepeatedEmbedder`` is built per motif name
    (a ``SubstringEmbedder`` sampling from ``loaded_motifs``, placed with the
    shared ``position_generator``), the embedders are combined with the shared
    ``background`` through ``sn.EmbedInABackground``, and
    ``sn.GenerateSequenceNTimes`` is run with ``sample_count``.

    Args:
        sample_count: Passed to ``sn.GenerateSequenceNTimes`` for each motif set.
        data_bundle: Iterable of motif-name lists. An empty list produces an
            embedder list that is empty as well.

    Returns:
        A single flat list holding whatever ``generateSequences()`` yields for
        each motif set, concatenated in the order the sets appear in
        ``data_bundle``. An empty ``data_bundle`` gives an empty list.
    """
    data_out = []
    for motifs in data_bundle:
        # A fresh quantity generator is requested for every motif, sized by
        # the number of motifs sharing the sequence.
        embedders = [
            sn.RepeatedEmbedder(
                sn.SubstringEmbedder(
                    sn.PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name),
                    position_generator,
                    name=motif_name,
                ),
                get_quantity_generator(len(motifs)),
            )
            for motif_name in motifs
        ]

        embedder = sn.EmbedInABackground(background, embedders)
        sequence_set = sn.GenerateSequenceNTimes(embedder, sample_count)
        # Extend in place: `data_out = data_out + embedded` recopied the whole
        # accumulated list for every motif set.
        data_out.extend(sequence_set.generateSequences())
    return data_out

In [None]:
# Write one training file and one testing file per unordered motif pair.
# File names carry the pair's indices into all_motifs: <split>_<i>_<j>.npy.
# The training size reuses SAMPLE_COUNT from the config cell (10000) instead
# of repeating the literal; the testing size gets a name of its own.
TEST_SAMPLE_COUNT = 200

for i in range(len(all_motifs)):
    for j in range(i + 1, len(all_motifs)):
        # Four motif sets per pair: none, first only, second only, both.
        data_bundle = [[], [all_motifs[i]], [all_motifs[j]], [all_motifs[i], all_motifs[j]]]

        # data_gen produces the requested count for EACH of the four sets.
        # NOTE(review): np.save is given a list of simdna-generated objects;
        # presumably this is stored as a pickled object array, so loading
        # would need allow_pickle=True -- confirm.
        training_data = data_gen(SAMPLE_COUNT, data_bundle)
        with open(mount_path + 'data/training_' + str(i) + '_' + str(j) + '.npy', 'wb') as f:
            np.save(f, training_data)

        testing_data = data_gen(TEST_SAMPLE_COUNT, data_bundle)
        with open(mount_path + 'data/testing_' + str(i) + '_' + str(j) + '.npy', 'wb') as f:
            np.save(f, testing_data)


