In [1]:
import random
from collections import defaultdict

import numpy as np
import simdna
from simdna import synthetic as sn

In [2]:
# Simulation configuration: everything a reader might want to tune.
SEQUENCE_LENGTH = 200      # length handed to the background generator
SAMPLE_COUNT = 10000       # sequences generated per entry of the motif bundle
BACKGROUND_DISTRIBUTION = {'A':0.3, 'C':0.2, 'G':0.2, 'T':0.3}
RANDOM_SEED = 0            # change to draw a different, but still repeatable, dataset

# Seed the global RNGs so that Restart & Run All yields the same dataset.
# NOTE(review): this assumes simdna samples from Python's `random` module
# and/or NumPy's global RNG -- confirm against the simdna sources.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Background sequence generator built from the per-base distribution above.
background = sn.ZeroOrderBackgroundGenerator(SEQUENCE_LENGTH, discreteDistribution=BACKGROUND_DISTRIBUTION)

# Position generator shared by every motif embedder.
position_generator = sn.UniformPositionGenerator()

In [3]:
# Motif combinations to simulate, one list per group of generated sequences.
# Motif names used across the bundle: "GATA_disc1", "TAL1_known1".
data_bundle = [
    ["GATA_disc1"],                  # first motif alone
    ["TAL1_known1"],                 # second motif alone
    ["GATA_disc1", "TAL1_known1"],   # both motifs together
    [],                              # no motifs at all
]

In [4]:
def get_quantity_generator(count):
    """Pick the quantity generator for a bundle containing ``count`` motifs.

    Args:
        count: number of motifs in the bundle being generated.

    Returns:
        For ``count == 1`` a ``MinMaxWrapper`` (``theMin=1``, ``theMax=3``)
        around ``PoissonQuantityGenerator(2)``; for ``count == 2`` the same
        wrapper around ``PoissonQuantityGenerator(1)``; otherwise the
        integer ``0``.
    """
    # (bundle size, Poisson parameter) pairs, checked in this order.
    for bundle_size, poisson_param in ((1, 2), (2, 1)):
        if count == bundle_size:
            return sn.MinMaxWrapper(
                sn.PoissonQuantityGenerator(poisson_param),
                theMin=1,
                theMax=3,
            )
    # Any other bundle size falls through to a plain zero.
    return 0

In [5]:
# Load the motif collection at simdna's ENCODE_MOTIFS_PATH with the given pseudocount.
loaded_motifs = sn.LoadedEncodeMotifs(
    simdna.ENCODE_MOTIFS_PATH,
    pseudocountProb=0.001,
)

In [6]:
# Build the dataset: for each motif combination in `data_bundle`, generate
# SAMPLE_COUNT sequences and collect them, in bundle order, in `data_out`.
data_out = []
for motifs in data_bundle:
    # One repeated embedder per motif in this combination. The quantity
    # generator is chosen from the number of motifs in the combination.
    embedders = [
        sn.RepeatedEmbedder(
            sn.SubstringEmbedder(
                sn.PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name),
                position_generator,
                name=motif_name
            ),
            get_quantity_generator(len(motifs))
        ) for motif_name in motifs]

    embedder = sn.EmbedInABackground(background, embedders)
    embedded = list(sn.GenerateSequenceNTimes(embedder, SAMPLE_COUNT).generateSequences())
    # Append in place; `data_out + embedded` would copy the whole
    # accumulated list again on every iteration.
    data_out.extend(embedded)

In [7]:
# Persist every generated sequence to disk in NumPy's .npy format.
# NOTE(review): `data_out` is a plain list, so np.save converts it to an
# array first; if the elements are arbitrary Python objects they are
# presumably stored via pickle and loading would need allow_pickle=True
# -- confirm before relying on this file elsewhere.
with open('training.npy', mode='wb') as training_file:
    np.save(training_file, data_out)