This code simulates from using a Python package called `msprime` for simulating genetic sequences:

In [2]:
import msprime as msp
L = 1_000_000
sim = msp.simulate(sample_size=2, length=L, mutation_rate=1e-8, recombination_rate=1e-9, Ne=1e4)

This simulates two chromosomes, so that they "sequence" can be encoded by 0 or 1 depending on whether they are the same or different at each position.

In [4]:
import numpy as np
sim = msp.simulate(sample_size=2, length=L, mutation_rate=1e-8, recombination_rate=1e-9, Ne=1e4)
seq = np.zeros(L, dtype='u1')
positions = np.array([v.position for v in sim.variants()])
seq[positions.astype(int)] = 1
gaps = np.diff(positions)

In [21]:
import numpy as np
sim = msp.simulate(sample_size=2, length=L, mutation_rate=1e-8, recombination_rate=1e-9, Ne=1e4)
seq = np.zeros(L, dtype='u1')
positions = np.array([v.position for v in sim.variants()])
seq[positions.astype(int)] = 1

`seq` is a sequence of mostly zeros, and some ones wherever a difference (mutation) occurred:

In [23]:
seq[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

To generate such a sequence using a neural net, I think the simplest approach would be to train an LSTM to emit a sequence of numbers corresponding to the gaps between each variant:

In [137]:
import numpy as np
x = []
y = []
i = 0
while i < 500:
    L = 1_000_000
    sim = msp.simulate(sample_size=2, length=L, mutation_rate=1e-8, recombination_rate=1e-9, Ne=1e4)
    positions = np.array([v.position for v in sim.variants()])
    gaps = np.diff(positions)
    if len(gaps) <= 51:
        continue
    x.append(gaps[:50])
    y.append(gaps[50])
    i += 1
    
x=np.array(x)
y=np.array(y)
x=x.astype(int)
y=y.astype(int)

#flatten x to compute number of unique numbers
flattened = []
for sublist in x:
    for val in sublist:
        flattened.append(val)
        
unique_num = max(max(set(y)), max(set(flattened)))+1
x_train = np.zeros((len(x), len(x[0]), unique_num), dtype=np.bool)
y_train = np.zeros((len(x), unique_num), dtype=np.bool)
for i, data in enumerate(x):
    for t, char in enumerate(data):
        x_train[i,t,char] = 1
    y_train[i,y[i]] = 1
print("x_train shape:",x_train.shape)
print("y_train shape:",y_train.shape)


x_train shape: (500, 50, 242372)
y_train shape: (500, 242372)


In [142]:
import keras
from keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(50, unique_num))) 
model.add(layers.Dense(unique_num, activation='softmax'))
optimizer = keras.optimizers.RMSprop(lr=0.01) 
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.summary()

ModuleNotFoundError: No module named 'tensorflow.contrib'

<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x87ab805f8>>


In [139]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [140]:
import random
for epoch in range(1,60):
    print('epoch',epoch)
    model.fit(x_train,y_train,batch_size=1,epochs=1)
    #select a slice a data at random
    start_index=random.randint(0,len(flattened)-50-1)
    generated_text=flattened[start_index:start_index+50]
    
    for i in range(50):
        sampled = np.zeros((1, 50, unique_num), dtype=np.bool)
        for t, char in enumerate(generated_text):
            sampled[0,t,char] = 1
        
        preds = model.predict(sampled,verbose=0)[0]
        next_index=sample(preds, 0.5)
        next_char=chars[next_index]
        
        generated_text+=next_char
        generated_text=generated_text[1:]
        
        print(next_char)

epoch 1
Epoch 1/1
  7/500 [..............................] - ETA: 3:44:28 - loss: 12.4058

KeyboardInterrupt: 

In [121]:
start_index=random.randint(0,len(flattened)-50-1)
generated_text=flattened[start_index:start_index+50]
len(generated_text)
sampled = np.zeros((1, 50, unique_num), dtype=np.bool)
generated_text

[8492,
 960,
 1311,
 212,
 2159,
 2313,
 840,
 415,
 341,
 502,
 1723,
 1626,
 1912,
 1783,
 91,
 1688,
 3599,
 2678,
 655,
 5,
 66238,
 47248,
 216487,
 35431,
 149119,
 267,
 29,
 3357,
 390,
 611,
 310,
 981,
 30,
 1780,
 2159,
 600,
 821,
 104,
 2112,
 37,
 550,
 10920,
 16,
 2704,
 772,
 795,
 556,
 1945,
 6826,
 460]

In [124]:
for t, char in enumerate(generated_text):
    print(t,char)
    sampled[0, t, char] = 1.
    print(sampled[0, t, char])

0 8492
False
1 960
False
2 1311
False
3 212
False
4 2159
False
5 2313
False
6 840
False
7 415
False
8 341
False
9 502
False
10 1723
False
11 1626
False
12 1912
False
13 1783
False
14 91
False
15 1688
False
16 3599
False
17 2678
False
18 655
False
19 5
False
20 66238
False
21 47248
False
22 216487
False
23 35431
False
24 149119
False
25 267
False
26 29
False
27 3357
False
28 390
False
29 611
False
30 310
False
31 981
False
32 30
False
33 1780
False
34 2159
False
35 600
False
36 821
False
37 104
False
38 2112
False
39 37
False
40 550
False
41 10920
False
42 16
False
43 2704
False
44 772
False
45 795
False
46 556
False
47 1945
False
48 6826
False
49 460
False


In [43]:
from keras import layers

model = keras.models.Sequential()
model.add(layers.Embedding(input_dim=vocab_size,output_dim=8,input_length=50))
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars)))) 
model.add(layers.Dense(len(chars), activation='softmax'))
optimizer = keras.optimizers.RMSprop(lr=0.01) 
model.compile(loss='mean_squared_error', optimizer=optimizer)

In [36]:
len(x[0])

50

IndexError: index 19985 is out of bounds for axis 2 with size 8482

In [40]:
x_train.shape

(500, 50, 8482)

In [44]:
from keras import layers
model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars)))) 
model.add(layers.Dense(len(chars), activation='softmax'))
optimizer = keras.optimizers.RMSprop(lr=0.01) 
model.compile(loss='mean_squared_error', optimizer=optimizer)

<ol>
    <li> Draw from the model a probability distribution for the next character, given the generated text available so far.</li>
    <li>Reweight the distribution to a certain temperature.</li>
    <li>Sample the next character at random according to the reweighted distribution.</li>
    <li>Add the new character at the end of the available text.</li>
</ol>

In [None]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64') 
    preds = np.log(preds) / temperature 
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds) 
    probas = np.random.multinomial(1, preds, 1) 
    return np.argmax(probas)

for epoch in range(1, 60):
    print('epoch', epoch)
    model.fit(x, y, batch_size=10, epochs=1)
    start_index = random.randint(0, len(text) - maxlen - 1) 
    generated_text = text[start_index: start_index + maxlen] 
    print('--- Generating with seed: "' + generated_text + '"')
    
    for i in range(400):
        sampled = np.zeros((1, 50, 1)) 
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1.
            
        preds = model.predict(sampled, verbose=0)[0] 
        next_index = sample(preds, temperature) 
        next_char = chars[next_index]
        
        generated_text += next_char
        generated_text = generated_text[1:]
        sys.stdout.write(next_char)

Intend to model a pair of input: the *seq* and *gaps* 

we need to do padding and mask for preprocessing the data

In [236]:
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Masking, Input, TimeDistributed
samples, timestpes, features = 5000, 1, 962

def seq2seq_model(HIDDEN_DIM = 300):
    
    encoder_inputs = Input(shape = (timestpes, features))
    encoder_LSTM =  LSTM(HIDDEN_DIM, return_state = True)
    encoder_outputs, state_h, state_c = encoder_LSTM(encoder_inputs)
    
    decoder_inputs = Input(shape = (timestpes, features))
    decoder_LSTM = LSTM(HIDDEN_DIM, return_state = True, return_sequences = True)
    decoder_ouputs, _, _, = decoder_LSTM(decoder_inputs, initial_state = [state_h, state_c])
    
    outpus = TimeDistributed(Dense(962, activation = 'softmax'))(decoder_ouputs)
    model = Model([encoder_inputs, decoder_inputs], outpus)

model = seq2seq_model(HIDDEN_DIM = 300)
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics  = ['accuracy'])

AttributeError: 'NoneType' object has no attribute 'compile'

In [217]:
pad_input.shape

(5000, 962)

In [238]:
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Masking

def get_samples():
    sim = msp.simulate(sample_size=2, length=L, mutation_rate=1e-8, recombination_rate=1e-9, Ne=1e4)
    seq = np.zeros(L, dtype='u1')
    positions = np.array([v.position for v in sim.variants()])
    seq[positions.astype(int)] = 1
    gaps = np.diff(positions)
    length = len(gaps)
    gaps.reshape(length,1)




#timest
samples, timesteps, features = 5000, 1, 962

model = Sequential()
model.add(LSTM(300, input_shape = (timesteps, features)))
model.add(Dense(962, activation = 'softmax'))
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.summary()


def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_17 (LSTM)               (None, 300)               1515600   
_________________________________________________________________
dense_6 (Dense)              (None, 962)               289562    
Total params: 1,805,162
Trainable params: 1,805,162
Non-trainable params: 0
_________________________________________________________________


In [8]:
import keras
import numpy as np
path = keras.utils.get_file( 'nietzsche.txt',
origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt') 
text = open(path).read().lower()
print('Corpus length:', len(text))

maxlen = 60
step = 3
sentences = []
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen]) 
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))

chars = sorted(list(set(text)))
print('Unique characters:', len(chars))
char_indices = dict((char, chars.index(char)) for char in chars)
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool) 
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Using TensorFlow backend.


Corpus length: 600893
Number of sequences: 200278
Unique characters: 57
Vectorization...


In [45]:
len(text)

600893

In [19]:
y.shape

(200278, 57)