These are the packages needed for the synthetic sequences

In [1]:
# Thor H. Jonsson
import tensorflow as tf
import numpy as np
import pandas as pd
import random

To make sure that our RNN can solve simple tasks, we introduce a synthetic dataset inspired by the Fibonacci sequence.

Each sequence in this dataset is defined by the recurrence relation
$x_i = (x_{i-1} + x_{i-2}) \bmod \text{basis\_len}$
where $x_0$ and $x_1$ are randomly chosen residues modulo basis_len.
The number of different combinations of initial values is therefore $\text{basis\_len}^2$.

This allows us to tune the size of our synthetic dataset: basis_len determines both the number of distinct sequences and the number of residue classes that the model has to classify.

In [16]:
# Modulus of the residue arithmetic; read as a global by the sequence
# generators defined in the following cells.
basis_len = 10

In [18]:
def fibonacci(i, max_steps=10, modulus=None):
    '''
    Generates a single sequence according to the fibonacci relation
    x_k = (x_(k-1) + x_(k-2)) mod modulus.

    i         -- first element of the sequence (used as given)
    max_steps -- length of the returned sequence
    modulus   -- modulus of the recurrence; when None (the default) the
                 notebook-level global basis_len is used

    The second element is drawn uniformly at random from [0, modulus).
    Returns a list of exactly max_steps elements (empty when max_steps <= 0).
    '''
    if modulus is None:
        modulus = basis_len
    # The random draw is always performed, so the random stream does not
    # depend on max_steps.
    j = np.random.randint(modulus)
    seq = [i, j]
    while len(seq) < max_steps:
        seq.append((seq[-2] + seq[-1]) % modulus)
    # Truncate so that max_steps < 2 does not return two elements anyway.
    return seq[:max(max_steps, 0)]

def make_fibonacci_data(size=50):
    '''
    Builds a DataFrame of `size` random sequences.

    Column 0 holds the randomly drawn first element of each row and the
    column 'fibonacci' holds the sequence generated from it by fibonacci().
    '''
    first_elements = np.random.randint(basis_len, size=size)
    frame = pd.DataFrame(first_elements)
    sequences = frame[0].apply(fibonacci)
    frame['fibonacci'] = sequences
    return frame

Let's try generating a sample from this synthetic distribution.

In [19]:
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(fibonacci(4))

[4, 4, 8, 2, 0, 2, 2, 4, 6, 0]


In [20]:
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(make_fibonacci_data())

    0                       fibonacci
0   7  [7, 5, 2, 7, 9, 6, 5, 1, 6, 7]
1   7  [7, 9, 6, 5, 1, 6, 7, 3, 0, 3]
2   4  [4, 8, 2, 0, 2, 2, 4, 6, 0, 6]
3   6  [6, 0, 6, 6, 2, 8, 0, 8, 8, 6]
4   3  [3, 8, 1, 9, 0, 9, 9, 8, 7, 5]
5   9  [9, 1, 0, 1, 1, 2, 3, 5, 8, 3]
6   9  [9, 7, 6, 3, 9, 2, 1, 3, 4, 7]
7   2  [2, 6, 8, 4, 2, 6, 8, 4, 2, 6]
8   9  [9, 5, 4, 9, 3, 2, 5, 7, 2, 9]
9   1  [1, 9, 0, 9, 9, 8, 7, 5, 2, 7]
10  1  [1, 0, 1, 1, 2, 3, 5, 8, 3, 1]
11  7  [7, 6, 3, 9, 2, 1, 3, 4, 7, 1]
12  7  [7, 8, 5, 3, 8, 1, 9, 0, 9, 9]
13  3  [3, 5, 8, 3, 1, 4, 5, 9, 4, 3]
14  7  [7, 1, 8, 9, 7, 6, 3, 9, 2, 1]
15  2  [2, 0, 2, 2, 4, 6, 0, 6, 6, 2]
16  5  [5, 2, 7, 9, 6, 5, 1, 6, 7, 3]
17  8  [8, 2, 0, 2, 2, 4, 6, 0, 6, 6]
18  7  [7, 2, 9, 1, 0, 1, 1, 2, 3, 5]
19  6  [6, 2, 8, 0, 8, 8, 6, 4, 0, 4]
20  9  [9, 1, 0, 1, 1, 2, 3, 5, 8, 3]
21  4  [4, 3, 7, 0, 7, 7, 4, 1, 5, 6]
22  6  [6, 0, 6, 6, 2, 8, 0, 8, 8, 6]
23  3  [3, 2, 5, 7, 2, 9, 1, 0, 1, 1]
24  7  [7, 1, 8, 9, 7, 6, 3, 9, 2, 1]
25  5  [5, 8

Now we have a dataset of 50 sequences. Let's make an iterator for this dataset.

We will initialize the iterator by giving it a list of the 50 sequences; this list can be obtained directly from the DataFrame column using pandas.

In [21]:
# Simple bucket sequence iterator
class BucketSequenceIterator(object):
    """Serve zero-padded mini-batches of sequences grouped into length buckets.

    The sequences are sorted by length and cut into ``num_buckets`` contiguous
    buckets (shortest sequences first), so each batch is drawn from sequences
    of similar length.  Every bucket is shuffled internally and keeps its own
    read cursor.

    Attributes:
        sequences: all input sequences, sorted by length.
        num_buckets: number of buckets.
        size: ``len(sequences) // num_buckets``, the smallest bucket size.
        bucket_data: list of buckets, each bucket being a list of sequences.
        bucket_sizes: array with the number of sequences in each bucket.
        cursor: array with, per bucket, the index of the next unread sequence.
        epochs: number of times all buckets were reshuffled because one of
            them could not supply a full batch any more.
    """

    def __init__(self, sequences, num_buckets = 5):
        """Sort ``sequences`` by length and distribute them over the buckets.

        Args:
            sequences: list of sequences (each supporting ``len``).
            num_buckets: number of buckets to create; must be at least 1.

        Raises:
            ValueError: if ``num_buckets`` is smaller than 1.
        """
        if num_buckets < 1:
            raise ValueError("num_buckets must be at least 1")
        self.sequences = sorted(sequences, key=len)
        self.num_buckets = num_buckets
        total = len(self.sequences)
        self.size = total // num_buckets
        # Evenly spaced cut points: every sequence lands in exactly one bucket
        # and bucket sizes differ by at most one.  (Slice ends are exclusive,
        # so no "- 1" is needed.)
        bounds = [(k * total) // num_buckets for k in range(num_buckets + 1)]
        self.bucket_data = [self.sequences[lo:hi]
                            for lo, hi in zip(bounds[:-1], bounds[1:])]
        self.bucket_sizes = np.array([len(b) for b in self.bucket_data])
        self.cursor = np.zeros(num_buckets, dtype=int)
        self.epochs = 0
        self.shuffle()

    def shuffle(self):
        """Shuffle the sequences inside every bucket and rewind all cursors."""
        for bucket in self.bucket_data:
            random.shuffle(bucket)
        self.cursor[:] = 0

    def next_batch(self, n):
        """Return the next batch of ``n`` sequences from a random bucket.

        If any bucket has fewer than ``n`` unread sequences left, the epoch
        counter is incremented and all buckets are reshuffled first.

        Args:
            n: batch size; must be between 1 and the smallest bucket size.

        Returns:
            x: int32 array of shape ``(n, max_len + 2)`` holding the sequences
                left-aligned and padded with zeros, where ``max_len`` is the
                longest sequence in the batch.
            y: int32 array of the same shape with the targets, i.e. the input
                shifted left by one position: ``y[:, t] == x[:, t + 1]``.
            batch_len: list with the unpadded length of each sequence.

        Raises:
            ValueError: if ``n`` is not between 1 and the smallest bucket size.
        """
        if n < 1 or n > self.bucket_sizes.min():
            raise ValueError(
                "batch size must be between 1 and %d, got %r"
                % (self.bucket_sizes.min(), n))

        # If any bucket cannot supply n more sequences, start a new epoch.
        if np.any(self.cursor + n > self.bucket_sizes):
            self.epochs += 1
            self.shuffle()

        bucket = np.random.randint(0, self.num_buckets)
        start = int(self.cursor[bucket])
        batch_seq = self.bucket_data[bucket][start:start + n]
        batch_len = [len(s) for s in batch_seq]
        self.cursor[bucket] += n

        # Two columns of zero padding beyond the longest sequence; the width
        # is kept as-is so that the returned shapes do not change.
        width = max(batch_len) + 2

        #### INPUT - sequences padded with 0s so they are all the same length
        x = np.zeros([n, width], dtype=np.int32)
        for row, seq in enumerate(batch_seq):
            x[row, :batch_len[row]] = seq

        #### OUTPUT - the input shifted by 1 (including the first position)
        y = np.zeros([n, width], dtype=np.int32)
        y[:, :-1] = x[:, 1:]

        return x, y, batch_len


In [22]:
# Generate a fresh synthetic dataset and wrap its sequences in an iterator.
df = make_fibonacci_data()
X = BucketSequenceIterator(df['fibonacci'].tolist())

In [27]:
# Draw one batch of 4 sequences; the returned sequence lengths are not needed here.
x, y, _ = X.next_batch(4)

In [28]:
# Explicit tuple so the printed form is the same on Python 2 and Python 3.
print((x[1], y[1]))

(array([9, 0, 9, 9, 8, 7, 5, 2, 7, 9, 0, 0], dtype=int32), array([0, 9, 9, 8, 7, 5, 2, 7, 9, 0, 0, 0], dtype=int32))


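Yay, it works: the target row is the input row shifted left by one position. (In this sample the second input element happens to be 0, the same as the padding value, so the first target position should also be checked on another row.)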