In the previous notebook we created TFRecords from the Bible. In this notebook we'll take a look at them and then build a model that predicts which book of the Bible a verse came from.

In [1]:
import tensorflow as tf
from preppy import Preppy
class BibPreppy(Preppy):
    '''
    Slightly extends the way Preppy writes TFRecords so that each example
    also stores the id of the book the verse came from.
    '''
    def sequence_to_tf_example(self,sequence,book_id):
        '''
        Build a tf.train.SequenceExample for one verse.

        :param sequence: the raw verse; it is turned into ids with
            self.sentance_to_id_list (inherited, not shown here)
        :param book_id: integer id of the book, stored as a context feature
        :return: a SequenceExample with context features "length" and
            "book_id" and one feature list, "tokens"
        '''
        # list() so the ids can be both counted and iterated, whatever
        # kind of iterable the inherited helper returns
        id_list = list(self.sentance_to_id_list(sequence))
        ex = tf.train.SequenceExample()
        # Non-sequential (context) features of our example.
        # "length" must describe the "tokens" feature list written below, so
        # count the ids rather than the raw input: the two only agree when
        # the tokenizer emits exactly one id per element of `sequence`.
        sequence_length = len(id_list)
        ex.context.feature["length"].int64_list.value.append(sequence_length)
        ex.context.feature["book_id"].int64_list.value.append(book_id)
        # Feature list for the single sequential feature of our example
        fl_tokens = ex.feature_lists.feature_list["tokens"]

        for token in id_list:
            fl_tokens.feature.add().int64_list.value.append(token)

        return ex
    @staticmethod
    def parse(ex):
        '''
        Explain to TF how to go from a serialized example back to tensors
        :param ex: a serialized SequenceExample, as written by
            sequence_to_tf_example
        :return: dict with keys "seq" (the token ids), "length" and "book_id"
        '''
        # Must mirror the context features written by sequence_to_tf_example
        context_features = {
            "length": tf.FixedLenFeature([], dtype=tf.int64),
            "book_id": tf.FixedLenFeature([], dtype=tf.int64)
        }
        # One int64 per step of the "tokens" feature list
        sequence_features = {
            "tokens": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        }

        # Parse the example (returns a dictionary of tensors)
        context_parsed, sequence_parsed = tf.parse_single_sequence_example(
            serialized=ex,
            context_features=context_features,
            sequence_features=sequence_features
        )
        return {"seq":sequence_parsed["tokens"], "length": context_parsed["length"], 
                "book_id": context_parsed["book_id"]}



Make a dataset by reading the training TFRecord file.

In [2]:
# Drop any graph left over from an earlier run of this notebook.
tf.reset_default_graph()

# Read the serialized examples from the training file and decode each one
# into a dict of tensors with BibPreppy.parse.
dataset = (
    tf.data.TFRecordDataset(['./train.tfrecord'])
    .map(BibPreppy.parse)
)


In [3]:
# Iterator over the parsed (un-batched) dataset.
iterator = dataset.make_one_shot_iterator()
# Running this op yields the next parsed example as a dict of values.
next_item = iterator.get_next()

In [4]:
# Session shared by every later cell in this notebook (sess.run(...)).
sess =tf.InteractiveSession()

In [5]:
# Pull one example to check the parsing: a dict with 'book_id', 'length' and 'seq'.
sess.run(next_item)

{'book_id': 3,
 'length': 59,
 'seq': array([25, 14, 13,  5, 11,  8,  7,  5, 27, 28, 29, 30,  5, 10,  6,  9, 36,
         7,  5, 16, 14, 11,  3,  5, 41,  3, 10,  7, 10,  5,  9, 14, 13,  5,
        16, 14, 11,  3,  5, 25,  9,  4,  3, 14, 15,  5, 10,  9, 34, 18, 14,
        22, 15,  5, 62, 64, 39, 65, 62])}

In [6]:
# Static shapes per element: 'book_id' and 'length' are scalars, 'seq' has unknown length.
dataset.output_shapes

{'book_id': TensorShape([]),
 'length': TensorShape([]),
 'seq': TensorShape([Dimension(None)])}

In [7]:

def expand(x):
    '''
    Give the scalar 'length' and 'book_id' entries a leading axis of size 1
    so they can be passed to padded_batch with a padded shape of 1.

    :param x: dict of tensors (or values convertible to tensors) holding at
        least the keys 'length' and 'book_id'
    :return: a new dict; 'length' and 'book_id' are expanded, every other
        entry is passed through unchanged. The input dict is not modified.
    '''
    # Shallow copy so the caller's dict is left untouched
    expanded = dict(x)
    for key in ('length', 'book_id'):
        expanded[key] = tf.expand_dims(tf.convert_to_tensor(x[key]), axis=0)
    return expanded
def deflate(x):
    '''
    Undo expand(): remove the trailing size-1 axis from 'length' and
    'book_id'.

    Only the last axis is squeezed. Squeezing every size-1 axis would also
    drop the batch axis whenever a batch holds a single element, turning a
    [1, 1] tensor into a scalar instead of a [1] vector.

    :param x: dict of tensors holding at least the keys 'length' and
        'book_id', each with a last axis of size 1
    :return: a new dict; 'length' and 'book_id' lose their last axis, every
        other entry is passed through unchanged. The input dict is not
        modified.
    '''
    # Shallow copy so the caller's dict is left untouched
    deflated = dict(x)
    for key in ('length', 'book_id'):
        deflated[key] = tf.squeeze(x[key], axis=-1)
    return deflated



In [8]:
# Batches of 128 examples: 'seq' is padded to the longest sequence in the
# batch; 'length' and 'book_id' are expanded to size 1 for batching and
# squeezed back afterwards.
batch_iter = (
    dataset
    .map(expand)
    .padded_batch(
        128,
        padded_shapes={
            "book_id": 1,
            "length": 1,
            "seq": tf.TensorShape([None]),
        },
    )
    .map(deflate)
)
# Repeat the batched dataset and expose the op that yields the next batch.
next_item = (
    batch_iter
    .repeat()
    .make_one_shot_iterator()
    .get_next()
)

In [9]:
# Pull one batch to check the batching: 'book_id' and 'length' now come back as arrays.
sess.run(next_item)

{'book_id': array([  3,   0, 214,  10,   4,   2,  17, 188, 194,   0,   1, 188, 201,
          3,   3,   5,   5,   6, 214,   3,   3, 192,   0, 192, 190,   3,
          8,  14, 170,  10, 214,  12, 197,   0, 180, 190, 189,   3, 173,
          4, 171,   4, 195,   3, 173, 210,  10, 174, 173, 171, 168,   8,
          3, 214, 167,   8,   0, 191, 206,  13, 180, 189, 175,   2, 167,
        175, 188,   5,  10, 170, 185,   4,  10,   4,   6, 198, 194,   8,
          1,   6,  14, 167,  10, 170, 191, 173,  12,   9,   1, 189,   3,
          1, 167,  10, 189, 180,  12,   4, 171,   8, 190, 206,   4,   4,
          6, 191,  11, 189, 174, 190,  14,   6,   8,   2,  11, 190, 170,
        203, 191, 191,   0, 171, 193, 192, 198,   1,   9,  10]),
 'length': array([59, 62, 64, 65, 30, 64, 64, 28, 61, 23, 19, 54, 20, 63, 44, 63, 66,
        65, 63, 62, 64, 56, 36, 58, 65, 63, 63, 64, 62, 60, 55, 63, 53, 63,
        62, 65, 63, 28, 60, 12, 21, 63, 61, 64, 62, 65, 65, 64, 63, 65, 62,
        64, 62, 13, 64, 63, 4

In [10]:
class Model():
    '''
    GRU classifier that predicts the book id of each sequence in a batch.

    Attributes set by the constructor:
      lr    -- placeholder for the learning rate; feed it on every train step
      loss  -- cross-entropy between the predicted logits and 'book_id'
      train -- op that applies one Adam update minimizing `loss`
    '''
    def __init__(self,inputs,vocab_size=74,embedding_dim=32,rnn_units=128,
                 hidden_units=64,num_books=215):
        '''
        :param inputs: dict of tensors with keys 'seq' (token ids),
            'length' (used as the RNN sequence_length) and 'book_id'
            (integer class labels)
            # assumes 'seq' is (batch, time) and the other two are (batch,) -- TODO confirm
        :param vocab_size: number of rows in the embedding table
        :param embedding_dim: size of each embedding vector
        :param rnn_units: number of units in the GRU cell
        :param hidden_units: width of the tanh layer on top of the GRU state
        :param num_books: number of output logits (classes)
        '''
        sequence =  inputs['seq']
        lengths = inputs['length']
        book_id = inputs['book_id']
        # Learning rate is fed at run time so it can change between steps
        self.lr = tf.placeholder(shape=None,dtype=tf.float32)

        emb_vec = tf.get_variable("emb",dtype=tf.float32,shape=[vocab_size,embedding_dim])
        emb_source = tf.nn.embedding_lookup(emb_vec,sequence)

        cell = tf.nn.rnn_cell.GRUCell(rnn_units)
        # Only the final state is used; the per-step outputs are discarded
        _, state = tf.nn.dynamic_rnn(cell,emb_source,dtype=tf.float32,sequence_length=lengths)

        # The logits layer must read the hidden layer, not `state` directly,
        # otherwise the tanh layer is built but never used.
        hidden = tf.contrib.layers.fully_connected(state,num_outputs=hidden_units,activation_fn=tf.tanh)
        book_logits = tf.contrib.layers.fully_connected(hidden,num_outputs=num_books,activation_fn=None)

        loss = tf.losses.sparse_softmax_cross_entropy(book_id,book_logits)
        self.loss = tf.reduce_mean(loss)
        opt = tf.train.AdamOptimizer(self.lr)
        self.train = opt.minimize(self.loss)


    

In [11]:
# Build the model on top of the batched input tensors.
M = Model(next_item)
# Run the initializer for all global variables in the graph.
sess.run(tf.global_variables_initializer())
# clear_output is used by the training loop below to wipe the cell output.
from IPython.display import clear_output


In [None]:
import sys

# Train until the kernel is interrupted. The input pipeline is built with
# .repeat(), so the loop never runs out of data on its own.
num = 1
try:
    while True:
        _, loss = sess.run([M.train, M.loss], feed_dict={M.lr: 0.0001})
        # Wipe the cell output every 30 steps so it does not grow without bound
        if num % 30 == 0:
            clear_output()
        num += 1
        # "\r" rewrites the same line with the latest loss
        sys.stdout.write("\r" + str(loss))
        sys.stdout.flush()
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop training. Any
    # other exception is a real failure and is allowed to propagate rather
    # than being swallowed inside an endless loop.
    pass

    

3.3803573