# Natural Language Processing in Tensorflow
---

Tensorflow is an open-source library widely used for machine learning. It is second only to the Python machine learning library Scikit-learn in terms of use and popularity. Similar to Keras, it allows a user to implement a powerful deep learning model in relatively few lines of code.


Many sophisticated applications today make use of Tensorflow-based machine learning to exploit the wealth of data at their disposal. These include voice recognition, machine translation, sentiment analysis, image recognition, video detection and recommender systems.

In this example, we'll implement a simple Recurrent Neural Network (RNN) built from Gated Recurrent Units (GRUs). GRUs give RNNs a longer-term memory, with performance similar to that of LSTMs and sometimes better on small datasets.

The model is a character-level RNN which aims to predict the next character in a sequence, based on a prior sequence.


In [40]:
import sys
import my_txtutils as txt
import numpy as np
import tensorflow as tf
import tensorflow.contrib as tfc

import os
import time

# Reset the Tensorflow Computational Graph
#
# This must happen BEFORE seeding: the graph-level seed is attached to the
# current default graph, and reset_default_graph() replaces that graph with
# a fresh one, which would discard a seed set earlier.

tf.reset_default_graph()

# Fix the graph-level random seed so that weight initialisation and dropout
# are repeatable from run to run.

tf.set_random_seed(0)

In [41]:
# Concatenate every source text, in the order listed, into one corpus file

filenames = [
    'texts/emma.txt',
    'texts/mansfield.txt',
    'texts/northanger.txt',
    'texts/persuasion.txt',
    'texts/pride.txt',
    'texts/sense.txt',
    'texts/susan.txt',
]

with open('texts/output.txt', 'w') as combined:
    for source_path in filenames:
        with open(source_path) as source:
            # writelines() consumes the file line by line, exactly like an
            # explicit "for line in file: write(line)" loop.
            combined.writelines(source)

In [42]:
# Load the combined corpus into memory as a single string.
#
# Case is preserved so that uppercase and lowercase characters are treated
# as distinct symbols. To fold everything to lowercase instead, apply
# .lower() to the text that is read.

# The context manager closes the file handle as soon as the text is read.
with open("texts/output.txt", "r") as corpus_file:
    data = corpus_file.read()

In [43]:
# Model Architecture

# Sequence Length: the number of characters in a single training example

seq_length = 30

# Batch Size: the number of sequences fed to the network in one training step

batch_size = 20

# Number of neurons in the hidden layer

hidden_layer = 128

# Number of hidden layers

no_layers = 1

# Proportion of data set used for training

train_prop = 0.95

# Proportion of data set used for validation

val_prop = 0.05

# Keep probability (1 - dropout rate): each connection between layers will
# only be used with this probability

keep_prob = 0.8


# Split data into training set and validation set

# Number of whole sequences available; floor division drops any partial
# sequence at the end of the text.

total_seqs = len(data) // seq_length

train_seqs = int(train_prop * total_seqs)
val_seqs = int(val_prop * total_seqs)

# Character offsets where the training slice and the validation slice end.

train_end = train_seqs * seq_length
val_end = train_end + val_seqs * seq_length

# The validation slice starts where the training slice ends, so the two sets
# never share characters.

train_data = data[:train_end]
val_data = data[train_end:val_end]

train_data_len = len(train_data)
val_data_len = len(val_data)

# Trim data to exactly the training + validation characters, so that its
# length is evenly divisible by seq_length

data = data[:val_end]


# Vocabulary: every distinct character that occurs in the trimmed data,
# with lookup tables in both directions

chars = sorted(list(set(data)))

char_to_int = dict((ch, i) for i, ch in enumerate(chars))
int_to_char = dict((i, ch) for i, ch in enumerate(chars))

vocab_size = len(chars)
text_size = len(data)

# Epoch size: the number of full minibatches (batch_size sequences of
# seq_length characters each) that fit in the training data

epoch_size = len(train_data) // (batch_size * seq_length)

In [44]:
# Turn the training and validation text into lists of integer character ids

"""
For training purposes, here we encode each character in the text as an integer value.

e.g. 'a' is encoded to the value 60
"""

# Each character is looked up in char_to_int; a character missing from the
# table raises KeyError.
train_encoded = list(map(char_to_int.__getitem__, train_data))
val_encoded = list(map(char_to_int.__getitem__, val_data))

In [45]:
# Some statistics

print "There are", len(train_data), " characters in the training set."
print "There are", len(val_data), " characterists in the validation set."
print "Each epoch consists of", epoch_size, "passes of length", batch_size*seq_length



There are 4173570  characters in the training set.
There are 219660  characterists in the validation set.
Each epoch consists of 6955 passes of length 600


In [46]:
# Graph inputs (placeholders)

""" 
A placeholder is a variable to which we assign data at a future stage.
It allows one to create operations and build computation graph without needing data.
The computation graph is then "fed" data through these placeholders
"""

# Probability of keeping a connection when dropout is applied

keep_prob_placeholder = tf.placeholder(dtype=tf.float32, name='keep_prob')

# Number of sequences in the batch being fed

batchsize = tf.placeholder(dtype=tf.int32, name='batchsize')

# Input character ids: a 2-D tensor whose two dimensions are both left open

input_data = tf.placeholder(dtype=tf.uint8, shape=[None, None], name='X')

# One-hot version of the inputs, with vocab_size classes

input_encoded = tf.one_hot(input_data, vocab_size, on_value=1.0, off_value=0.0)

# Target character ids and their one-hot version, same layout as the inputs

output_data = tf.placeholder(dtype=tf.uint8, shape=[None, None], name='Y_')
output_encoded = tf.one_hot(output_data, vocab_size, on_value=1.0, off_value=0.0)

# Recurrent state fed in at the start of a run: one row per sequence,
# hidden_layer*no_layers values per row

hidden_state = tf.placeholder(dtype=tf.float32,
                              shape=[None, hidden_layer*no_layers],
                              name='Hidden')





In [47]:
# Gated Recurrent Unit: build one GRU cell of hidden_layer units per layer

cells = []
for _ in range(no_layers):
    cells.append(tfc.rnn.GRUCell(hidden_layer))

In [48]:
# Naive dropout: wrap every GRU cell so that dropout is applied to its inputs

dropcells = [tfc.rnn.DropoutWrapper(gru_cell, input_keep_prob=keep_prob_placeholder)
             for gru_cell in cells]

# Stack the layers into a single cell. state_is_tuple=False keeps the state of
# all layers concatenated in one tensor, matching the hidden_state placeholder.

multicell = tfc.rnn.MultiRNNCell(dropcells, state_is_tuple=False)

multicell = tfc.rnn.DropoutWrapper(multicell, output_keep_prob=keep_prob_placeholder) # dropout for softmax layer

# Yr = outputs = tensor of shape [batch_size, seq_length, hidden_layer ] = [20, 30, 128]
# H = hidden state = tensor of shape [batch_size, hidden_layer*no_layers] = [20, 128]

# The RNN must be unrolled over `multicell` (the stacked, dropout-wrapped
# cell). Passing one of the bare GRU cells here would silently skip both the
# dropout wrappers and the layer stacking.

Yr, H = tf.nn.dynamic_rnn(multicell, input_encoded, dtype=tf.float32, initial_state=hidden_state)

# Softmax Layer implementation

# Weights and biases of the output layer, mapping hidden units to vocabulary logits

W = tf.Variable(tf.random_normal([hidden_layer, vocab_size]))
B = tf.Variable(tf.random_normal([vocab_size]))

# Flatten the first two dimensions of the output so that one matrix multiply
# produces the logits for every character position: [batch*seq, hidden_layer]

Yflat = tf.reshape(Yr, [-1, hidden_layer])
Ylogits = tf.matmul(Yflat, W) + B

# Flatten the one-hot targets the same way: [batch*seq, vocab_size]

Yflat_ = tf.reshape(output_encoded, [-1, vocab_size])

In [49]:
# Loss function: softmax cross-entropy for every character position,
# reshaped to one row per sequence in the batch

loss = tf.reshape(
    tf.nn.softmax_cross_entropy_with_logits(labels=Yflat_, logits=Ylogits),
    [batchsize, -1])

In [50]:
# Softmax over the logits: one probability distribution per character position
Yo = tf.nn.softmax(Ylogits, name='Yo')

# Index of the highest-probability character at each position, reshaped to
# one row per sequence in the batch
Y = tf.reshape(tf.argmax(Yo, 1), [batchsize, -1], name='Y')

# Stochastic Gradient Descent algorithm for minimizing loss function

## ADAM - Adaptive Moment Estimation

Like Adagrad, Adam computes adaptive learning rates for each parameter.
It stores an exponentially decaying average of the past squared gradients - similar to RMSprop.
Similar to momentum, Adam also retains an exponentially decaying average of past gradients.


$ m_t$ = decaying average of past gradients  
$ v_t$ = decaying average of past squared gradients

$$m_t = \beta_1m_{t-1} + (1-\beta_1)g_t $$
$$v_t = \beta_2v_{t-1} + (1-\beta_2)g^2_t $$



In [51]:
# Adam optimizer with a learning rate of 0.001; train_step is the op that
# applies one update minimizing the loss
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train_step = optimizer.minimize(loss)

In [53]:
# Metrics for display

# Mean loss of each sequence (averaged along axis 1), then the mean over the batch
seqloss = tf.reduce_mean(loss, axis=1)
batchloss = tf.reduce_mean(seqloss)

# Fraction of positions where the predicted character equals the target
predicted_ids = tf.cast(Y, tf.uint8)
is_correct = tf.equal(output_data, predicted_ids)
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

In [54]:
# Progress Bar

# Number of batches between progress-bar resets

display_freq = 50

# Number of characters consumed by display_freq batches

_50_batches  = display_freq * batch_size * seq_length

# NOTE(review): txt.Progress comes from the local my_txtutils module; the
# meaning of `size` is assumed to be the bar width - confirm there.
# The message needs a space after "next", otherwise it reads "next50 batches".

progress = txt.Progress(display_freq, size = 111+2, msg = "Training on next " + str(display_freq) + " batches")


In [55]:
# Set up the session and the starting values used by training

# Open a session and run the variable initializer
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# All-zero recurrent state: one row per sequence in a batch
istate = np.zeros((batch_size, hidden_layer * no_layers))

# Training progress counter, starting at zero
step = 0

In [56]:
# Sample from array of probabilities

def sample_from_probabilities(probabilities, topn=None):
    """Draw a random index according to the provided probabilities.

    The result lies in the range [0, size), where size is the number of
    probabilities supplied. If topn is specified, only the topn highest
    probabilities are taken into account; the rest are treated as zero and
    the remainder is renormalised before sampling.

    The input is never modified: all work is done on a float64 copy, which
    also avoids float32 rounding making the distribution fail numpy's
    "probabilities sum to 1" check.

    :param probabilities: array-like that squeezes to a 1-D vector of
        individual probabilities (e.g. shape [1, vocab_size])
    :param topn: the number of highest probabilities to consider. Defaults
        to None, meaning all of them; a value of 0, or one at least as large
        as the vector, also keeps all of them.
    :return: a random integer index into the probability vector
    """
    # np.array(...) copies, so zeroing entries below cannot leak back into
    # the caller's array (np.squeeze alone would return a view).
    p = np.array(np.squeeze(probabilities), dtype=np.float64)
    if topn is not None:
        # argsort is ascending, so everything before the last topn entries
        # is one of the smaller probabilities.
        p[np.argsort(p)[:-topn]] = 0
    p = p / np.sum(p)
    return np.random.choice(p.shape[0], 1, p=p)[0]

In [58]:
# Training loop

for x, y_, epoch in txt.rnn_minibatch_sequencer(train_encoded, batch_size, seq_length, nb_epochs=2):
    
    # train on one minibatch
    
    feed_dict = {input_data:x,
                 output_data:y_,
                 hidden_state: istate,
                 keep_prob_placeholder: keep_prob,
                 batchsize: batch_size}
    
    _, y, ostate = sess.run([train_step, Y, H], feed_dict = feed_dict)
    
    # Display a short text generated with current weights and biases (every 150 batches)
    
    if step // 3 % _50_batches == 0:
        
        print
        print "Generating Random Text"
    
        ry = np.array([[char_to_int['k']]])
        rh = np.zeros([1, hidden_layer*no_layers])
    
        for k in range(1000):

            ryo, rh = sess.run([Yo, H], feed_dict = {input_data: ry,
                                                     keep_prob_placeholder: 1.0,
                                                     hidden_state: rh,
                                                     batchsize: 1})
            
            rc = sample_from_probabilities(ryo, topn = 10 if epoch <= 1 else 2)
            
            sys.stdout.write(int_to_char[rc])
            
            ry = np.array([[rc]])
        
        print
        print "TEXT-GEN DONE"
        print
    
    # display progress bar
    
    progress.step(reset = step % _50_batches == 0)
    
    # loop state around
    
    istate = ostate
    step += batch_size * seq_length
    
print
print "---Finished Training---"
    
    
    


Generating Random Text
W&
W
U
&2'&T0W3
]�qUT0�ZT0
&]
&0]
��
&

]

��00]WU]WT0
0&T0�

]UU]g:TW��
:�T�40U]T
000�0�U0
TTW]Z�
ZZ&0


T&T
�
]&&]
&
T
]&]g
W
WB
�]�0W��W:0�Z4&T]&
&000T
0
TUT
T]�]U
&W&q40

�T
&W0
&T0WvU0T]&]vW
&&'��
&&W
,0]�TT
Z&
'gW

Z&02TZ
2W�
q2�
Z
U&W
&T&WW
qqZ44J&00-W3
]&3g3&WE
&�&,0
�T]
Z&TT&,&
W,�T]W�q
0T0T0�]&W0W�
&]
���W0T0�U:TU
�::U]U
�:T]&0U]]U0
]�TT��W�0]Ug0T]W]3UE��0T�WWT
qf
&
T&,T]�WWW
&qW
q0
]U�ZU
Z&
&'
&W&'
'�
q

q&'0K
Z0]�T
0TWUU&00�TT]WWWqq
4&
400&
�
&&'TU

]��0]W
&W
&&
0T
&Wq
�&


&0W&qU
W
&
�0&W
T�U]
W0
Wq�&TU�WU�]WT&Wq
&�ZU0

T&0W&],
0&TT0&00U
0T0TT0&

��]
&&

�f&W�
&,
UUT]W0

T]T]WW��T&�00
TW
T]&0]Z0gg0
0
&T'00
��U0
T�T0]]�U�W0Z�&
W�
00UW&

]
�
]&W&q
&�U
&
&&W00
$,W0

[T&
0�
&]
U&&T&WW&0
00
T&WU]v'0]WW3T3&W�q

&U]00T0

&0
U&
�0

T&


f&UTUU]]

]�Z�

U]&0]
�U
U]
n00W&
UW
qq&T
]qWq2Z
22&
�0
U�&
0
�0WT]
��]WZgUg
]U0W0

U��TTTT0"
W&W
&
0W&U

]&0,

0&T0]W3�T&
]W�U
U:]�Z0U3UTg0W:00
c0U

cWTWU�T2
&�WU]3TU]n
&Ug0g]300


&�
&T0]
]T�0�]�0�ZZ4gU
Ug]0TT

&
T&]W
&WU


KeyboardInterrupt: 