# Text generation with RNN

In this lab, we are going to generate text with RNNs.

We'll try to have a RNN learning the *fables de la Fontaine*.

Lets load into variable the *Fables*:

In [1]:
with open('./fables.txt',encoding="utf-8") as f:
    
    

    text = f.read()    #We remove the space, the tabulation etc to reduce the chance go get too many 
                       # spaces as precition outputs
    text=text.replace("  "," ")
    text=text.replace("   "," ")
    text=text.replace("    "," ")
    text=text.replace("      "," ")
    text=text.replace("       "," ")
    text=text.replace('\n \n','\n')
    text=text.replace('\n',"")




### Helpers

Define some methods to read this text
- a batch generator, generating batchs of text
- a decoder to translate a batch into stg more convinient

In [2]:
import numpy as np

vocab = sorted(set(text))  # my vocabulary (many letters)
print("I have", len(vocab), "different elements in  my text which are :")
print(' '.join(vocab))


def sample_gen(batch_size, n_items):
    """Return a random sample"""
    while True:
        permutations = list(np.random.permutation(len(text) - n_items))
        while len(permutations) > n_items + 1:
            # Generate a batch
            batch = []
            for i in range(batch_size):
                p = permutations.pop()
                batch.append(text[p : p + n_items])
            yield batch

def encode_batch(batch, one_hot=False):
    """Takes a batch of string as input and encode it to a numerical
    batch"""
    batch_new = np.ndarray((len(batch),len(batch[0])))
    batch_one = np.ndarray((len(batch), len(batch[0]), len(vocab))) #len batch = 2 : number of items in the array
                                                                    #len vocab = 90 : number of different elements
    
    if one_hot == True:  #If true, then the number corresponding to the character will be a 1 on its value position , while 
                         #all other digits within the array will be 0
        for i in range(len(batch)):
            for j in range(len(batch[0])):
                batch_one[i][j] = np.eye(1, len(vocab), vocab.index(batch[i][j]))
        return batch_one
    else:
        for i in range(len(batch)):
            for j in range(len(batch[0])):
                batch_new[i][j] = vocab.index(batch[i][j])
        return batch_new


a = sample_gen(2, 2)
b = next(a)

print(encode_batch(b, one_hot=False))
print(encode_batch(b, one_hot=True))


I have 90 different elements in  my text which are :
  ! " ' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; ? A B C D E F G H I J L M N O P Q R S T U V X Y Z ` a b c d e f g h i j l m n o p q r s t u v x y z À Â Ç É Ê Ô à â ç è é ê ë î ï ô ù û ﻿
[[  0.  32.]
 [ 59.   0.]]
[[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

### Sample of training taken from the web

In [3]:
# https://github.com/aymericdamien/TensorFlow-Examples/blob/master/notebooks/3_NeuralNetworks/recurrent_network.ipynb
import tensorflow as tf
from tensorflow.contrib import rnn

# Training Parameters
learning_rate = 0.001
training_steps = 100
batch_size = 128
display_step = 200

# Network Parameters
num_input = len(vocab)
timesteps = 28 # timesteps
num_hidden = 128 # hidden layer num of features
num_classes = len(vocab)

# tf Graph input
tf.reset_default_graph()
X = tf.placeholder("float", [None, timesteps, num_input])
Y = tf.placeholder("float", [None, num_classes])

# Define weights
W1 = tf.Variable(tf.random_normal([num_hidden, num_classes]))
B1 = tf.Variable(tf.random_normal([num_classes]))

def RNN(x, W1, B1):
    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, timesteps, n_input)
    # Required shape: 'timesteps' tensors list of shape (batch_size, n_input)

    # Unstack to get a list of 'timesteps' tensors of shape (batch_size, n_input)
    x = tf.unstack(x, timesteps, 1)

    # Define a lstm cell with tensorflow
    lstm_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)

    # Get lstm cell output
    outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], W1) + B1

with tf.name_scope('model'):
    logits = RNN(X, W1, B1)
    prediction = tf.nn.softmax(logits)

with tf.name_scope('loss'):
    loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=Y))

with tf.name_scope('optimizer'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss_op)

with tf.name_scope('metrics')
    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

SyntaxError: invalid syntax (<ipython-input-3-85b048b9e3bf>, line 55)

In [4]:
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Start training
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    for step in range(1, training_steps+1):
        batch_x, batch_y = ??

        # Run optimization op (backprop)
        sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
        if step % display_step == 0 or step == 1:
            # Calculate batch loss and accuracy
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: batch_x,
                                                                 Y: batch_y})
            print("Step " + str(step) + ", Minibatch Loss= " + \
                  "{:.4f}".format(loss) + ", Training Accuracy= " + \
                  "{:.3f}".format(acc))

    print("Optimization Finished!")



SyntaxError: invalid syntax (<ipython-input-4-41bb0f4266a5>, line 11)

### Train a model (6/10)
Train a model that can learn to create text from a given input (letter wise)

Dont forget to explain what you do, why, and if it do look to be working

In [5]:
###We did this exercice within our hadoop project before we know there will be ann RNN python application.
###Because it allows to make a text generation prediction based on some inputs we thought it was relevant to use it here.
###Here is the source we based our code on : https://github.com/martin-gorner/tensorflow-rnn-shakespeare
###Just so you know, we don't use an RNN but a GRU (Gated recurrent units) network.
### comments with ### come from us
### Because the computation is very slow, you can find screenshots of results in the folder "Screenshots for results"
###Once lanched, you will not be able to launch again the cell if you interrupt it in the meantime 
 
# encoding: UTF-8
# Copyright 2017 Google.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
 
import tensorflow as tf
from tensorflow.contrib import layers
from tensorflow.contrib import rnn  # rnn stuff temporarily in contrib, moving back to code in TF 1.1
import os
import time
import math
import numpy as np
import my_txtutils as txt
 
 
tf.set_random_seed(1)
 
# model parameters
#
# Usage:
#   Training only:
#         Leave all the parameters as they are
#         Disable validation to run a bit faster (set validation=False below)
#         You can follow progress in Tensorboard: tensorboard --log-dir=log
#   Training and experimentation (default):
#         Keep validation enabled
#         You can now play with the parameters anf follow the effects in Tensorboard
#         A good choice of parameters ensures that the testing and validation curves stay close
#         To see the curves drift apart ("overfitting") try to use an insufficient amount of
#         training data (shakedir = "shakespeare/t*.txt" for example)
#
 
SEQLEN = 30   ###Sequence of inputs (characters)
BATCHSIZE = 100 #### Number of samples within our network. Here it means that it will be 100 times 30 char in each batch
ALPHASIZE = txt.ALPHASIZE ### Supported alaphabet. In our case it's ASCII-7
INTERNALSIZE = 512 ### Size of GRU cell . We can shoose it freely
NLAYERS = 3 ###Number of layers within the network
learning_rate = 0.001  ### fixed learning rate
dropout_pkeep = 0.8    ### some dropout (he will drop 20% of randomly selected neurons will be ignored during training)
#The activation of these neuron are removed. As a consequence any weight updates throught theses neuron is done in the backward process.
 
 
# load data (fables.txt file)
fabledir = "./fables.txt" #our input
codetext, valitext, bookranges = txt.read_data_files(fabledir, validation=True)
 
 
# display some stats on the data
 
epoch_size = len(codetext) // (BATCHSIZE * SEQLEN)
txt.print_data_stats(len(codetext), len(valitext), epoch_size)
 
 
#
# the model (see FAQ in README.md)
#
 
lr = tf.placeholder(tf.float32, name='lr')  # learning rate
pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
batchsize = tf.placeholder(tf.int32, name='batchsize')
 
# inputs
 
X = tf.placeholder(tf.uint8, [None, None], name='X')    # [ BATCHSIZE, SEQLEN ] #The text input for each batch
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)                 # [ BATCHSIZE, SEQLEN, ALPHASIZE ] #The text one_hot encoded where all char are have a 92 subdimension with a 1 in the letter they correspond
### expected outputs = same sequence shifted by 1 since we are trying to predict the next character
Y_ = tf.placeholder(tf.uint8, [None, None], name='Y_')  # [ BATCHSIZE, SEQLEN ] #The text output that should follow the sequence of input
Yo_ = tf.one_hot(Y_, ALPHASIZE, 1.0, 0.0)               # [ BATCHSIZE, SEQLEN, ALPHASIZE ] #The output one-hot encoded
 
# input state
Hin = tf.placeholder(tf.float32, [None, INTERNALSIZE*NLAYERS], name='Hin')  # [ BATCHSIZE, INTERNALSIZE * NLAYERS]
 
# using a NLAYERS=3 layers of GRU cells, unrolled SEQLEN=30 times
# dynamic_rnn infers SEQLEN from the size of the inputs Xo
 
# How to properly apply dropout in RNNs: see README.md
cells = [rnn.GRUCell(INTERNALSIZE) for _ in range(NLAYERS)]
###Generate 3 grucells of 512 units. 
###This command generated automatically the weights and biases of the gru cell, 
###that's why we don't need to create in our model. 
 
 
# "naive dropout" implementation
dropcells = [rnn.DropoutWrapper(cell,input_keep_prob=pkeep) for cell in cells] 
###Here the neurons are dropped respectively to the probability specified to be keep (80%)
multicell = rnn.MultiRNNCell(dropcells, state_is_tuple=False) 
###Now we get our 3 cells after the drop. They are now put in a multicell that stack our cells to generated a 3 deep model
multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep)  
### dropout for the softmax layer same that for each cell, we remove 20% of the neuron that will be not activated temporaly
 
Yr, H = tf.nn.dynamic_rnn(multicell, Xo, dtype=tf.float32, initial_state=Hin) #We enroll the cell to obtain a 3 deep cell of 30 characters (here our X will always have the same size)
# Yr: [ BATCHSIZE, SEQLEN, INTERNALSIZE ]
# H:  [ BATCHSIZE, INTERNALSIZE*NLAYERS ] # this is the last state in the sequence
 
H = tf.identity(H, name='H')  # just to give it a name
 
# Softmax layer implementation:
# Flatten the first two dimension of the output [ BATCHSIZE, SEQLEN, ALPHASIZE ] => [ BATCHSIZE x SEQLEN, ALPHASIZE ]
# then apply softmax readout layer. This way, the weights and biases are shared across unrolled time steps.
# From the readout point of view, a value coming from a sequence time step or a minibatch item is the same thing.
 
Yflat = tf.reshape(Yr, [-1, INTERNALSIZE])    # [ BATCHSIZE x SEQLEN, INTERNALSIZE ] #The output of the multicell computation that get 2D
Ylogits = layers.linear(Yflat, ALPHASIZE)     # [ BATCHSIZE x SEQLEN, ALPHASIZE ] #The output become sequence of results for each possible letter
Yflat_ = tf.reshape(Yo_, [-1, ALPHASIZE])     # [ BATCHSIZE x SEQLEN, ALPHASIZE ] #The real output get the same action to be at correct dimension
loss = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Yflat_)  # [ BATCHSIZE x SEQLEN ]  #We get the loss comparing the output from the multicell and the real output
loss = tf.reshape(loss, [batchsize, -1])      # [ BATCHSIZE, SEQLEN ] #The loss is reshaped to correspond to each sequence of characters
Yo = tf.nn.softmax(Ylogits, name='Yo')        # [ BATCHSIZE x SEQLEN, ALPHASIZE ] #Softmax is apply to get a value between 0 and 1 
Y = tf.argmax(Yo, 1)                          # [ BATCHSIZE x SEQLEN ] #Argmax to get the higher value between the 92 possible char. This value will be 1 and the other one 0. 
Y = tf.reshape(Y, [batchsize, -1], name="Y")  # [ BATCHSIZE, SEQLEN ] #Here the reshape will get the number which have 1 (the value of the preicted letter)

train_step = tf.train.AdamOptimizer(lr).minimize(loss) #AdamOptimizer is used 
 
 
 
# stats for display
#You don't need to care about this session. it is for visualisation checkpoints and backup
 
seqloss = tf.reduce_mean(loss, 1)
batchloss = tf.reduce_mean(seqloss)
accuracy = tf.reduce_mean(tf.cast(tf.equal(Y_, tf.cast(Y, tf.uint8)), tf.float32))
loss_summary = tf.summary.scalar("batch_loss", batchloss)
acc_summary = tf.summary.scalar("batch_accuracy", accuracy)
summaries = tf.summary.merge([loss_summary, acc_summary])
 
# Init Tensorboard stuff. This will save Tensorboard information into a different
# folder at each run named 'log/<timestamp>/'. Two sets of data are saved so that
# you can compare training and validation curves visually in Tensorboard.
 
timestamp = str(math.trunc(time.time()))
summary_writer = tf.summary.FileWriter("log/" + timestamp + "-training")
validation_writer = tf.summary.FileWriter("log/" + timestamp + "-validation")
 
# Init for saving models. They will be saved into a directory named 'checkpoints'.
# Only the last checkpoint is kept.
 
if not os.path.exists("checkpoints"):
 
    os.mkdir("checkpoints")
saver = tf.train.Saver(max_to_keep=1000)
 
 
 
# for display: init the progress bar
 
DISPLAY_FREQ = 50
_50_BATCHES = DISPLAY_FREQ * BATCHSIZE * SEQLEN
progress = txt.Progress(DISPLAY_FREQ, size=111+2, msg="Training on next "+str(DISPLAY_FREQ)+" batches")
 
# init
istate = np.zeros([BATCHSIZE, INTERNALSIZE*NLAYERS])  # initial zero input state
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
step = 0
 
# training loop
#For all the input text for 12 iteration 
#x the input sequence, y_ the output that should be predicted
for x, y_, epoch in txt.rnn_minibatch_sequencer(codetext, BATCHSIZE, SEQLEN, nb_epochs=12):
    

    ###Training part###

    # train on one minibatch
    feed_dict = {X: x, Y_: y_, Hin: istate, lr: learning_rate, pkeep: dropout_pkeep, batchsize: BATCHSIZE}
    _, y, ostate = sess.run([train_step, Y, H], feed_dict=feed_dict)
    
    # log training data for Tensorboard display a mini-batch of sequences (every 50 batches)
    if step % _50_BATCHES == 0:
 
        feed_dict = {X: x, Y_: y_, Hin: istate, pkeep: 1.0, batchsize: BATCHSIZE}  # no dropout for validation
        y, l, bl, acc, smm = sess.run([Y, seqloss, batchloss, accuracy, summaries], feed_dict=feed_dict)
        txt.print_learning_learned_comparison(x, y, l, bookranges, bl, acc, epoch_size, step, epoch)
        summary_writer.add_summary(smm, step)
 
 
    # run a validation step every 50 batches
    # The validation text should be a single sequence but that's too slow (1s per 1024 chars!),
    # so we cut it up and batch the pieces (slightly inaccurate)
    # tested: validating with 5K sequences instead of 1K is only slightly more accurate, but a lot slower.
    
    ###Testing part###
    if step % _50_BATCHES == 0 and len(valitext) > 0:
 
        VALI_SEQLEN = 1*1024  # Sequence length for validation. State will be wrong at the start of each sequence.
        bsize = len(valitext) // VALI_SEQLEN
        txt.print_validation_header(len(codetext), bookranges)
        vali_x, vali_y, _ = next(txt.rnn_minibatch_sequencer(valitext, bsize, VALI_SEQLEN, 1))  # all data in 1 batch
        vali_nullstate = np.zeros([bsize, INTERNALSIZE*NLAYERS])
        feed_dict = {X: vali_x, Y_: vali_y, Hin: vali_nullstate, pkeep: 1.0,  # no dropout for validation
                     batchsize: bsize}
 
        ls, acc, smm = sess.run([batchloss, accuracy, summaries], feed_dict=feed_dict)
        txt.print_validation_stats(ls, acc)
 
        # save validation data for Tensorboard
        validation_writer.add_summary(smm, step)
 
 
 
    # display a short text generated with the current weights and biases (every 150 batches)

    ###It will be a 1000 char text based on the actuel weights and bias of our GRUs
 
    if step // 3 % _50_BATCHES == 0:
        txt.print_text_generation_header()
        ry = np.array([[txt.convert_from_alphabet(ord("K"))]])
        rh = np.zeros([1, INTERNALSIZE * NLAYERS])
 
        for k in range(1000):
 
            ryo, rh = sess.run([Yo, H], feed_dict={X: ry, pkeep: 1.0, Hin: rh, batchsize: 1})
            rc = txt.sample_from_probabilities(ryo, topn=10 if epoch <= 1 else 2)
            print(chr(txt.convert_to_alphabet(rc)), end="")
            ry = np.array([[rc]])
 
        txt.print_text_generation_footer()
 
 
 
    # save a checkpoint (every 500 batches)
 
    if step // 10 % _50_BATCHES == 0:
 
        saved_file = saver.save(sess, 'checkpoints/rnn_train_' + timestamp, global_step=step)
        print("Saved file: " + saved_file)
 
 
 
    # display progress bar
    progress.step(reset=step % _50_BATCHES == 0)
 
 
    # loop state around
 
    istate = ostate
    step += BATCHSIZE * SEQLEN

Loading file ./fables.txt
Training text size is 0.47MB with 0.00KB set aside for validation. There will be 163 batches per epoch

   0 (epoch 0) fables.txt │       Monseigneur le Dauphin\  │                                │ loss: 4.46961
  30 (epoch 0) fables.txt │  j'aie seulement excit   les a │                                │ loss: 4.46495
  60 (epoch 0) fables.txt │ que nous sommes l'abr  g   de  │                                │ loss: 4.49148
  90 (epoch 0) fables.txt │   -dire des deux personnages q │                                │ loss: 4.46177
 120 (epoch 0) fables.txt │ endre, et le mena  a que ses m │                                │ loss: 4.43034
 150 (epoch 0) fables.txt │  son esprit, que les choses s' │                                │ loss: 4.45112
 180 (epoch 0) fables.txt │ ne fit servir que le m  me met │                                │ loss: 4.45336
 210 (epoch 0) fables.txt │ i toutefois les dieux l'ordonn │                                │ loss: 4.48003
 240 (

esb ruan ernesc s  sit ssu nttrn iue iiesnveeuqessursen siOsurtsntuuOsr csec sienecOst eirs rssi nensur seniOs qernOntnins{te ece neruotteeunroi nvrnvrrcrisnvitn tevvrenvune uesuib ouue e ouiuiu1u1rs  ns unsineiein triEEqvttnnnsuveurie iO sitOeqrqeequviuunssncuunsOeeuOnseersu irOeOsuean sOia raneeoernrenu iiru  uuisrn e enur iie eusqus uuei u -n tn s e  u1or rni  renooe-i eune  ouronoinnutsur{ u{-ntn n-r ssi-rv ss etuu iboerrin riiOsi itnn inve{vuev {iusuun{i nvuinieirein  nrtvu rnquutvOvstuburuiicisiis ve!see eenOrereusOie rOurrsssO-i  rn re OternsnOeOiuusotnirn i n-cqttcs {srrrt stsse  rrun icciisresevirqO  uO n i u   in u r truso eu seon-siis seesqq ur-torsuOurruci-cr-quOsOiee!c!ssu  u!n  i itiu tn  noni  rinevruuouo  s  ri in1un rsse - rs-   t   itstniretsouitsisti cu ir{ur qqrne n  rsiiOrtnveeeeOeseonieo{ -rv-nn euu t{nrt ii turcr  esie euuvts Oe  s nietes nunn e1uuiensn1u -re-s-rr n- eir OOuuOeuourOnen-resiu-- r  eOOe rr - ru e e  erti -ri  -s  oinooiut   nnsnoieeunnus ru u1r-t-n

KeyboardInterrupt: 

### Train a model (4/10)
Train a model that can learn to create text from a given input (text wise). Using a word embeding seen in class, like CBOW

Dont forget to explain what you do, why, and if it do look to be working