In [None]:
# importing tensorflow and numpy
import tensorflow as tf
import numpy as np

Get and prepare data

In [None]:
## Retrieve the Shakespeare input file and prepare_data.py (e.g. when running in Colab).
## -nc (--no-clobber) skips a download when the target file already exists, so this
## cell can be re-run safely without creating duplicate copies of the files.

!wget -nc 'https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt'
!wget -nc 'https://ovgu-ailab.github.io/idl2019/assignments/5/prepare_data.py'

--2020-05-25 20:07:45--  https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4573338 (4.4M) [text/plain]
Saving to: ‘shakespeare_input.txt’


2020-05-25 20:07:46 (13.8 MB/s) - ‘shakespeare_input.txt’ saved [4573338/4573338]

--2020-05-25 20:07:47--  https://ovgu-ailab.github.io/idl2019/assignments/5/prepare_data.py
Resolving ovgu-ailab.github.io (ovgu-ailab.github.io)... 185.199.108.153, 185.199.110.153, 185.199.111.153, ...
Connecting to ovgu-ailab.github.io (ovgu-ailab.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5611 (5.5K) [application/octet-stream]
Saving to: ‘prepare_data.py’


2020-05-25 20:07:47 (36.0 MB/s) - ‘prepare_data.py’ saved [5611/5611]



In [None]:
 # Preprocess the raw text with the downloaded script, using "skp" as the output
 # name. NOTE(review): this is presumably what writes the `skp.tfrecords` and
 # `skp_vocab` files read by later cells -- confirm against prepare_data.py.
 !python prepare_data.py shakespeare_input.txt skp

2020-05-25 20:07:58.689847: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
Split input into 22981 sequences...
Serialized 100 sequences...
Serialized 200 sequences...
Serialized 300 sequences...
Serialized 400 sequences...
Serialized 500 sequences...
Serialized 600 sequences...
Serialized 700 sequences...
Serialized 800 sequences...
Serialized 900 sequences...
Serialized 1000 sequences...
Serialized 1100 sequences...
Serialized 1200 sequences...
Serialized 1300 sequences...
Serialized 1400 sequences...
Serialized 1500 sequences...
Serialized 1600 sequences...
Serialized 1700 sequences...
Serialized 1800 sequences...
Serialized 1900 sequences...
Serialized 2000 sequences...
Serialized 2100 sequences...
Serialized 2200 sequences...
Serialized 2300 sequences...
Serialized 2400 sequences...
Serialized 2500 sequences...
Serialized 2600 sequences...
Serialized 2700 sequences...
Serialized 2800 sequences...
Serialized 2900

In [None]:
from prepare_data import parse_seq
import pickle

# Raw TFRecord dataset: each element is a serialized byte string (not yet decoded).
data = tf.data.TFRecordDataset("skp.tfrecords")

# Map the parser over the dataset so that every element is decoded from its bytes
# (with fixed sequence length 200).
# If you change the sequence length in preprocessing you also need to change it here.
data = data.map(lambda x: parse_seq(x, 200))

# A map from characters to indices.
# NOTE: pickle.load can execute arbitrary code -- only load a vocab file that was
# produced by a prepare_data.py you trust.
# The context manager makes sure the file handle is closed after loading.
with open("skp_vocab", mode="rb") as vocab_file:
  vocab = pickle.load(vocab_file)
vocab_size = len(vocab)
# Inverse mapping: indices to characters.
ind_to_ch = {ind: ch for (ch, ind) in vocab.items()}

print(vocab)
print(vocab_size)

{'k': 1, ']': 2, '[': 3, 'h': 4, 'N': 5, 'P': 6, 'x': 7, "'": 8, ':': 9, 'o': 10, '!': 11, 'Q': 12, 'X': 13, 'T': 14, '$': 15, 'c': 16, '-': 17, 'L': 18, 't': 19, 'R': 20, 'n': 21, 'K': 22, ',': 23, 'g': 24, ' ': 25, 'f': 26, 'G': 27, 'u': 28, 'i': 29, 'm': 30, 'p': 31, 'I': 32, 'W': 33, 'U': 34, 'F': 35, 'Y': 36, 'D': 37, ';': 38, '3': 39, 'B': 40, 'z': 41, 'J': 42, 'r': 43, 'A': 44, 'H': 45, 's': 46, 'E': 47, 'C': 48, 'q': 49, 'O': 50, 'l': 51, 'a': 52, 'V': 53, 'Z': 54, '?': 55, 'y': 56, 'S': 57, 'w': 58, 'e': 59, 'd': 60, 'M': 61, '&': 62, 'b': 63, '.': 64, '\n': 65, 'v': 66, 'j': 67, '<S>': 0}
68


In [None]:
# Display the parsed dataset; its repr reports the per-element shape and dtype.
data

<MapDataset shapes: (200,), types: tf.int32>

We need to create batches out of the encoded data and define the length of the sequence that needs to be processed in one step.

In [None]:
# Input-pipeline settings.
batch_size = 128        # number of sequences per batch
sequence_length = 200   # length of each sequence (same value as used when parsing the records)
buffer_size = 10000     # size of the shuffle buffer

# Build the pipeline one stage at a time: shuffle, repeat indefinitely, then batch.
data = data.shuffle(buffer_size)
data = data.repeat()
data = data.batch(batch_size)

In [None]:
# Display the batched dataset; the repr now shows an additional leading batch dimension.
data

<BatchDataset shapes: (None, 200), types: tf.int32>

In [None]:
# Number of units in the recurrent hidden layer.
n_hidden = 512

# Initializer that draws the initial weight values from a random normal distribution.
random_normal = tf.initializers.RandomNormal()

# Weight matrices: input->hidden, hidden->hidden and hidden->output.
weights = dict(
  w_xh=tf.Variable(random_normal([vocab_size, n_hidden]), dtype=tf.dtypes.float32),
  w_hh=tf.Variable(random_normal([n_hidden, n_hidden]), dtype=tf.dtypes.float32),
  w_ho=tf.Variable(random_normal([n_hidden, vocab_size]), dtype=tf.dtypes.float32),
)

# Bias vectors for the hidden and the output layer, both starting at zero.
biases = dict(
  b_h=tf.Variable(tf.zeros([n_hidden]), dtype=tf.dtypes.float32),
  b_o=tf.Variable(tf.zeros([vocab_size]), dtype=tf.dtypes.float32),
)

# Every variable that receives gradient updates; the order here is the order in
# which gradients are computed and applied.
all_trainable_variables = [
  weights['w_xh'],
  weights['w_hh'],
  biases['b_h'],
  weights['w_ho'],
  biases['b_o'],
]

In [None]:
# Number of training steps, chosen after determining the loss over many more steps.
# As I am aiming for an average loss of ~1.4, 5500 suffices.
steps = 5500

# Loss function: sparse categorical cross-entropy computed directly on logits.
# https://www.tensorflow.org/api_docs/python/tf/keras/losses/SparseCategoricalCrossentropy
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Optimizer: Adam, with its default learning rate of 0.001 spelled out explicitly.
optimizer = tf.optimizers.Adam(learning_rate=0.001)

In [None]:
@tf.function
def rnn_on_sequence(sequence_batch):
  """Run one training step of the RNN on a batch of character-index sequences.

  The RNN is unrolled over the sequence: at every time step the current
  character is one-hot encoded, the hidden state is updated, and the logits
  for the next character are scored against the actual next character.

  Args:
    sequence_batch: integer tensor indexed as [batch, time]; each row is one
      sequence of character indices with at least `sequence_length` entries.

  Returns:
    Scalar float32 tensor: the loss averaged over the predicted time steps.

  Side effects:
    Updates `all_trainable_variables` in place through `optimizer`.
  """
  with tf.GradientTape() as tape:
    # Start every sequence in the batch from an all-zero hidden state.
    state = tf.zeros([tf.shape(sequence_batch)[0], n_hidden])
    total_loss = tf.constant(0, dtype=tf.dtypes.float32)

    # The last character has no successor to predict, hence sequence_length-1 steps.
    for time_step in range(sequence_length-1):
      step_input = tf.one_hot(sequence_batch[:,time_step], vocab_size)
      state = tf.nn.tanh(tf.matmul(step_input,weights['w_xh']) + tf.matmul(state, weights['w_hh']) + biases['b_h'])
      logits = tf.matmul(state, weights['w_ho']) + biases['b_o']

      # Target at this step is the next character of each sequence.
      loss = loss_fn(sequence_batch[:,time_step+1], logits)

      total_loss += loss

    # Average over the number of predicted steps. Index the *shape* (time axis),
    # not the batch: this keeps the result a scalar and does not require the
    # batch to contain a second row.
    total_loss /= tf.cast(tf.shape(sequence_batch)[1]-1, dtype=tf.dtypes.float32)
  gradients = tape.gradient(total_loss, all_trainable_variables)

  ## gradient normalization: every update is rescaled to unit global norm.
  ## (This always rescales; it is not a thresholded clip.)
  glob_norm = tf.linalg.global_norm(gradients)
  gradients = [g/glob_norm for g in gradients]

  optimizer.apply_gradients(zip(gradients,all_trainable_variables))

  return total_loss

In [None]:
# Train for exactly `steps` updates. `data` repeats indefinitely, so the stream
# is bounded with take() instead of breaking out after the update has already run.
for step, sequence in enumerate(data.take(steps)):
  xent_avg = rnn_on_sequence(sequence)
  # Report the loss of the current batch every 200 steps.
  if step % 200 == 0:
    print(f'Currently on step {step}. The average loss is {xent_avg}')

Currently on step 0. The average loss is [4.3381653]
Currently on step 200. The average loss is [2.2762394]
Currently on step 400. The average loss is [2.1251552]
Currently on step 600. The average loss is [2.0109375]
Currently on step 800. The average loss is [1.9201553]
Currently on step 1000. The average loss is [1.8323739]
Currently on step 1200. The average loss is [1.7784071]
Currently on step 1400. The average loss is [1.7079808]
Currently on step 1600. The average loss is [1.6818286]
Currently on step 1800. The average loss is [1.645739]
Currently on step 2000. The average loss is [1.6288478]
Currently on step 2200. The average loss is [1.5959691]
Currently on step 2400. The average loss is [1.5505608]
Currently on step 2600. The average loss is [1.5306225]
Currently on step 2800. The average loss is [1.5416465]
Currently on step 3000. The average loss is [1.523194]
Currently on step 3200. The average loss is [1.4927349]
Currently on step 3400. The average loss is [1.5014752]
C

In [None]:
def samples(n_steps):
  """Sample a piece of text from the RNN, one character at a time.

  Generation starts from index 0 and a zero hidden state. At every step the
  last generated index is fed through the network and the next index is drawn
  at random from the resulting softmax distribution.

  Args:
    n_steps: number of characters to sample after the initial index.

  Returns:
    The generated indices (including the initial one) mapped to their
    characters and joined into a single string.
  """
  hidden = tf.zeros([1,n_hidden])
  generated = [0]

  for _ in range(n_steps):
    # One-hot encode only the most recent index; the slice keeps a batch axis of 1.
    last_char = tf.one_hot(generated[-1:],depth=vocab_size)
    pre_activation = tf.matmul(last_char, weights['w_xh']) + tf.matmul(hidden,weights['w_hh']) + biases['b_h']
    hidden = tf.nn.tanh(pre_activation)

    logits = tf.matmul(hidden, weights['w_ho']) + biases['b_o']
    probs = tf.nn.softmax(logits).numpy()[0]

    # Draw the next index according to the predicted distribution.
    next_index = np.random.choice(vocab_size, p=probs)
    generated.append(next_index)

  return ''.join(ind_to_ch[ind] for ind in generated)

In [None]:
# Sample 500 characters from the current weights and print the resulting text.
print(samples(500))

<S>ave with far, and had sharty
With a moouthing, hear the holy hand,
For with a marked pasting to her honest
And leves no keeper, with the offer knave,
'To't 'Alove sill'd I but peace.

KING HENRY V:
I am sable such hiddes, which he my sapts
Stood to see it, and much all merding ncect
One of the bold end.
I shull be the see of his bodyor and no
lime--
To the request's black and not time,
I where no meam, burn this all by tender,
Mine eve ton un soury in yonder bring.

DUKE VINCENTIO:
Saith, I know
