In [None]:
import tensorflow as tf
import numpy as np
import os
import time

In [None]:

# Location of the lyrics corpus. Defaults to the original Colab upload path;
# set the LYRICS_DATASET_PATH environment variable to run the notebook
# somewhere else without editing this cell.
path_to_file = os.environ.get('LYRICS_DATASET_PATH', '/content/lyrics_dataset.txt')

In [None]:

# Read the corpus as raw bytes and decode it as UTF-8. Binary mode avoids any
# newline translation, so the '\r\n' line endings in the data are preserved.
# The `with` block closes the file handle as soon as the read is finished.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 404640 characters


In [None]:
# A look at the first 100 characters in text
print(text[:100])

I hate you for what you did
And I miss you like a little kid
I faked it every time
But that's alr


In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order
vocab = list(set(text))
vocab.sort()
vocab_report = '{} unique characters'.format(len(vocab))
print (vocab_report)

98 unique characters


In [None]:
# Lookup tables: character -> integer id, and integer id -> character
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# The whole corpus encoded as an array of integer ids
text_as_int = np.array([char2idx[char] for char in text])

# Preview the first 20 entries of the character -> id table
print('{ ===========>')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n==========>}')

# Show how the first 20 characters from the text are mapped to integers
head_chars = repr(text[:20])
head_ids = text_as_int[:20]
print ('{} ==> characters mapped to int ==> {}'.format(head_chars, head_ids))


  '\n':   0,
  '\r':   1,
  ' ' :   2,
  '!' :   3,
  '"' :   4,
  '&' :   5,
  "'" :   6,
  '(' :   7,
  ')' :   8,
  '*' :   9,
  ',' :  10,
  '-' :  11,
  '.' :  12,
  '/' :  13,
  '0' :  14,
  '1' :  15,
  '2' :  16,
  '3' :  17,
  '4' :  18,
  '5' :  19,
  ...
'I hate you for what ' ==> characters mapped to int ==> [35  2 61 54 73 58  2 78 68 74  2 59 68 71  2 76 61 54 73  2]


In [None]:
# Length, in characters, of a single model input
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

# Stream of individual character ids, one element per character of the corpus
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first five characters of the stream
for char_id in char_dataset.take(5):
  print(idx2char[char_id.numpy()], end="")

# Group the stream into chunks of seq_length+1 ids (one extra id so each chunk
# can later be split into an input and a shifted target); a trailing partial
# chunk is dropped
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Peek at the first five chunks, decoded back to text
for chunk in sequences.take(5):
  decoded_chunk = ''.join(idx2char[chunk.numpy()])
  print(repr(decoded_chunk))

I hat"I hate you for what you did\r\nAnd I miss you like a little kid\r\nI faked it every time\r\nBut that's alri"
'ght\r\nI can hardly feel anything\r\nI hardly feel anything at all\r\nYou gave me fifteen hundred\r\nTo see y'
'our hypnotherapist\r\nI only went one time\r\nYou let it slide\r\nFell on hard times a year ago\r\nWas hoping'
' you would let it go, and you did\r\nI have emotional motion sickness\r\nSomebody roll the windows down\r\n'
"There are no words in the English language\r\nI could scream to drown you out\r\nI'm on the outside looki"


In [None]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair shifted by one character
dataset = sequences.map(split_input_target)


# Decode and show the first pair to check the one-character shift
for input_ids, target_ids in dataset.take(1):
  input_preview = repr(''.join(idx2char[input_ids.numpy()]))
  target_preview = repr(''.join(idx2char[target_ids.numpy()]))
  print ('Input data: ', input_preview)
  print ('Target data:', target_preview)


Input data:  "I hate you for what you did\r\nAnd I miss you like a little kid\r\nI faked it every time\r\nBut that's alr"
Target data: " hate you for what you did\r\nAnd I miss you like a little kid\r\nI faked it every time\r\nBut that's alri"


In [None]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of re-wrapping the
# existing `dataset` variable: this keeps the cell idempotent, so running it
# a second time does not batch data that has already been batched.
dataset = (sequences
           .map(split_input_target)
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE, drop_remainder=True))

dataset


# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1500 # keep between (1024 -> 1800) for best results


In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Build the character-level model: Embedding -> GRU -> Dropout -> Dense.

  Args:
    vocab_size: Number of distinct characters; used as the embedding input
      size and as the number of output logits per timestep.
    embedding_dim: Size of each character embedding vector.
    rnn_units: Number of units in the GRU layer.
    batch_size: Fixed batch size. The GRU is stateful, so the batch size is
      baked into the model through `batch_input_shape`.

  Returns:
    A `tf.keras.Sequential` model whose output has shape
    (batch_size, sequence_length, vocab_size) and holds unnormalized logits.
  """
  model = tf.keras.Sequential([

    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),

    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),

    # Regularize the recurrent features rather than the output: dropout
    # applied after the final layer would randomly zero logits.
    tf.keras.layers.Dropout(0.2),

    # Linear output layer. The loss and the sampling code both treat these
    # values as raw logits (from_logits=True / tf.random.categorical), so no
    # activation is applied; a ReLU here would clip every negative logit to 0.
    tf.keras.layers.Dense(vocab_size),
  ])
  return model


# Training model: fixed batch size of BATCH_SIZE because the GRU is stateful
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)


# Push one batch through the untrained model to check the output shape
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 98) # (batch_size, sequence_length, vocab_size)


In [None]:
model.summary()


# Draw one character id per timestep from the logits of the first example in
# the batch, then drop the trailing sample axis and convert to a NumPy array
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()


sampled_indices


# Decode the input sequence and the sampled ids back to text
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices])))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           25088     
                                                                 
 gru (GRU)                   (64, None, 1500)          7911000   
                                                                 
 dense (Dense)               (64, None, 98)            147098    
                                                                 
 dropout (Dropout)           (64, None, 98)            0         
                                                                 
Total params: 8083186 (30.83 MB)
Trainable params: 8083186 (30.83 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Input: 
 "mes our way\r\nAnd we ride down the kings highway\r\n\r\nNo you can't hide out\r\nIn a six gun town\r\nWe wann"

Next Char Predictions: 
 

In [None]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  `logits` are treated as unnormalized scores (from_logits=True).
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels,
      y_pred=logits,
      from_logits=True,
  )

# Loss of the untrained model on the example batch, averaged over all values
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", np.mean(example_batch_loss.numpy()))


Prediction shape:  (64, 100, 98)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.5846987


In [None]:
# Train with Adam against the logits-based cross-entropy defined in `loss`
model.compile(loss=loss, optimizer='adam')

In [None]:
# Checkpoints are written under this directory
checkpoint_dir = './training_checkpoints'

# File name pattern for the checkpoints; "{epoch}" is left in the string for
# the callback to fill in
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback handed to model.fit; it stores weights only, not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [None]:


# Number of passes over the training dataset
EPOCHS = 5


# Train the model, running the checkpoint callback during training
history = model.fit(
    dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
def generate_text(model, chars_to_generate , temp , start_string):
  """Generate text one character at a time from a trained stateful model.

  NOTE: this notebook defines generate_text in two cells; the definition that
  is executed last is the one in effect, so keep the two in sync.

  Args:
    model: Model built with batch size 1. It is called as `model(ids)` and
      its output is treated as logits of shape (1, sequence_length, vocab_size).
    chars_to_generate: Number of characters to sample after the seed.
    temp: Sampling temperature, must be > 0. Low temperatures give more
      predictable text, higher temperatures give more surprising text.
    start_string: Non-empty seed text. Every character must be a key of the
      notebook-level `char2idx` table.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: If `temp` is not positive or `start_string` is empty.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  # Fail early with a clear message: a non-positive temperature turns the
  # logits into inf/nan, and an empty seed gives the model nothing to consume.
  if temp <= 0:
    raise ValueError('temp must be a positive number, got {!r}'.format(temp))
  if not start_string:
    raise ValueError('start_string must contain at least one character')

  # Converting our start string to numbers (vectorizing), with a batch axis
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  # Generated characters, joined into one string at the end
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(chars_to_generate):
      predictions = model(input_eval)
      # Remove the batch dimension and keep only the last timestep: that is
      # the only position whose sample is used, so there is no need to draw
      # a sample for every earlier position of the seed.
      last_logits = tf.squeeze(predictions, 0)[-1:, :] / temp

      # Sample the next character id from the temperature-scaled logits
      predicted_id = tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy()

      # We pass the predicted character as the next input to the model;
      # the stateful GRU carries the previous hidden state between calls
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [None]:
# Show the path prefix of the most recent checkpoint found in checkpoint_dir
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_5'

In [None]:

# Rebuild the network with batch size 1 for text generation: the stateful GRU
# fixes the batch size at construction time, so the training model (built
# with BATCH_SIZE) cannot be fed a single sequence.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# latest_checkpoint returns None when no checkpoint exists; fail with a clear
# message here instead of an obscure error inside load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        'No checkpoint found in {!r}; run the training cell first.'.format(checkpoint_dir))

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [None]:
def generate_text(model, chars_to_generate , temp , start_string):
  """Generate text one character at a time from a trained stateful model.

  NOTE: this notebook defines generate_text in two cells; the definition that
  is executed last is the one in effect, so keep the two in sync.

  Args:
    model: Model built with batch size 1. It is called as `model(ids)` and
      its output is treated as logits of shape (1, sequence_length, vocab_size).
    chars_to_generate: Number of characters to sample after the seed.
    temp: Sampling temperature, must be > 0. Low temperatures give more
      predictable text, higher temperatures give more surprising text.
    start_string: Non-empty seed text. Every character must be a key of the
      notebook-level `char2idx` table.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: If `temp` is not positive or `start_string` is empty.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  # Fail early with a clear message: a non-positive temperature turns the
  # logits into inf/nan, and an empty seed gives the model nothing to consume.
  if temp <= 0:
    raise ValueError('temp must be a positive number, got {!r}'.format(temp))
  if not start_string:
    raise ValueError('start_string must contain at least one character')

  # Converting our start string to numbers (vectorizing), with a batch axis
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  # Generated characters, joined into one string at the end
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(chars_to_generate):
      predictions = model(input_eval)
      # Remove the batch dimension and keep only the last timestep: that is
      # the only position whose sample is used, so there is no need to draw
      # a sample for every earlier position of the seed.
      last_logits = tf.squeeze(predictions, 0)[-1:, :] / temp

      # Sample the next character id from the temperature-scaled logits
      predicted_id = tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy()

      # We pass the predicted character as the next input to the model;
      # the stateful GRU carries the previous hidden state between calls
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))


In [None]:
from numpy import arange

# Number of characters to generate (keep between 250 to 500)
chars_to_generate = 500

# Printing the generated text
# Temperature 1.0 gives the craziest output and 0.1 gives the lowest variance
# Keeping the temperature at 0.35 gives the most meaningful / coherent text.

# The seed string becomes the beginning of the generated text
print(generate_text(model , chars_to_generate , 0.35 , start_string=u"brook"))

# Uncomment below to compare the output across temperatures ==>
# (this sweep is the only use of the `arange` import above)

# for i in arange(0.1,1.1,0.1):
#   print("==============")
#   print("FOR TEMP : {} ".format(i))
#   print("==============")
#   print(generate_text(model , chars_to_generate , i , start_string=u"Love "))
#   print()


slipper fill in the way with me poon
The love, we'll be my beade
I don't need you were me pace to the way the love
It's not the light with my light


I could be with the love wern the plawe


So I don't need you were me heart


I don't wenna the cander come come in my headted


It's not the paster, when I'm sood me love


It's not I want a fall for a pristed the wind of my heart and she love
I can't heart in my the wante to the roon


There when I can't not the cand to the plowe w
