### Character-level language model: the model learns to predict the next character in a tweet

In [1]:
import pandas as pd
import numpy as np
import time
import tensorflow as tf

In [2]:
# Location of the tweet export, named so it is easy to repoint.
dataset_path = "dataset.csv"
data = pd.read_csv(dataset_path)

In [77]:
# Preview the first five rows to check which columns were loaded.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,date,favorites,retweets,hashtags
0,0,imagine recycling old game mechanics and calli...,2020-06-21 05:19:43+00:00,4,0,
1,1,imagine making ellie sing a full acoustic cove...,2020-06-21 04:49:42+00:00,3,0,
2,2,imagine a studio expecting their game to be a ...,2020-06-21 04:43:58+00:00,10,0,
3,4,Our love for anime girls actually stems from t...,2020-06-19 03:12:37+00:00,9,3,
4,5,i find it hard to believe that women should we...,2020-06-15 05:12:23+00:00,17,4,


In [43]:
# Summary statistics of tweet length, measured in characters.
tweet_lengths = data["text"].str.len()
tweet_lengths.describe()

count    646.000000
mean      85.597523
std       49.488161
min        9.000000
25%       48.250000
50%       79.000000
75%      112.000000
max      279.000000
Name: text, dtype: float64

In [4]:
# Join all tweets into one training corpus, separated by newlines.
text = data["text"].str.cat(sep="\n")

In [18]:
# Show the first 280 characters of the joined corpus.
corpus_preview = text[:280]
print(corpus_preview)

imagine recycling old game mechanics and calling the last of us 2 an innovative game
imagine making ellie sing a full acoustic cover of take on me and expecting the last of us 2 to be a game of the year contender
imagine a studio expecting their game to be a masterpiece after ove


In [19]:
# Vocabulary: every distinct character in the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()

In [21]:
# Vectorize the characters: build lookups in both directions.
# char2idx maps a character to its position in the sorted vocabulary;
# idx2char is the inverse, stored as an array so it can be indexed by id arrays.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of character ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [27]:
# Show the first 13 characters next to their integer encoding.
preview_chars = text[:13]
preview_ids = text_as_int[:13]
print('{} ---- characters mapped to int ---- > {}'.format(repr(preview_chars), preview_ids))

'imagine recyc' ---- characters mapped to int ---- > [68 72 60 66 68 73 64  1 77 64 62 84 62]


In [55]:
# Length (in characters) of a single model input.
seq_length = 100
# Each training example consumes seq_length + 1 characters (the input plus its
# one-character-shifted target), so count whole examples in the corpus rather
# than characters.
examples_per_epoch = len(text) // (seq_length + 1)

# Wrap the encoded corpus in a tf.data pipeline that yields one character id
# per element.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: the first few ids decode back to the start of the corpus.
for i in char_dataset.take(5):
    print(i, ": ", idx2char[i.numpy()])

tf.Tensor(68, shape=(), dtype=int32) :  i
tf.Tensor(72, shape=(), dtype=int32) :  m
tf.Tensor(60, shape=(), dtype=int32) :  a
tf.Tensor(66, shape=(), dtype=int32) :  g
tf.Tensor(68, shape=(), dtype=int32) :  i


In [56]:
# Group the character stream into chunks of seq_length + 1 ids; the trailing
# characters that do not fill a whole chunk are dropped.
chunk_length = seq_length + 1
sequences = char_dataset.batch(chunk_length, drop_remainder=True)
for item in sequences.take(5):
    decoded_chunk = ''.join(idx2char[item.numpy()])
    print(repr(decoded_chunk))

'imagine recycling old game mechanics and calling the last of us 2 an innovative game\nimagine making e'
'llie sing a full acoustic cover of take on me and expecting the last of us 2 to be a game of the year'
' contender\nimagine a studio expecting their game to be a masterpiece after overworking its devs so mu'
'ch that one of them threatened to leak the game just to get paid\nOur love for anime girls actually st'
'ems from the greeks who made an entire religion out of a 2D right triangle\ni find it hard to believe '


In [57]:
# Duplicate and shift: derive the training input and target from one chunk.
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].

    Args:
        chunk: Any sliceable sequence (tensor, array, list, string).

    Returns:
        A tuple ``(input_text, target_text)``, each one element shorter than
        ``chunk``.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) training pair.
dataset = sequences.map(map_func=split_input_target)

In [60]:
# Given characters 0..99 of a chunk as input, the target is characters 1..100.
for input_example, target_example in dataset.take(1):
    # Convert to NumPy before indexing the lookup array, as the other preview
    # cells do, instead of relying on implicit tensor-to-index conversion.
    print(repr(''.join(idx2char[input_example.numpy()])))
    print(repr(''.join(idx2char[target_example.numpy()])), "\n")

'imagine recycling old game mechanics and calling the last of us 2 an innovative game\nimagine making '
'magine recycling old game mechanics and calling the last of us 2 an innovative game\nimagine making e' 



In [61]:
# Visualise the training task: at each time step the model receives one
# character id and is expected to output the id of the next character.
# The ids are converted to NumPy once so that formatting and lookup-array
# indexing do not depend on implicit tensor conversions.
first_inputs = input_example[:5].numpy()
first_targets = target_example[:5].numpy()
for i, (input_idx, target_idx) in enumerate(zip(first_inputs, first_targets)):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 68 ('i')
  expected output: 72 ('m')
Step    1
  input: 72 ('m')
  expected output: 60 ('a')
Step    2
  input: 60 ('a')
  expected output: 66 ('g')
Step    3
  input: 66 ('g')
  expected output: 68 ('i')
Step    4
  input: 68 ('i')
  expected output: 73 ('n')


In [62]:
batch_size = 64
# Number of elements kept in the shuffle buffer. tf.data draws each output
# element at random from this buffer, so a buffer at least as large as the
# dataset gives a uniform shuffle; a larger buffer costs more memory.
buffer_size = 10000

# Build the batched pipeline from `sequences` rather than re-wrapping
# `dataset`, so re-running this cell does not batch already-batched data.
dataset = (
    sequences.map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [65]:
# Each element is an (input, target) pair of int32 tensors of shape (64, 100):
# 64 training examples of sequence length 100 per batch.
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [66]:
# Build the model (personal task: swap the GRU for an LSTM to learn how to build your own).
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> GRU -> Dense.

    Args:
        vocab_size: Number of distinct character ids; used as both the
            embedding input size and the width of the output layer.
        embedding_dim: Size of the vector each character id is embedded into.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the input shape (the GRU is
            stateful); the sequence length is left unspecified.

    Returns:
        A ``tf.keras.Sequential`` model that outputs, for every time step,
        ``vocab_size`` unnormalised scores (the Dense layer has no activation).
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # return_sequences=True: emit an output for every time step, not only the last.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

In [67]:
# Model hyperparameters.
vocab_size = len(vocab)  # number of distinct characters in the vocabulary
embedding_dim = 256      # size of each character embedding
rnn_units = 1024         # number of RNN units

In [68]:
# Instantiate the model with the hyperparameters defined above.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size,
)

In [75]:
# Run one batch through the untrained model to check the output shape:
# (batch size 64, sequence length 100, vocab size 86 unique characters/tokens).
for input_example_batch, target_example_batch in dataset.take(count=1):
    example_batch_predictions = model(input_example_batch)
    output_shape = example_batch_predictions.shape
    print(output_shape)

(64, 100, 86)


In [76]:
# Print the layer-by-layer summary: output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           22016     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 86)            88150     
Total params: 4,048,470
Trainable params: 4,048,470
Non-trainable params: 0
_________________________________________________________________


### Training
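With the RNN set up this way, text generation can be treated as a standard supervised classification problem: at every time step the model is given the current character x (together with the RNN state carried over from the earlier characters) and must predict the class of the next character y. This means we can train it with ordinary backpropagation and a standard classification loss (cross-entropy).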

In [78]:
# Loss function
def loss(labels, logits):
    """Sparse categorical cross-entropy between target ids and model outputs.

    Args:
        labels: Integer ids of the expected next characters.
        logits: Unnormalised model outputs (hence ``from_logits=True``).

    Returns:
        The cross-entropy values computed by
        ``tf.keras.losses.sparse_categorical_crossentropy`` (not reduced to a
        single scalar here).
    """
    per_step_loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )
    return per_step_loss

In [79]:
# Loss of the untrained model on the example batch.
example_batch_loss = loss(
    labels=target_example_batch,
    logits=example_batch_predictions,
)

In [83]:
# Mean loss for the batch of 64 is about 4.45, which is roughly ln(86): the
# score expected from guessing uniformly over the 86-character vocabulary.
prediction_shape = example_batch_predictions.shape
mean_batch_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", prediction_shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_batch_loss)

Prediction shape:  (64, 100, 86)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.453286
