### Character-level modelling implementation where model tries to predict the next character in a tweet

### Challenge to self: Convert the implementation to LSTM to learn how to build models yourself

In [1]:
import pandas as pd
import numpy as np
import time
import tensorflow as tf

In [2]:
data = pd.read_csv("dataset.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,text,date,favorites,retweets,hashtags
0,0,mobius strips are overrated theyre just bracel...,2020-08-03 04:07:12+00:00,5,1,
1,1,just realized that parasite's ramdon is a case...,2020-07-30 04:16:08+00:00,6,1,
2,2,We are actually fucked lmao,2020-07-30 03:55:00+00:00,6,1,
3,6,i think my parents love my dog more than they ...,2020-07-26 16:56:55+00:00,7,1,
4,7,i think using diesel to disinfect masks is a g...,2020-07-21 09:13:11+00:00,5,1,


In [4]:
data.text.str.len().describe()

count    434.000000
mean      90.235023
std       49.755523
min        9.000000
25%       54.250000
50%       83.500000
75%      115.750000
max      279.000000
Name: text, dtype: float64

In [5]:
text = data.text.str.cat(sep="\n")

In [6]:
print(text[:280])

mobius strips are overrated theyre just bracelets that u didnt put on correctly
just realized that parasite's ramdon is a case study for barthes' steak and chips thought experiment and now i want to kill myself for thinking abt barthes for fun
We are actually fucked lmao
i think 


In [7]:
vocab = sorted(set(text))

In [8]:
# Build the lookup tables between characters and integer ids
# (id -> char as an array, char -> id as a dict), then encode the corpus.
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

text_as_int = np.array([char2idx[ch] for ch in text])

In [9]:
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'mobius strips' ---- characters mapped to int ---- > [69 71 58 65 77 75  1 75 76 74 65 72 75]


In [10]:
#Length of a single input
seq_length = 50
# Every training example consumes seq_length + 1 characters (the input plus
# its one-step-shifted target), so count whole chunks rather than characters.
examples_per_epoch = len(text) // (seq_length + 1)

#convert to tensor: a dataset that yields one character id per element
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# peek at the first five ids and the characters they decode to
for i in char_dataset.take(5):
    print(i, ": ", idx2char[i.numpy()])

tf.Tensor(69, shape=(), dtype=int32) :  m
tf.Tensor(71, shape=(), dtype=int32) :  o
tf.Tensor(58, shape=(), dtype=int32) :  b
tf.Tensor(65, shape=(), dtype=int32) :  i
tf.Tensor(77, shape=(), dtype=int32) :  u


In [11]:
# Group the character stream into chunks of seq_length + 1 ids: one extra id so
# each chunk can later be split into an input and a one-step-shifted target.
# drop_remainder=True discards a final chunk that is shorter than that.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'mobius strips are overrated theyre just bracelets t'
'hat u didnt put on correctly\njust realized that par'
"asite's ramdon is a case study for barthes' steak a"
'nd chips thought experiment and now i want to kill '
'myself for thinking abt barthes for fun\nWe are actu'


In [12]:
#duplicate and shift (make train and target)
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element following input[i].
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [13]:
# With seq_length = 50: the input is characters [0:50] of a chunk and the
# target is characters [1:51] of the same chunk (shifted one step ahead)
for input_example, target_example in dataset.take(1):
    print(repr(''.join(idx2char[input_example])))
    print(repr(''.join(idx2char[target_example])), "\n")

'mobius strips are overrated theyre just bracelets '
'obius strips are overrated theyre just bracelets t' 



In [14]:
# Show the training task for the first 5 timesteps of the example above:
# at each step the model receives one character id and should output the next one
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 69 ('m')
  expected output: 71 ('o')
Step    1
  input: 71 ('o')
  expected output: 58 ('b')
Step    2
  input: 58 ('b')
  expected output: 65 ('i')
Step    3
  input: 65 ('i')
  expected output: 77 ('u')
Step    4
  input: 77 ('u')
  expected output: 75 ('s')


In [15]:
batch_size = 64
buffer_size = 10000 # shuffle buffer: how many elements Dataset.shuffle samples from (not a memory setting)

# shuffle the (input, target) pairs, then group them into batches;
# drop_remainder=True keeps every batch at exactly batch_size examples
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [16]:
#each element is a batch of 64 (input, target) pairs, each of sequence length 50
dataset

<BatchDataset shapes: ((64, 50), (64, 50)), types: (tf.int32, tf.int32)>

In [17]:
#build the thing (Personal task: change to LSTM to learn how to build ur own )
def build_model(vocab_size, embedding_dim, rnn_units, batch_size, num_layers=3):
    """Build a stacked, stateful LSTM model for character-level prediction.

    Args:
        vocab_size: number of distinct characters; used as the embedding
            input size and as the width of the output layer.
        embedding_dim: size of each character embedding vector.
        rnn_units: number of units in every LSTM layer.
        batch_size: fixed batch size baked into the model's input shape
            (the LSTM layers are stateful, so it must be known up front).
        num_layers: how many LSTM layers to stack. Defaults to 3, which
            reproduces the original fixed architecture.

    Returns:
        An uncompiled tf.keras.Sequential model producing one vocab_size-wide
        vector of logits (no activation on the last layer) per input timestep.

    Raises:
        ValueError: if num_layers is less than 1.
    """
    if num_layers < 1:
        raise ValueError("num_layers must be at least 1")

    layers = [
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
    ]
    # return_sequences=True so every timestep gets a prediction, not just the last
    for _ in range(num_layers):
        layers.append(tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True))
    layers.append(tf.keras.layers.Dense(vocab_size))

    return tf.keras.Sequential(layers)

In [18]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 512

In [19]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

In [20]:
# Sanity check on one batch: output shape is (batch_size=64, seq_length=50, vocab_size=83)
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

(64, 50, 83)


In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           21248     
_________________________________________________________________
lstm (LSTM)                  (64, None, 512)           1574912   
_________________________________________________________________
lstm_1 (LSTM)                (64, None, 512)           2099200   
_________________________________________________________________
lstm_2 (LSTM)                (64, None, 512)           2099200   
_________________________________________________________________
dense (Dense)                (64, None, 83)            42579     
Total params: 5,837,139
Trainable params: 5,837,139
Non-trainable params: 0
_________________________________________________________________


### Training
With the setup of RNN, text generation will be treated like a basic supervised classification problem (given char x, predict char y), meaning we can use standard backprop rules and loss functions (cross entropy)

In [22]:
#loss function
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    The model's last layer has no activation, hence from_logits=True.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [23]:
example_batch_loss = loss(target_example_batch, example_batch_predictions)

In [24]:
# mean loss of the untrained model on this batch of 64 is ~4.42, which is
# about ln(83) ~= 4.42, i.e. what uniform guessing over the 83-character vocabulary gives
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 50, 83)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.4192004


In [25]:
#Saving checkpoints (remember this lol)
import os

# NOTE(review): absolute, machine-specific path; consider a path relative to the notebook
checkpoint_dir = 'D:/garbage_models/twitterbot'

# "{epoch}" is left as a placeholder in the file name prefix
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

#callback that saves only the weights under checkpoint_prefix
# NOTE(review): this callback is never passed to model.fit in the training cell,
# so as written no checkpoints are produced
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
)


In [26]:
#compile model (adam optimizer because why not, loss = loss function)
model.compile(optimizer="adam", loss=loss)

In [27]:
epochs = 200
#train the model (training set, number of epochs); no checkpoint callback is
#passed here, so nothing is saved during training
history = model.fit(dataset, epochs=epochs)

Train for 12 steps
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Ep

In [30]:
# The trained model was built with a fixed batch size of 64, but generation feeds
# it a single sequence at a time. So the weights are saved here and then loaded
# into a copy of the model built with batch size 1.
# tf.train.latest_checkpoint(checkpoint_dir)
model.save_weights('./model')

In [31]:
# Rebuild the same architecture with batch_size=1 and load the trained weights into it
# (note: this rebinds `model`, replacing the batch-size-64 training model)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.load_weights('./model')
model.build(tf.TensorShape([1, None])) 

In [32]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            21248     
_________________________________________________________________
lstm_3 (LSTM)                (1, None, 512)            1574912   
_________________________________________________________________
lstm_4 (LSTM)                (1, None, 512)            2099200   
_________________________________________________________________
lstm_5 (LSTM)                (1, None, 512)            2099200   
_________________________________________________________________
dense_1 (Dense)              (1, None, 83)             42579     
Total params: 5,837,139
Trainable params: 5,837,139
Non-trainable params: 0
_________________________________________________________________


### Prediction Loop
Set starting string, <br>
Get next character using prediction distribution, <br>
Next character is used as input, <br>
repeat until size reached <br>
<br>
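Note that because the RNN is stateful, its internal state carries over from one call to the next, so each prediction has context from all previous timesteps even though only the latest character is fed in.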

In [33]:
def generate_text(model, start_string, num_generate=300, temperature=1.0):
    """Sample text from the model one character at a time.

    Args:
        model: stateful character model built with batch size 1; it is called
            as model(ids) and its output is treated as logits of shape
            (1, timesteps, vocab_size).
        start_string: non-empty seed text. Every character must be a key of
            the notebook-level char2idx table.
        num_generate: number of characters to generate after the seed.
        temperature: divisor applied to the logits before sampling. Values
            below 1.0 give more predictable text, values above 1.0 give more
            surprising text.

    Returns:
        start_string followed by the num_generate sampled characters.

    Raises:
        ValueError: if start_string is empty or temperature is not positive.
        KeyError: if start_string contains a character missing from char2idx.
    """
    # Fail early with a clear message instead of an obscure error from inside
    # the model (empty input) or a division by zero on the logits.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # vectorize the seed and add a leading batch dimension -> shape (1, len(start_string))
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    #results
    output = []

    # start from a clean recurrent state so earlier calls don't leak into this one
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)

        #remove batch dimension -> (timesteps, vocab_size)
        predictions = tf.squeeze(predictions, 0)

        #sample from the categorical distribution; only the sample for the
        #last timestep is the next character
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        #predicted character = next input (the model's state holds the earlier context)
        input_eval = tf.expand_dims([predicted_id], 0)

        output.append(idx2char[predicted_id])

    return (start_string + ''.join(output))

In [34]:
generate_text(model, "a")

'ads\nthere\'s alvadoat have a box plot option on gsheets????????\nFunny mems kaory by joure" "morking mahing gat\nFon CODE the russian revolution and not be about communisme hungry for lechon\nAre you telling me that I finished im laptop how to interacc pose\nshare, and the one time it finally happens it\'s'