### Character-level modelling implementation where model tries to predict the next character in a tweet

### Challenge to self: Convert implementation to LSTM to learn how to build models urself

In [1]:
import pandas as pd
import numpy as np
import time
import tensorflow as tf

In [2]:
data = pd.read_csv("dataset.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,text,date,favorites,retweets,hashtags
0,0,mobius strips are overrated theyre just bracel...,2020-08-03 04:07:12+00:00,5,1,
1,1,when is the next long weekend,2020-08-03 02:50:52+00:00,10,0,
2,2,just realized that parasite's ramdon is a case...,2020-07-30 04:16:08+00:00,6,1,
3,3,We are actually fucked lmao,2020-07-30 03:55:00+00:00,6,1,
4,4,To this day i still have no idea when i should...,2020-07-30 03:48:42+00:00,2,0,


In [4]:
data

Unnamed: 0.1,Unnamed: 0,text,date,favorites,retweets,hashtags
0,0,mobius strips are overrated theyre just bracel...,2020-08-03 04:07:12+00:00,5,1,
1,1,when is the next long weekend,2020-08-03 02:50:52+00:00,10,0,
2,2,just realized that parasite's ramdon is a case...,2020-07-30 04:16:08+00:00,6,1,
3,3,We are actually fucked lmao,2020-07-30 03:55:00+00:00,6,1,
4,4,To this day i still have no idea when i should...,2020-07-30 03:48:42+00:00,2,0,
...,...,...,...,...,...,...
983,983,Applies to the advice i give as well,2018-01-01 12:54:14+00:00,0,0,
984,984,Jan 5-6 ah,2018-01-01 12:24:09+00:00,3,0,
985,985,What was your favorite anime of 2017 and why i...,2018-01-01 05:38:32+00:00,0,0,
986,986,M R . W O R L D W I D E,2018-01-01 03:32:46+00:00,0,0,


In [5]:
data.text.str.len().describe()

count    988.000000
mean      78.675101
std       47.504095
min        2.000000
25%       42.000000
50%       71.000000
75%      105.250000
max      279.000000
Name: text, dtype: float64

In [6]:
text = data.text.str.cat(sep="~")

In [7]:
print(text[:280])

mobius strips are overrated theyre just bracelets that u didnt put on correctly~when is the next long weekend~just realized that parasite's ramdon is a case study for barthes' steak and chips thought experiment and now i want to kill myself for thinking abt barthes for fun~We are


In [8]:
vocab = sorted(set(text))

In [9]:
#vectorizing characters
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

In [10]:
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'mobius strips' ---- characters mapped to int ---- > [72 74 61 68 80 78  0 78 79 77 68 75 78]


In [11]:
#Length of a single input
seq_length = 20
examples_per_epoch = len(text)

#convert to tensor
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
    print(i, ": ", idx2char[i.numpy()])

tf.Tensor(72, shape=(), dtype=int32) :  m
tf.Tensor(74, shape=(), dtype=int32) :  o
tf.Tensor(61, shape=(), dtype=int32) :  b
tf.Tensor(68, shape=(), dtype=int32) :  i
tf.Tensor(80, shape=(), dtype=int32) :  u


In [12]:
#Make batches of seq_length
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'mobius strips are ove'
'rrated theyre just br'
'acelets that u didnt '
'put on correctly~when'
' is the next long wee'


In [13]:
#duplicate and shift (make train and target)
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [14]:
#Given [0:99], predict [1:100] 
for input_example, target_example in dataset.take(1):
    print(repr(''.join(idx2char[input_example])))
    print(repr(''.join(idx2char[target_example])), "\n")

  This is separate from the ipykernel package so we can avoid doing imports until


IndexError: too many indices for array: array is 1-dimensional, but 20 were indexed

In [15]:
#visualization of training task for each rnn unit
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 72 ('m')
  expected output: 74 ('o')
Step    1
  input: 74 ('o')
  expected output: 61 ('b')
Step    2
  input: 61 ('b')
  expected output: 68 ('i')
Step    3
  input: 68 ('i')
  expected output: 80 ('u')
Step    4
  input: 80 ('u')
  expected output: 78 ('s')


In [16]:
batch_size = 64
buffer_size = 10000 #buffer size is for memory i think

dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [17]:
#64 training examples of seq length 100
dataset

<BatchDataset shapes: ((64, 20), (64, 20)), types: (tf.int32, tf.int32)>

In [18]:
#build the thing (Personal task: change to LSTM to learn how to build ur own )
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, dropout=0.5),
        
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, dropout=0.5),
        
        tf.keras.layers.Dense(vocab_size) 
    ])
    
    return model

In [19]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 512

In [20]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

In [21]:
# 64 batch size, 100 seq length per batch, 86 vocab size (unique characters/tokens)
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

(64, 20, 87)


In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           22272     
_________________________________________________________________
lstm (LSTM)                  (64, None, 512)           1574912   
_________________________________________________________________
lstm_1 (LSTM)                (64, None, 512)           2099200   
_________________________________________________________________
dense (Dense)                (64, None, 87)            44631     
Total params: 3,741,015
Trainable params: 3,741,015
Non-trainable params: 0
_________________________________________________________________


### Training
With the setup of RNN, text generation will be treated like a basic supervised classification problem (given char x, predict char y), meaning we can use standard backprop rules and loss functions (cross entropy)

In [23]:
#loss function
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [24]:
example_batch_loss = loss(target_example_batch, example_batch_predictions)

In [25]:
#mean loss for the batch of 64 is 4.45
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 20, 87)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.466755


In [26]:
#Saving checkpoints (remember this lol)
import os

checkpoint_dir = 'D:/garbage_models/twitterbot'

checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

#saves weights in directory
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
)

early_stop_callback = tf.keras.callbacks.EarlyStopping(patience=5, monitor="loss", mode="min")

In [27]:
#compile model (adam optimizer because why not, loss = loss function)
model.compile(optimizer="adam", loss=loss)

In [28]:
from IPython.display import clear_output
epochs = 100
#train the model (training set, number of epochs, callback for saving checkpoints)
for i in range (0, 20):
    clear_output()
    print(i*100)
    history = model.fit(dataset, epochs=epochs)

1900
Train for 58 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/1

In [29]:
# prediction step will use batch size 1 because rnns take in input by a fixed batch size
# so its necessary to run it with a different batch size of 1 since our task is to predict the next character
# tf.train.latest_checkpoint(checkpoint_dir)
model.save_weights('./textbot/model')

In [30]:
#import weights to model 
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.load_weights('./textbot/model')
model.build(tf.TensorShape([1, None])) 

In [31]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            22272     
_________________________________________________________________
lstm_2 (LSTM)                (1, None, 512)            1574912   
_________________________________________________________________
lstm_3 (LSTM)                (1, None, 512)            2099200   
_________________________________________________________________
dense_1 (Dense)              (1, None, 87)             44631     
Total params: 3,741,015
Trainable params: 3,741,015
Non-trainable params: 0
_________________________________________________________________


### Prediction Loop
Set starting string, <br>
Get next character using prediction distribution, <br>
Next character is used as input, <br>
repeat until size reached <br>
<br>
note that because an rnn has more info, the state changes and gets context from previous timesteps

In [66]:
def generate_text(model, start_string):
    #number of characters to generate
    num_generate = 300
    
    #vectorize start string to number
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0) #converts to tensor batch i think? 
    
    #results
    output = []
    
    #Adjust temperature for predictability (higher for more surprising text (idk either))
    temperature = 0.25
    
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        
        #remove batch dimension
        predictions = tf.squeeze(predictions, 0)
        
        #next character from categorical distribution
        predictions = predictions/temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        
        #preedicted character = next input
        input_eval = tf.expand_dims([predicted_id], 0)
        
        output.append(idx2char[predicted_id])
        
    return (start_string + ''.join(output))

In [50]:
generate_text(model, "~")

'~i get turned on by jojo meme pages back hurts again so i dont get regretation of a philosophy does not compel is a joke~i stay it doesnt know that theres of something for work~I was just mikiding a hiphop version of this month~how the whole day later i still dont notell us what the game just john le'

In [51]:
generate_text(model, "p")

'p;fbclid=IwAR2Pdh8Bk proves that the way sora wears ago but fuck its a problem of my attic regression this here to things on my best for saying the same pitfalls of joke i last 5 minutes after und more time in school~Plato string any progress with ur own mouse~Ppl are laughing about machine learning '

In [49]:
generate_text(model, "c")

"cause its a very good so ur gonna hear a lot of landmare is being able to afford in my see god is this wakanist all my activity is the rest of the word fast enough to thinks my cynicism associated w blood really bad~This my face with shitposts >*st i can say i'm unemployed wtf~ime to be excited about"

In [80]:
import random
generate_text(model, idx2char[random.randint(0,86)] + idx2char[random.randint(0,86)])

"t3 >realized that i didnt realize how course im not above all the devil works harder.~I wasnt talking about machine learning is easy just take the same way shinji's the logical equivat a universe that exhis. I hope you'll be make a Transformers are the tequila CODE, we love working with so many people"