In [78]:
import os 
import pandas as pd 
import tensorflow as tf 
import numpy as np

# Working directory for the notebook. The original machine-specific path is kept as the
# default; set the SHAKESPEARE_WORKDIR environment variable to point somewhere else.
directory=os.environ.get("SHAKESPEARE_WORKDIR", "C:/Users/sergi/Documents/Py")
if os.path.isdir(directory):
    os.chdir(directory)
else:
    # Do not crash on machines where the path does not exist: keep the current directory and say so.
    print(f"Working directory {directory!r} not found; staying in {os.getcwd()!r}")

In [79]:
# Download the Shakespeare corpus and load it into memory as a single string.
shakespeare_url="https://homl.info/shakespeare"
filepath=tf.keras.utils.get_file("shakespeare.txt",shakespeare_url)
# Explicit encoding: without it open() falls back to the platform's locale encoding
# (e.g. cp1252 on Windows), so the decoded text could differ between machines.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text=f.read()

In [80]:
# Sanity check: show the first 100 characters of the corpus
preview=shakespeare_text[:100]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


**Vectorisation**

The text is vectorised at character level: each (lower-cased) character is mapped to an integer token ID.

In [81]:
# Character-level tokenizer: the text is lower-cased and split into single characters
text_vec_layer=tf.keras.layers.TextVectorization(
    standardize="lower",
    split="character",
)
# Build the vocabulary from the full corpus (passed as a one-element batch)
text_vec_layer.adapt([shakespeare_text])

# Encode the corpus; the layer returns one row per input string, so keep row 0
encoded=text_vec_layer([shakespeare_text])[0]

# Number of entries in the layer's vocabulary (per the Keras docs this count also
# includes the padding and unknown-token entries); used to size the model layers
n_tokens=text_vec_layer.vocabulary_size()

# Total number of encoded characters in the corpus
dataset_size=len(encoded)
print(f"total numer of chars: {dataset_size}")

total numer of chars: 1115394


**Dataset of windows**

Turn the training data into a dataset of fixed-length windows that can be used to train an RNN.

In [82]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32, buffer_size=100_000):
    """Turn a token sequence into batches of (input, target) windows for next-token training.

    Every window holds ``length + 1`` consecutive elements of ``sequence``. The input is
    the first ``length`` elements of the window and the target is the last ``length``
    elements, i.e. the same window shifted by one position. So if the input is
    ``sequence[0:100]``, the target is ``sequence[1:101]``: inputs and targets have the
    same length.

    Args:
        sequence: 1-D sequence/tensor of token IDs.
        length: Number of elements in each input (and each target) sequence.
        shuffle: When truthy, shuffle the windows before batching.
        seed: Random seed forwarded to the shuffle step.
        batch_size: Number of windows per batch.
        buffer_size: Size of the shuffle buffer; a larger buffer gives better
            randomization. Defaults to the previously hard-coded 100_000.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` pairs, each of shape
        ``(batch, length)``.
    """
    ds = tf.data.Dataset.from_tensor_slices(sequence)  # one dataset element per token
    # Sliding windows of length + 1 tokens, moving one token at a time;
    # drop_remainder=True discards the trailing windows that would be shorter.
    ds = ds.window(length + 1, shift=1, drop_remainder=True)
    # Each window is itself a small dataset: batch it into a single tensor and flatten.
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
    if shuffle:
        ds = ds.shuffle(buffer_size=buffer_size, seed=seed)
    ds = ds.batch(batch_size)
    # Split every window into inputs (all but the last token) and targets (all but the first).
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [83]:
# Build the training, validation and test datasets from contiguous,
# non-overlapping slices of the encoded corpus.

length=100  # number of characters in every input window
tf.random.set_seed(42)

# Split boundaries (positions in the encoded corpus)
train_end=1_000_000
valid_end=1_050_000

# Only the training windows are shuffled
train_set=to_dataset(encoded[:train_end], length=length, shuffle=True, seed=42)
valid_set=to_dataset(encoded[train_end:valid_end], length=length)
test_set=to_dataset(encoded[valid_end:], length=length)


**Building and training a Char-RNN  Model**

The model predicts the next character in a sequence. For example, with "hello": input "hell", output "o".

In [89]:
# Layer stack of the char-RNN, applied in order to batches of token-ID windows
char_rnn_layers=[
    # input_dim is the number of distinct token IDs; output_dim is the size of the vector learned for each ID
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    # return_sequences=True: the GRU emits an output at every time step, not only the last one
    tf.keras.layers.GRU(units=128, return_sequences=True),
    tf.keras.layers.Dense(units=200, activation="relu"),
    tf.keras.layers.Dense(units=180, activation="relu"),
    # One softmax probability per token ID at every time step
    tf.keras.layers.Dense(units=n_tokens, activation="softmax"),
]
model=tf.keras.Sequential(char_rnn_layers)

# Targets are integer token IDs, hence the sparse variant of categorical cross-entropy
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Keep only the checkpoint with the best validation accuracy.
# NOTE(review): the leading "/" makes this path absolute (filesystem/drive root) rather than
# relative to the working directory set at the top of the notebook - confirm this is intended.
model_ckpt=tf.keras.callbacks.ModelCheckpoint(
    filepath="/shakespeare/shakesparemodel.keras",
    save_best_only=True,
    monitor="val_accuracy",
)
history=model.fit(
    train_set,
    epochs=3,
    validation_data=valid_set,
    callbacks=[model_ckpt],
)

Epoch 1/3
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2130s[0m 68ms/step - accuracy: 0.5695 - loss: 1.3957 - val_accuracy: 0.5268 - val_loss: 1.7576
Epoch 2/3
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2001s[0m 64ms/step - accuracy: 0.6365 - loss: 1.1442 - val_accuracy: 0.5311 - val_loss: 1.7898
Epoch 3/3
[1m31247/31247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2060s[0m 66ms/step - accuracy: 0.6416 - loss: 1.1249 - val_accuracy: 0.5310 - val_loss: 1.8104


In [99]:
# Score the trained model on the held-out test windows; the result unpacks into
# the loss followed by the accuracy metric the model was compiled with
(test_loss, test_accuracy)=model.evaluate(x=test_set)

[1m2041/2041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 15ms/step - accuracy: 0.5307 - loss: 1.8285


**Generating fake shakespearean text**

We sample the next character randomly, with probability equal to the probability estimated by the model.
In other words, the character appended to the text is drawn from the softmax output rather than always being the single most likely one.
How strongly the draw favours the most likely character is controlled by the temperature: the lower the temperature (close to 0), the more likely the top softmax output is chosen; the higher the temperature, the closer the sampling gets to uniform (more random).

In [None]:
# Toy example: one row of three class probabilities (40%, 50% and 10%)
class_probs=[[0.4,0.5,0.1]]
log_probs=tf.math.log(class_probs)
tf.random.set_seed(42)
# Draw 8 random class indices from the distribution described by the log-probabilities
tf.random.categorical(log_probs, num_samples=8)


In [None]:
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the model's predicted distribution.

    The text is encoded with `text_vec_layer`, the model's softmax output for the last
    time step is rescaled by `temperature`, and one token ID is drawn at random from the
    rescaled distribution. The ID is then looked up in the vectorizer's vocabulary.

    Args:
        text: Non-empty seed string.
        temperature: Positive number. Values close to 0 favour the most likely
            character; higher values make the sampling more random.

    Returns:
        The sampled character as a string.

    Raises:
        ValueError: If `text` is empty or `temperature` is not positive.
    """
    if not text:
        raise ValueError("text must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    # NOTE(review): probabilities are computed directly from `model` and `text_vec_layer`;
    # no `model_prob` helper is defined in the notebook as reviewed.
    token_ids = text_vec_layer([text])
    # Keep only the prediction for the last time step, as a (1, n_tokens) array.
    y_proba = model.predict(token_ids, verbose=0)[0, -1:]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
    # The model is trained on the vectorizer's raw token IDs (no offset is applied when
    # encoding), so the sampled ID indexes the vocabulary directly.
    return text_vec_layer.get_vocabulary()[int(char_id)]


# Backward-compatible alias for the original (misspelled) function name.
netx_char = next_char


def extended_text(text, n_chars=50, temperature=1):
    """Extend `text` by `n_chars` characters sampled one at a time with `next_char`.

    Args:
        text: Seed string to continue.
        n_chars: Number of characters to append (50 by default).
        temperature: Sampling temperature forwarded to `next_char`.

    Returns:
        The seed text followed by the generated characters.
    """
    for _ in range(n_chars):
        # Each new character is predicted from everything generated so far.
        text += next_char(text, temperature)
    return text

    

In [None]:
tf.random.set_seed(42)
# temperature is an argument of extended_text(); passing it to print() raises a TypeError.
# A temperature close to 0 makes the sampling pick the most likely character almost every time.
print(extended_text("To be or not to be", temperature=0.01))