# Shakespeare task
### Preparations

In [16]:
# Import required libraries
import numpy as np
import requests

import torch
import torch.nn as nn
import torch.optim as optim

In [17]:
# Import data
# URL of the Shakespeare dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# Fetching the text data from the URL
response = requests.get(url)
text = response.text

# Printing the length of the fetched text
print("Length of text: ", len(text))

Length of text:  1115394


In [18]:
# Create shorter text for development
short = text[:10000]
print("Length of short text: ", len(short))

Length of short text:  10000


### Character mappings and sequences

In [19]:
# Create character mapping
chars = sorted(list(set(text)))
# chars = sorted(list(set(short)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [20]:
# Create sequences
# Initialise constants and lists
maxlen = 60
step = 7
sentences = []
next_chars = []

# Iterate
for i in range(0, len(short)-maxlen, step):
    sentences.append(short[i : i + maxlen])
    next_chars.append(short[i + maxlen])

### Vectorize the input data

In [21]:
# Vectorize the input data
# Samples as one-hot
X = torch.zeros((len(sentences), maxlen, len(chars)), dtype=torch.float32)
# Targets as class indices (not one-hot)
y = torch.zeros(len(sentences), dtype=torch.long)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i] = char_indices[next_chars[i]]

### Create the RNN model
Use .Dropout() method to randomly set 20% of neurons to zero with each forward pass. Apply .Dropout() to the input and hidden layers. The use of dropouts reduces the likelihood of overfitting.

Reference: https://machinelearningmastery.com/using-dropout-regularization-in-pytorch-models/

In [22]:
# Build the model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.dropout1 = nn.Dropout(0.2)
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.dropout2 = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, output_size)

    # The forward pass returns the hidden state for preservation in the 
    # generate function
    def forward(self, x, h=None):
        # apply dropout to input
        x = self.dropout1(x)
        out, h = self.rnn(x, h)
        # apply dropout to the last time-step's features before the classifier
        out = self.dropout2(out[:, -1, :])
        out = self.fc(out)
        return out, h

In [23]:
# Initialise the model
input_size = len(chars)
hidden_size = 128
output_size = len(chars)

model = RNNModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
num_epochs = 10

### Training phase

In [24]:
# Set hyperparameters
batch_size = 64

In [25]:
def train_net(model, num_epochs, batch_size, criterion, X, y):
    '''
    Train the specified neural network on the provided dataset using 
    mini-batches.
    Parameters:
        model = PyTorch nn.Module to be trained
        num_epochs = number of full passes over the dataset (int)
        batch_size = size of each mini-batch (int)
        criterion = loss function (callable, e.g., nn.CrossEntropyLoss())
        X = input tensor of shape [n_samples, seq_len, n_features]
        y = target tensor of shape [n_samples] containing class indices
    Outputs:
        model = the trained PyTorch model (nn.Module) returned after training.
    '''
    # Truncate X and y to give complete batches
    mod_X = len(X) % batch_size
    if mod_X != 0:
        X_trunc = X[: len(X) - mod_X]
        y_trunc = y[: len(y) - mod_X]
    else:
        X_trunc = X
        y_trunc = y

    # Iterate over the epochs
    for epoch in range(num_epochs):
        model.train()
        for i in range(0, len(X_trunc), batch_size):
            X_batch = X_trunc[i: i + batch_size]
            y_batch = y_trunc[i: i + batch_size]  # Vector of class indices

            # Forward pass
            outputs, _ = model(X_batch)  # shape: [batch_size, n_classes]

            # Compute loss
            loss = criterion(outputs, y_batch)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")
    return model

In [26]:
trained_model = train_net(model, num_epochs, batch_size, criterion, X, y)

Epoch 1/10, Loss: 3.4113
Epoch 2/10, Loss: 3.2572
Epoch 3/10, Loss: 3.0990
Epoch 4/10, Loss: 3.2728
Epoch 5/10, Loss: 2.8565
Epoch 6/10, Loss: 2.8707
Epoch 7/10, Loss: 2.7932
Epoch 8/10, Loss: 2.5360
Epoch 9/10, Loss: 2.8542
Epoch 10/10, Loss: 2.9526


### Generation phase

In my generate function below, I found that argmax (ie deterministic) prediction of the next_id lead to the model "sticking" on a single character. Example output: "I tell yotBBBBBBBBBBBBBBBBBBBBB".
The only solution I found was to switch from argmax and use a probabilistic approach combining softmax and a temperature hyperparameter.

The temperature hyperparameter scales the model logits before applying the softmax. If temperature T < 1 the distribution becomes sharper, the model is more deterministic, and the result approaches argmax. If T = 1, there is no scaling before the softmax. If temperature T > 1 the distribution is flattened and the output becomes less deterministic and more random. Rule of thumb: T = 0.5 is a conservative model, 0.8-1 creates a balanced model and 1.2-1.5 is a more creative model.

References: https://karpathy.github.io/2015/05/21/rnn-effectiveness/ ,   https://medium.com/@harshit158/softmax-temperature-5492e4007f71 ,   https://stackoverflow.com/questions/44365593/change-temperature-in-rnn-to-generate-text

##### Generate 100 characters

In [31]:
# Generate text (character-level)
def generate(
    model, start_text, length=100, temperature=1.0, deterministic=False
):
    '''
    Generate text with an RNN while preserving hidden state between steps.
    Parameters:
        model = PyTorch RNN model that returns (logits, hidden) from 
            (input, hidden)
        start_text = seed string used to prime the model
        length = number of characters to generate (int, default=100)
        temperature = float > 0; scales logits before softmax. Lower -> more 
            deterministic, higher -> more random
        deterministic = bool; if True use argmax, otherwise sample from the 
            softmax distribution
    Outputs:
        sample_text = generated string (seed + generated characters)
    '''
    model.eval()
    # initial input: one-hot encoding of the seed
    input = torch.zeros((1, len(start_text), len(chars)), dtype=torch.float32)
    for t, char in enumerate(start_text):
        input[0, t, char_indices[char]] = 1

    generated_text = [char_indices[char] for char in start_text]
    h = None

    # first step to get initial hidden layer state
    with torch.no_grad():
        outputs, h = model(input)  # outputs shape [1, n_classes]

        # loop for remaining characters
        for _ in range(length):
            if deterministic:
                next_id = torch.argmax(outputs[0]).item()
            else:
                probs = torch.softmax(
                    outputs[0] / temperature, dim=0
                ).cpu().numpy()
                next_id = np.random.choice(len(chars), p=probs)

            generated_text.append(next_id)

            # prepare next single-step input and continue with same hidden 
            # layer state
            input = torch.zeros((1, 1, len(chars)), dtype=torch.float32)
            input[0, 0, next_id] = 1
            outputs, h = model(input, h)

    sample_text = ''.join([indices_char[i] for i in generated_text])
    return sample_text

In [28]:
# Use the model to generate 100 characters of text
task_text = generate(
    trained_model, "I tell yo", 
    length=100, temperature=0.8, deterministic=False
)
print(task_text)

I tell yoine inots nn! coot Uise wheome ie meol cou  honeere thar tole py art  he bed serey worer yomeeeas, y


##### Generate 100 words
I was quite happy with my 100 characters, but on re-reading the task I realised 100 words was asked for. Rewrite the generate function for 100 words.

In [32]:
# Generate text (word-level)
def generate_words(
    model, start_text, num_words=100, temperature=1.0, deterministic=False
):
    '''
    Generate text with an RNN until a target number of words is reached.
    Parameters:
        model = PyTorch RNN model that returns (logits, hidden) from 
        (input, hidden)
        start_text = seed string used to prime the model
        num_words = number of words to generate (int, default=100). The final 
            output will contain at least this many words (including words in 
            start_text).
        temperature = float > 0; scales logits before softmax. Lower -> more 
            deterministic, higher -> more random
        deterministic = bool; if True use argmax, otherwise sample from the 
            softmax distribution
    Outputs:
        sample_text = generated string (seed + generated characters) 
            containing >= num_words words
    '''
    model.eval()
    # initial input: one-hot encoding of the seed
    input = torch.zeros((1, len(start_text), len(chars)), dtype=torch.float32)
    for t, char in enumerate(start_text):
        input[0, t, char_indices[char]] = 1

    generated_text = [char_indices[char] for char in start_text]
    h = None

    # first step to get initial hidden layer state
    with torch.no_grad():
        outputs, h = model(input)  # outputs shape [1, n_classes]

        # count words already in the seed
        sample_so_far = ''.join([indices_char[i] for i in generated_text])
        current_words = len([w for w in sample_so_far.split() if len(w) > 0])

        safety_limit = max(2000, num_words * 16)  # prevent infinite loops
        steps = 0

        while current_words < num_words and steps < safety_limit:
            if deterministic:
                next_id = torch.argmax(outputs[0]).item()
            else:
                probs = torch.softmax(
                    outputs[0] / temperature, dim=0
                ).cpu().numpy()
                next_id = np.random.choice(len(chars), p=probs)

            generated_text.append(next_id)

            # prepare next single-step input and continue with same hidden 
            # layer state
            input = torch.zeros((1, 1, len(chars)), dtype=torch.float32)
            input[0, 0, next_id] = 1
            outputs, h = model(input, h)

            steps += 1
            sample_so_far = ''.join([indices_char[i] for i in generated_text])
            current_words = len(
                [w for w in sample_so_far.split() if len(w) > 0]
            )

    sample_text = ''.join([indices_char[i] for i in generated_text])
    return sample_text

In [30]:
# Use the model to generate 100 words
task_text = generate_words(
    trained_model, "I tell yo", 
    num_words=100, temperature=0.8, deterministic=False
)
print(task_text)

I tell yoR'neeRoer rtse go d, stoe weow Ce? are w orere tett ise kenar s ol  we teat ain tis of t ee woos wherene pbore were se ted
Theeee! fa tar eine so e teror are y mar  a tor ir  tole s ee tee lentTlesleileoiney,
Wheyo nest ndeey the soe inemeron wheteobalhe ioire e fest toot we weite ist ss eot ohe we we,estrenol owos ale wene the norstoe teee he yee a pst oidepenoo fos ohere toocon he yeeeessakellert,
Thet? 
Mired the nono tecthe paos lictoore the goeoedst imale, what wine  he gor the sise,
Y


This RNN has only been trained for 10 epochs as an exercise. Further training is required to achieve an output closer to recognisable language.