In [10]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import tensorflow as tf
import math
import optuna
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from datasets import Dataset

#### Read data

In [2]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
with open(path_to_file, 'r') as file:
    text = file.read()

In [11]:
chars = sorted(set(text))
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

# encode the text
encoded_text = np.array([char_to_idx[ch] for ch in text])

# Define parameters
seq_length = 100
batch_size = 64

def get_batches(encoded_text, batch_size, seq_length):
    total_length = len(encoded_text)
    num_batches = total_length // (batch_size * seq_length)

    encoded_text = encoded_text[:num_batches * batch_size * seq_length]
    encoded_text = encoded_text.reshape((batch_size, -1))

    for i in range(0, encoded_text.shape[1], seq_length):
        x = encoded_text[:, i:i+seq_length]
        y = np.zeros_like(x)
        if i+seq_length < encoded_text.shape[1]:
            y[:, :-1], y[:, -1] = x[:, 1:], encoded_text[:, i+seq_length]
        else:
            y[:, :-1], y[:, -1] = x[:, 1:], encoded_text[:, 0]
        yield torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

In [12]:
class CharRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers, dropout=0.5):
        super(CharRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.rnn(x, hidden)
        out = out.reshape(out.size(0)*out.size(1), self.hidden_size)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                weight.new(self.num_layers, batch_size, self.hidden_size).zero_())


In [5]:
# hyperparameters
vocab_size = len(chars)
hidden_size = 256
num_layers = 2
num_epochs = 20
learning_rate = 0.002

# GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model, optimizer, and loss function
model = CharRNN(vocab_size, hidden_size, num_layers).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# training loop
for epoch in range(num_epochs):
    hidden = model.init_hidden(batch_size)
    hidden = tuple([each.data.to(device) for each in hidden])

    for i, (x, y) in enumerate(get_batches(encoded_text, batch_size, seq_length)):
        x, y = x.to(device), y.to(device)
        hidden = tuple([each.data for each in hidden])
        model.zero_grad()
        output, hidden = model(x, hidden)
        loss = criterion(output, y.view(-1))
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f'Epoch: {epoch+1}/{num_epochs}, Step: {i}, Loss: {loss.item()}')


Epoch: 1/20, Step: 0, Loss: 4.169116497039795
Epoch: 1/20, Step: 100, Loss: 2.100273370742798
Epoch: 2/20, Step: 0, Loss: 1.9322454929351807
Epoch: 2/20, Step: 100, Loss: 1.7591819763183594
Epoch: 3/20, Step: 0, Loss: 1.7280583381652832
Epoch: 3/20, Step: 100, Loss: 1.6501561403274536
Epoch: 4/20, Step: 0, Loss: 1.6433299779891968
Epoch: 4/20, Step: 100, Loss: 1.5912357568740845
Epoch: 5/20, Step: 0, Loss: 1.5942225456237793
Epoch: 5/20, Step: 100, Loss: 1.5429699420928955
Epoch: 6/20, Step: 0, Loss: 1.5584497451782227
Epoch: 6/20, Step: 100, Loss: 1.5137763023376465
Epoch: 7/20, Step: 0, Loss: 1.5180014371871948
Epoch: 7/20, Step: 100, Loss: 1.491997241973877
Epoch: 8/20, Step: 0, Loss: 1.505213975906372
Epoch: 8/20, Step: 100, Loss: 1.4864345788955688
Epoch: 9/20, Step: 0, Loss: 1.5005873441696167
Epoch: 9/20, Step: 100, Loss: 1.4643430709838867
Epoch: 10/20, Step: 0, Loss: 1.4700899124145508
Epoch: 10/20, Step: 100, Loss: 1.4506616592407227
Epoch: 11/20, Step: 0, Loss: 1.44532334804

In [6]:
def generate_text(model, start_str, length, hidden_size, num_layers):
    model.eval()
    chars = [char_to_idx[ch] for ch in start_str]
    input = torch.tensor(chars, dtype=torch.long).unsqueeze(0).to(device)
    hidden = model.init_hidden(1)
    hidden = tuple([each.data.to(device) for each in hidden])

    generated_text = start_str
    for i in range(length):
        output, hidden = model(input, hidden)
        p = torch.nn.functional.softmax(output, dim=1).data
        top_ch = torch.multinomial(p, 1)[-1]
        char = idx_to_char[top_ch.item()]
        generated_text += char
        input = torch.tensor([top_ch], dtype=torch.long).unsqueeze(0).to(device)

    return generated_text

In [7]:
print(generate_text(model, start_str="To be, or not to be", length=100, hidden_size=hidden_size, num_layers=num_layers))

To be, or not to be.
Romeo, to grant the king knows not to my time.
Sirrat that the sun a lord that making, last
So Cap


In [8]:
def calculate_perplexity(model, data, seq_length, batch_size):
    model.eval()
    total_loss = 0
    criterion = nn.CrossEntropyLoss()

    hidden = model.init_hidden(batch_size)
    hidden = tuple([each.data.to(device) for each in hidden])

    with torch.no_grad():
        for x, y in get_batches(data, batch_size, seq_length):
            x, y = x.to(device), y.to(device)
            output, hidden = model(x, hidden)
            loss = criterion(output, y.view(-1))
            total_loss += loss.item()

    avg_loss = total_loss / (len(data) / (batch_size * seq_length))
    perplexity = math.exp(avg_loss)
    return perplexity

In [9]:
# split the dataset into training and validation sets
split_index = int(len(encoded_text) * 0.9)
train_data = encoded_text[:split_index]
val_data = encoded_text[split_index:]

# train and evaluate the model
for epoch in range(num_epochs):
    model.train()
    hidden = model.init_hidden(batch_size)
    hidden = tuple([each.data.to(device) for each in hidden])

    for i, (x, y) in enumerate(get_batches(train_data, batch_size, seq_length)):
        x, y = x.to(device), y.to(device)
        hidden = tuple([each.data for each in hidden])
        model.zero_grad()
        output, hidden = model(x, hidden)
        loss = criterion(output, y.view(-1))
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f'Epoch: {epoch+1}/{num_epochs}, Step: {i}, Loss: {loss.item()}')

    # evaluate on validation set
    val_perplexity = calculate_perplexity(model, val_data, seq_length, batch_size)
    print(f'Epoch: {epoch+1}, Validation Perplexity: {val_perplexity}')
    print(generate_text(model, start_str="To be, or not to be", length=100, hidden_size=hidden_size, num_layers=num_layers))

Epoch: 1/20, Step: 0, Loss: 1.3977503776550293
Epoch: 1/20, Step: 100, Loss: 1.2753419876098633
Epoch: 1, Validation Perplexity: 3.5572266386786717
To be, or not to be,
When my masters doth show the such any coover thee:
Here is feel, when we were despair:
He was alo
Epoch: 2/20, Step: 0, Loss: 1.377517580986023
Epoch: 2/20, Step: 100, Loss: 1.263879418373108
Epoch: 2, Validation Perplexity: 3.582075532662698
To be, or not to be
rise more seven frowned in thyself but bring.
O bloody daughter, his severe, my lord;
And yet I may
Epoch: 3/20, Step: 0, Loss: 1.367159128189087
Epoch: 3/20, Step: 100, Loss: 1.2707512378692627
Epoch: 3, Validation Perplexity: 3.6066422324131526
To be, or not to be them.

DUKE VINCENTIO:
I women may not made the pleasant--

SICINIUS:
Yes, Catesby?

EXETER:
O, now
Epoch: 4/20, Step: 0, Loss: 1.3637961149215698
Epoch: 4/20, Step: 100, Loss: 1.2625248432159424
Epoch: 4, Validation Perplexity: 3.6301358618108712
To be, or not to be
disposition; if which illy upon 

#### Printing probabilities and preplexities (side-by-side)

In [16]:
def calculate_perplexity(model, data, seq_length, batch_size):
    model.eval()
    total_loss = 0
    criterion = nn.CrossEntropyLoss()

    hidden = model.init_hidden(batch_size)
    hidden = tuple([each.data.to(device) for each in hidden])

    with torch.no_grad():
        for x, y in get_batches(data, batch_size, seq_length):
            x, y = x.to(device), y.to(device)
            output, hidden = model(x, hidden)
            loss = criterion(output, y.view(-1))
            total_loss += loss.item()

    avg_loss = total_loss / (len(data) / (batch_size * seq_length))
    perplexity = math.exp(avg_loss)
    return perplexity

def print_text_with_probabilities_and_perplexity(model, start_str, length, hidden_size, num_layers):
    model.eval()
    chars = [char_to_idx[ch] for ch in start_str]
    input = torch.tensor(chars, dtype=torch.long).unsqueeze(0).to(device)
    hidden = model.init_hidden(1)
    hidden = tuple([each.data.to(device) for each in hidden])

    generated_text = start_str
    total_log_prob = 0
    for i in range(length):
        output, hidden = model(input, hidden)
        p = torch.nn.functional.softmax(output, dim=1).data
        top_ch = torch.multinomial(p, 1)[-1]
        char = idx_to_char[top_ch.item()]
        char_probability = p[0][top_ch].item()
        total_log_prob += math.log(char_probability)
        generated_text += char
        print(f"Character: {char}, Probability: {char_probability:.4f}")
        input = torch.tensor([top_ch], dtype=torch.long).unsqueeze(0).to(device)

    avg_log_prob = total_log_prob / length
    perplexity = math.exp(-avg_log_prob)
    print(f"Generated Text Perplexity: {perplexity:.4f}")
    return generated_text


In [17]:
print_text_with_probabilities_and_perplexity(model, start_str="To be, or not to be", 
                                             length=100, hidden_size=hidden_size, num_layers=num_layers)

Character: ., Probability: 0.0007
Character: 
, Probability: 0.9652
Character: 
, Probability: 0.6088
Character: H, Probability: 0.0428
Character: A, Probability: 0.2716
Character: S, Probability: 0.9971
Character: T, Probability: 0.9999
Character: I, Probability: 1.0000
Character: N, Probability: 1.0000
Character: G, Probability: 0.9994
Character: S, Probability: 1.0000
Character: :, Probability: 0.9999
Character: 
, Probability: 0.9999
Character: I, Probability: 0.1647
Character:  , Probability: 0.5984
Character: w, Probability: 0.1994
Character: i, Probability: 0.5728
Character: l, Probability: 0.9460
Character: l, Probability: 0.9993
Character:  , Probability: 0.9091
Character: a, Probability: 0.0262
Character: s, Probability: 0.0926
Character: s, Probability: 0.2914
Character: a, Probability: 0.1474
Character: m, Probability: 0.0328
Character: e, Probability: 0.9970
Character:  , Probability: 0.9201
Character: t, Probability: 0.3112
Character: o, Probability: 0.5220
Character:  , 

"To be, or not to be.\n\nHASTINGS:\nI will assame to the exile thou should be most\nhair for his death, as we remain to't.\n\n"

### Experiment 1: using GPT-2 model from the transformers library

In [20]:
# load pre-trained model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

model.eval()

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [21]:
# generate text using GPT-2
def generate_text_gpt2(model, tokenizer, start_str, length):
    # Encode the input text
    input_ids = tokenizer.encode(start_str, return_tensors='pt')

    # Generate text
    with torch.no_grad():
        output = model.generate(input_ids, max_length=length, temperature=0.7, top_k=50, top_p=0.9, num_return_sequences=1)
    
    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return generated_text

In [22]:
# generate text with GPT-2
start_str = "To be, or not to be"
generated_text = generate_text_gpt2(model, tokenizer, start_str, length=200)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


To be, or not to be, a member of the Church of Jesus Christ of Latter-day Saints, I am not a member of the Church of Jesus Christ of Latter-day Saints. I am not a member of the Church of Jesus Christ of Latter-day Saints. I am not a member of the Church of Jesus Christ of Latter-day Saints. I am not a member of the Church of Jesus Christ of Latter-day Saints. I am not a member of the Church of Jesus Christ of Latter-day Saints. I am not a member of the Church of Jesus Christ of Latter-day Saints. I am not a member of the Church of Jesus Christ of Latter-day Saints. I am not a member of the Church of Jesus Christ of Latter-day Saints. I am not a member of the Church of Jesus Christ of Latter-day Saints. I am not a member of the Church of Jesus Christ of Latter-day Saints. I am not a member of the Church


In [23]:
# calculate perplexity for the generated text
def calculate_perplexity_gpt2(model, tokenizer, text):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        perplexity = torch.exp(loss).item()
    return perplexity

In [24]:
example_text = "To be, or not to be, that is the question."
perplexity = calculate_perplexity_gpt2(model, tokenizer, example_text)
print(f"Perplexity: {perplexity:.4f}")

Perplexity: 18.4079


#### Experiment 2 : Hyper-parameter tuning 

In [28]:
chars = sorted(set(text))
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

encoded_text = np.array([char_to_idx[ch] for ch in text])

seq_length = 100
batch_size = 64

def get_batches(encoded_text, batch_size, seq_length):
    total_length = len(encoded_text)
    num_batches = total_length // (batch_size * seq_length)

    encoded_text = encoded_text[:num_batches * batch_size * seq_length]
    encoded_text = encoded_text.reshape((batch_size, -1))

    for i in range(0, encoded_text.shape[1], seq_length):
        x = encoded_text[:, i:i+seq_length]
        y = np.zeros_like(x)
        if i+seq_length < encoded_text.shape[1]:
            y[:, :-1], y[:, -1] = x[:, 1:], encoded_text[:, i+seq_length]
        else:
            y[:, :-1], y[:, -1] = x[:, 1:], encoded_text[:, 0]
        yield torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

class CharRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers, dropout=0.5):
        super(CharRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.rnn(x, hidden)
        out = out.reshape(out.size(0)*out.size(1), self.hidden_size)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                weight.new(self.num_layers, batch_size, self.hidden_size).zero_())

def calculate_perplexity(model, data, seq_length, batch_size, device):
    model.eval()
    total_loss = 0
    criterion = nn.CrossEntropyLoss()

    hidden = model.init_hidden(batch_size)
    hidden = tuple([each.data.to(device) for each in hidden])

    with torch.no_grad():
        for x, y in get_batches(data, batch_size, seq_length):
            x, y = x.to(device), y.to(device)
            output, hidden = model(x, hidden)
            loss = criterion(output, y.view(-1))
            total_loss += loss.item()

    avg_loss = total_loss / (len(data) / (batch_size * seq_length))
    perplexity = math.exp(avg_loss)
    return perplexity

In [29]:
# hyperparameter optimization function
def objective(trial):
    hidden_size = trial.suggest_int('hidden_size', 128, 512)
    num_layers = trial.suggest_int('num_layers', 1, 4)
    dropout = trial.suggest_float('dropout', 0.2, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # model, optimizer, and loss function
    model = CharRNN(len(chars), hidden_size, num_layers, dropout).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    split_index = int(len(encoded_text) * 0.9)
    train_data = encoded_text[:split_index]
    val_data = encoded_text[split_index:]

    num_epochs = 3 
    batch_size = 32  

    # training loop
    for epoch in range(num_epochs):
        model.train()
        hidden = model.init_hidden(batch_size)
        hidden = tuple([each.data.to(device) for each in hidden])

        for i, (x, y) in enumerate(get_batches(train_data, batch_size, seq_length)):
            x, y = x.to(device), y.to(device)
            hidden = tuple([each.data for each in hidden])
            model.zero_grad()
            output, hidden = model(x, hidden)
            loss = criterion(output, y.view(-1))
            loss.backward()
            optimizer.step()

    val_perplexity = calculate_perplexity(model, val_data, seq_length, batch_size, device)
    return val_perplexity

split_index = int(len(encoded_text) * 0.9)
train_data = encoded_text[:split_index]
val_data = encoded_text[split_index:]


In [30]:
# optimize hyperparameters using Optuna
try:
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=10) 

    # best hyperparameters
    print(f'Best hyperparameters: {study.best_params}')
except Exception as e:
    print(f"An error occurred during the hyperparameter optimization: {e}")

[I 2024-05-24 20:37:57,004] A new study created in memory with name: no-name-b6f13f6d-400c-4ce9-ae7f-a79593e70e9b
[I 2024-05-24 20:42:18,783] Trial 0 finished with value: 5.686259741489859 and parameters: {'hidden_size': 301, 'num_layers': 3, 'dropout': 0.4130703337189342, 'learning_rate': 0.0007312102034508422}. Best is trial 0 with value: 5.686259741489859.
[I 2024-05-24 20:43:46,536] Trial 1 finished with value: 4.777773780099476 and parameters: {'hidden_size': 289, 'num_layers': 1, 'dropout': 0.32888205280417027, 'learning_rate': 0.006239549852964147}. Best is trial 1 with value: 4.777773780099476.
[I 2024-05-24 20:45:57,327] Trial 2 finished with value: 4.615608849778578 and parameters: {'hidden_size': 443, 'num_layers': 1, 'dropout': 0.41065148460888434, 'learning_rate': 0.001490822769789604}. Best is trial 2 with value: 4.615608849778578.
[I 2024-05-24 20:47:50,449] Trial 3 finished with value: 5.785774543500441 and parameters: {'hidden_size': 172, 'num_layers': 3, 'dropout': 0.

Best hyperparameters: {'hidden_size': 443, 'num_layers': 1, 'dropout': 0.41065148460888434, 'learning_rate': 0.001490822769789604}


In [31]:
# train final model with best hyperparameters
best_hidden_size = study.best_params['hidden_size']
best_num_layers = study.best_params['num_layers']
best_dropout = study.best_params['dropout']
best_learning_rate = study.best_params['learning_rate']

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CharRNN(len(chars), best_hidden_size, best_num_layers, best_dropout).to(device)
optimizer = optim.Adam(model.parameters(), lr=best_learning_rate)
criterion = nn.CrossEntropyLoss()

In [32]:
# training loop for final model
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    hidden = model.init_hidden(batch_size)
    hidden = tuple([each.data.to(device) for each in hidden])

    for i, (x, y) in enumerate(get_batches(train_data, batch_size, seq_length)):
        x, y = x.to(device), y.to(device)
        hidden = tuple([each.data for each in hidden])
        model.zero_grad()
        output, hidden = model(x, hidden)
        loss = criterion(output, y.view(-1))
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f'Epoch: {epoch+1}/{num_epochs}, Step: {i}, Loss: {loss.item()}')

    # evaluate on validation set
    val_perplexity = calculate_perplexity(model, val_data, seq_length, batch_size, device)
    print(f'Epoch: {epoch+1}, Validation Perplexity: {val_perplexity}')



Epoch: 1/20, Step: 0, Loss: 4.1894917488098145
Epoch: 1/20, Step: 100, Loss: 1.8075377941131592
Epoch: 1, Validation Perplexity: 5.813567866657465
Epoch: 2/20, Step: 0, Loss: 1.683000922203064
Epoch: 2/20, Step: 100, Loss: 1.514767050743103
Epoch: 2, Validation Perplexity: 5.057815673916474
Epoch: 3/20, Step: 0, Loss: 1.5278393030166626
Epoch: 3/20, Step: 100, Loss: 1.4161667823791504
Epoch: 3, Validation Perplexity: 4.754670684331333
Epoch: 4/20, Step: 0, Loss: 1.459116816520691
Epoch: 4/20, Step: 100, Loss: 1.3566111326217651
Epoch: 4, Validation Perplexity: 4.600151532055493
Epoch: 5/20, Step: 0, Loss: 1.417265772819519
Epoch: 5/20, Step: 100, Loss: 1.315069556236267
Epoch: 5, Validation Perplexity: 4.503432373073049
Epoch: 6/20, Step: 0, Loss: 1.3874467611312866
Epoch: 6/20, Step: 100, Loss: 1.286935567855835
Epoch: 6, Validation Perplexity: 4.446841592578943
Epoch: 7/20, Step: 0, Loss: 1.3609236478805542
Epoch: 7/20, Step: 100, Loss: 1.2627443075180054
Epoch: 7, Validation Perplex

In [33]:
def print_text_with_probabilities_and_perplexity(model, start_str, length, hidden_size, num_layers, device):
    model.eval()
    chars = [char_to_idx[ch] for ch in start_str]
    input = torch.tensor(chars, dtype=torch.long).unsqueeze(0).to(device)
    hidden = model.init_hidden(1)
    hidden = tuple([each.data.to(device) for each in hidden])

    generated_text = start_str
    total_log_prob = 0
    for i in range(length):
        output, hidden = model(input, hidden)
        p = torch.nn.functional.softmax(output, dim=1).data
        top_ch = torch.multinomial(p, 1)[-1]
        char = idx_to_char[top_ch.item()]
        char_probability = p[0][top_ch].item()
        total_log_prob += math.log(char_probability)
        generated_text += char
        print(f"Character: {char}, Probability: {char_probability:.4f}")
        input = torch.tensor([top_ch], dtype=torch.long).unsqueeze(0).to(device)

    avg_log_prob = total_log_prob / length
    perplexity = math.exp(-avg_log_prob)
    print(f"Generated Text Perplexity: {perplexity:.4f}")
    return generated_text

In [34]:
print_text_with_probabilities_and_perplexity(model, start_str="To be, or not to be", 
                                             length=100, hidden_size=best_hidden_size, 
                                             num_layers=best_num_layers, device=device)

Character:  , Probability: 0.0002
Character: m, Probability: 0.0890
Character: u, Probability: 0.0972
Character: r, Probability: 0.1869
Character: d, Probability: 0.9897
Character: e, Probability: 0.9903
Character: r, Probability: 0.9903
Character: e, Probability: 0.5230
Character: d, Probability: 0.5427
Character: ,, Probability: 0.1741
Character: 
, Probability: 0.8662
Character: Y, Probability: 0.0363
Character: o, Probability: 0.8318
Character: u, Probability: 0.9888
Character: n, Probability: 0.0208
Character: g, Probability: 0.9904
Character: s, Probability: 0.0334
Character:  , Probability: 0.7315
Character: a, Probability: 0.3643
Character: n, Probability: 0.7184
Character: d, Probability: 0.9921
Character:  , Probability: 0.9980
Character: s, Probability: 0.0683
Character: i, Probability: 0.1281
Character: t, Probability: 0.0368
Character: t, Probability: 0.2653
Character: i, Probability: 0.2837
Character: n, Probability: 0.9974
Character: g, Probability: 0.9665
Character:  , 

'To be, or not to be murdered,\nYoungs and sitting fair-lustors will shame since,\nAnd hear he distressed with trouble.\n\nC'