In [2]:
from __future__ import annotations

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from helper import (
    start_time,
    time_since,
    ShakespeareDataset,
    TokenMapping,
    build_model,
    next_token,
    # Character-based helpers
    encode_text,
    # Subword-based helpers
    encode_text_from_tokenizer,
    tokenize_text_from_tokenizer,
)

In [4]:
# Deterministic training
torch.manual_seed(0)

# Attempt GPU; if not, stay on CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [5]:
# Reduced data to make it manageable for smaller systems
DATA_FILE: str = 'data/shakespeare_small.txt'

In [6]:
with open(DATA_FILE, 'r') as data_file:
    raw_text = data_file.read()

print(f'Number of characters in text file: {len(raw_text):,}')

Number of characters in text file: 50,085


In [9]:
# Normalization of text
def normalize_text(text: str) -> str:
    normalized_text = text.lower()
    return normalized_text

In [10]:
normalized_text = normalize_text(raw_text[:500])
print(normalized_text)

first citizen:
before we proceed any further, hear me speak.

all:
speak, speak.

first citizen:
you are all resolved rather to die than to famish?

all:
resolved. resolved.

first citizen:
first, you know caius marcius is chief enemy to the people.

all:
we know't, we know't.

first citizen:
let us kill him, and we'll have corn at our own price.
is't a verdict?

all:
no more talking on't; let it be done: away, away!

second citizen:
one word, good citizens.

first citizen:
we are accounted poor


In [11]:
def pretokenize_text(text: str) -> str | list[str]:
    smaller_pieces = [char for char in text]
    return smaller_pieces

In [12]:
pretokenized_text = pretokenize_text(normalized_text)
print(pretokenized_text)

['f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ',', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'a', 'l', 'l', ':', '\n', 's', 'p', 'e', 'a', 'k', ',', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'y', 'o', 'u', ' ', 'a', 'r', 'e', ' ', 'a', 'l', 'l', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', ' ', 'r', 'a', 't', 'h', 'e', 'r', ' ', 't', 'o', ' ', 'd', 'i', 'e', ' ', 't', 'h', 'a', 'n', ' ', 't', 'o', ' ', 'f', 'a', 'm', 'i', 's', 'h', '?', '\n', '\n', 'a', 'l', 'l', ':', '\n', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'f', 'i', 'r', 's', 't', ',', ' ', '

In [13]:
# Combine normalization and pretokenization
def tokenize_text(text: str) -> str | list[str]:
    normalized_text = normalize_text(text)
    pretokenized_text = pretokenize_text(normalized_text)
    return pretokenized_text

In [14]:
tokenized_text = tokenize_text(raw_text[:500])
print(tokenized_text)

['f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ',', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'a', 'l', 'l', ':', '\n', 's', 'p', 'e', 'a', 'k', ',', ' ', 's', 'p', 'e', 'a', 'k', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'y', 'o', 'u', ' ', 'a', 'r', 'e', ' ', 'a', 'l', 'l', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', ' ', 'r', 'a', 't', 'h', 'e', 'r', ' ', 't', 'o', ' ', 'd', 'i', 'e', ' ', 't', 'h', 'a', 'n', ' ', 't', 'o', ' ', 'f', 'a', 'm', 'i', 's', 'h', '?', '\n', '\n', 'a', 'l', 'l', ':', '\n', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', ' ', 'r', 'e', 's', 'o', 'l', 'v', 'e', 'd', '.', '\n', '\n', 'f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', '\n', 'f', 'i', 'r', 's', 't', ',', ' ', '

In [15]:
# Postprocessing the text
# We will skip since the sentence doesn't have any special tokens we want to consider for the task

In [16]:
# Encode (Tokens to Integer IDs)
encoded_text, character_mapping = encode_text(raw_text, tokenize_text)

In [17]:
n_tokens = character_mapping.n_tokens
dataset_size = len(encoded_text)
print(f'Size of dataset: {dataset_size:,} characters')
print(f'Number of unique tokens: {n_tokens:,}')

Size of dataset: 50,086 characters
Number of unique tokens: 37


In [18]:
# Defining sequence length that will be taken in at a time by our model
sequence_length = 32 # Number of characters
batch_size = 32

train_dataset = ShakespeareDataset(encoded_text, sequence_length)
train_loader = DataLoader(
    train_dataset,
    shuffle=False, # Ensure deterministic training
    batch_size=batch_size,
)

In [19]:
# Define the model class
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [23]:
# Define text generation
def generate_text_by_char(
        input_str: str,
        model,
        token_mapping: TokenMapping = character_mapping,
        num_chars: int = 100,
        temperature: float = 1.0,
        topk: int | None = None,
) -> str:
    # Uses the character based tokenizer
    tokenized_text: list[str] = tokenize_text(input_str)
    # Generates token-by-token and creates a list of those tokens
    generated_tokens = []
    for _ in range(num_chars):
        # Uses the input text and generated text (so far) to get next token
        new_char = next_token(tokenized_text=(tokenized_text + generated_tokens),
                              model=model,
                              token_mapping=token_mapping,
                              temperature=temperature,
                              topk=topk,
                              device=device,
        )
        generated_tokens.append(new_char)
    # Returns input string plus the full generated string (of generated tokens)
    full_text = ''.join(tokenized_text + generated_tokens)
    return full_text

In [24]:
# Train the model
TEST_PHRASE = 'To be, or not to be'
epochs = 5 if device == 'cpu' else 25

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print(f'[{time_since(start)} ({epoch} {epoch / epochs * 100}) {loss:.4f}]')
    print('-'*72)
    gen_output = generate_text_by_char(
        input_str=TEST_PHRASE,
        model=model,
        num_chars=100,
    )
    print(gen_output)

Epoch 1/25, Loss: 2.5301661688679706
[00m 1.8s (0 0.0) 2.1864]
------------------------------------------------------------------------
to be, or not to bee and vevenptury havir
if:
comums mey:
oucidy
 irastr mavif,
met to gow ow, it!citisere-TOKEN_NOT_FOUNDctim thongi
Epoch 2/25, Loss: 2.181257638154319
[00m 3.4s (1 4.0) 1.9870]
------------------------------------------------------------------------
to be, or not to bem fiese have an, ato shind thai daraster harfe uss peiind aftorfer;
hins, inius:
ine deart haet of o
Epoch 3/25, Loss: 2.0785382084191415
[00m 4.8s (2 8.0) 1.8849]
------------------------------------------------------------------------
to be, or not to bevis sinsnores
in coak foar.
there cont rye sel cot cim andp, gry llott hal and aliendry no; pirse mo
Epoch 4/25, Loss: 2.019214431774883
[00m 6.3s (3 12.0) 1.8198]
------------------------------------------------------------------------
to be, or not to be thies and the bericenyly he 'ut nopher iaret; do ltim uncius:ia

In [None]:
# Generate text using the trained model
print('Generated text:')
op = generate_text_by_char(
    input_str=TEST_PHRASE,
    model=model,
    num_chars=100,
    temperature=1.0,
    topk=None,
)
print(op)

In [7]:
model_name = 'bert-base-uncased'

my_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

In [21]:
encoded_text, token_mapping = encode_text_from_tokenizer(
    text=raw_text,
    tokenizer=my_tokenizer,
)

In [22]:
n_tokens = token_mapping.n_tokens
dataset_size = len(encoded_text)
print(f'Size of dataset: {dataset_size:,} tokens')

Size of dataset: 13,139 tokens


In [23]:
# Defining sequence length that will be taken in at a time by our model
sequence_length = 32 # Number of tokens
batch_size = 32

train_dataset = ShakespeareDataset(encoded_text, sequence_length)
train_loader = DataLoader(
    train_dataset,
    shuffle=False, # Ensure deterministic training
    batch_size=batch_size,
)

In [24]:
# Defining the model to be trained and generate text with
model = build_model(n_tokens)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [25]:
def generate_text_by_subword(
    input_str: str,
    model,
    token_mapping: TokenMapping = token_mapping,
    tokenizer = my_tokenizer,
    num_tokens: int = 100,
    temperature: float = 1.0,
    topk: int | None = None,
) -> str:
    # Use your chosen subword-tokenizer
    tokenized_text = tokenize_text_from_tokenizer(
        tokenizer=tokenizer,
        text=input_str,
    )
    # Generates token-by-token and creates a list of those tokens
    generated_tokens = []
    for _ in range(num_tokens):
        # Uses the input text and generated text (so far) to get next token
        new_token = next_token(
            tokenized_text=(tokenized_text + generated_tokens),
            model=model,
            token_mapping=token_mapping,
            # Temperature & top-k sampling used in determining the next token
            temperature=temperature,
            topk=topk,
            device=device,
        )
        generated_tokens.append(new_token)
    # List of all token IDs (input text and generated text)
    output_ids = tokenizer.convert_tokens_to_ids(
        tokenized_text + generated_tokens
    )
    # Returns input string plus the full generated string from list of token IDs
    full_text = tokenizer.decode(output_ids)
    return full_text

In [26]:
TEST_PHRASE = 'To be or not to be'
# Use more epochs if not CPU device
epochs = 5 if device == 'cpu' else 25

start = start_time()
for epoch in range(epochs):
    # Set model into "training mode"
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch.to(device))
        loss = criterion(output.transpose(1, 2), y_batch.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}')
    print(f'[{time_since(start)} ({epoch} {epoch / epochs * 100}) {loss:.4f}]')
    print('-'*72)
    output = generate_text_by_subword(
        input_str=TEST_PHRASE,
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=30,
        temperature=1.0,
    )
    print(output)

Epoch 1/25, Loss: 6.524220229358208
[00m 0.7s (0 0.0) 5.5130]
------------------------------------------------------------------------
to be or not to be themselves, underli, the in!h lead best?. pound : he the five freedom marc :. four pray him body, so not v
Epoch 2/25, Loss: 5.837869755814715
[00m 1.5s (1 4.0) 4.9805]
------------------------------------------------------------------------
to be or not to bemity yet though, even, my in o to great forrd? menus no promise take s part so - turnus where ' guess - to
Epoch 3/25, Loss: 5.534513481651865
[00m 2.1s (2 8.0) 4.5549]
------------------------------------------------------------------------
to be or not to be - know and ; sin, old bold daughter good ye to liketino now my change ' it meural neither south. br years nor some sicidi
Epoch 4/25, Loss: 5.283198249630812
[00m 2.6s (3 12.0) 4.3330]
------------------------------------------------------------------------
to be or not to be armies know br menenius wallsius ratesus he wors

In [31]:
output = generate_text_by_subword(
        input_str='To be or not to be',
        model=model,
        token_mapping=token_mapping,
        tokenizer=my_tokenizer,
        num_tokens=30,
        temperature=0.1,
        topk=100,
    )
print(output)

to be or not to be them of the people, and the gods doom him. sicinius : i ' ll beat, i am glad, and the gods crown. sic
