In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the full training corpus into memory as a single UTF-8 decoded string.
text = Path('../../tiny-shakespeare.txt').read_text(encoding='utf-8')

In [30]:
# Preview the first 1000 characters of the corpus.
print(text[:1000])

    THE SONNETS
    ALL’S WELL THAT ENDS WELL
    THE TRAGEDY OF ANTONY AND CLEOPATRA
    AS YOU LIKE IT
    THE COMEDY OF ERRORS
    THE TRAGEDY OF CORIOLANUS
    CYMBELINE
    THE TRAGEDY OF HAMLET, PRINCE OF DENMARK
    THE FIRST PART OF KING HENRY THE FOURTH
    THE SECOND PART OF KING HENRY THE FOURTH
    THE LIFE OF KING HENRY THE FIFTH
    THE FIRST PART OF HENRY THE SIXTH
    THE SECOND PART OF KING HENRY THE SIXTH
    THE THIRD PART OF KING HENRY THE SIXTH
    KING HENRY THE EIGHTH
    THE LIFE AND DEATH OF KING JOHN
    THE TRAGEDY OF JULIUS CAESAR
    THE TRAGEDY OF KING LEAR
    LOVE’S LABOUR’S LOST
    THE TRAGEDY OF MACBETH
    MEASURE FOR MEASURE
    THE MERCHANT OF VENICE
    THE MERRY WIVES OF WINDSOR
    A MIDSUMMER NIGHT’S DREAM
    MUCH ADO ABOUT NOTHING
    THE TRAGEDY OF OTHELLO, THE MOOR OF VENICE
    PERICLES, PRINCE OF TYRE
    KING RICHARD THE SECOND
    KING RICHARD THE THIRD
    THE TRAGEDY OF ROMEO AND JULIET
    THE TAMING OF THE SHREW
    THE TEMPEST
    

In [31]:

class CharTokenizer:
  """Character-level tokenizer: every distinct character maps to one integer id.

  A character's token id is its position in the (de-duplicated) vocabulary, so
  ids always lie in ``range(vocabulary_size())``.
  """

  def __init__(self, vocabulary):
    """Build the char -> id and id -> char lookup tables.

    Args:
      vocabulary: Iterable of characters. It is consumed exactly once, so a
        generator is fine. Repeated characters are dropped, keeping the first
        occurrence, so that the two tables stay exact inverses of each other.
    """
    # dict.fromkeys preserves insertion order while removing duplicates.
    vocabulary = list(dict.fromkeys(vocabulary))
    self.token_id_for_char = {char: token_id for token_id, char in enumerate(vocabulary)}
    self.char_for_token_id = dict(enumerate(vocabulary))

  @staticmethod
  def train_from_text(text):
    """Create a tokenizer whose vocabulary is every distinct character of ``text``.

    Characters are sorted so the id assignment is deterministic for a given text.
    """
    return CharTokenizer(sorted(set(text)))

  def encode(self, text):
    """Convert ``text`` into a 1-D ``torch.long`` tensor of token ids.

    Raises:
      KeyError: If ``text`` contains a character that is not in the vocabulary.
    """
    try:
      token_ids = [self.token_id_for_char[char] for char in text]
    except KeyError as err:
      # Same exception type as a plain dict lookup, but with a readable message.
      raise KeyError(
        f"Character {err.args[0]!r} is not in the tokenizer vocabulary"
      ) from None
    return torch.tensor(token_ids, dtype=torch.long)

  def decode(self, token_ids):
    """Convert token ids back into a string.

    Args:
      token_ids: A tensor of ids (anything exposing ``.tolist()``), a plain
        iterable of ints, or a single int / 0-dim tensor.

    Raises:
      KeyError: If an id is not in the vocabulary.
    """
    if hasattr(token_ids, "tolist"):
      token_ids = token_ids.tolist()
    if isinstance(token_ids, int):
      # A 0-dim tensor's tolist() yields a bare int; treat it as one token.
      token_ids = [token_ids]
    return ''.join(self.char_for_token_id[token_id] for token_id in token_ids)

  def vocabulary_size(self):
    """Return the number of distinct characters in the vocabulary."""
    return len(self.token_id_for_char)

In [32]:
# Build the tokenizer vocabulary from every distinct character in the corpus.
tokenizer = CharTokenizer.train_from_text(text)

In [33]:
# Sanity check: encode a short string to token ids, then decode the ids back to text.
print(tokenizer.encode("Hello world"))
print(tokenizer.decode(tokenizer.encode("Hello world")))

tensor([31, 57, 64, 64, 67,  2, 75, 67, 70, 64, 56])
Hello world


In [34]:
# Report how many distinct characters the tokenizer learned from the corpus.
print(f"Vocabulary size: {tokenizer.vocabulary_size()}")

Vocabulary size: 98


In [35]:
# Step 1 - Define the `TokenIdsDataset` Class

from torch.utils.data import Dataset

class TokenIdsDataset(Dataset):
  """Map-style dataset of fixed-length token windows for next-token prediction.

  Item ``pos`` is a pair ``(x, y)``: ``x`` holds ``block_size`` consecutive
  entries of ``data`` starting at ``pos`` and ``y`` holds the same window
  shifted one position to the right, so ``y[i]`` is the token following ``x[i]``.
  """

  def __init__(self, data, block_size):
    """Store the token sequence and the window length.

    Args:
      data: Sliceable sequence of token ids (e.g. a 1-D tensor).
      block_size: Number of tokens in each input/target window.
    """
    self.data = data
    self.block_size = block_size

  def __len__(self):
    """Return the number of valid start positions.

    The target window reaches one element past the input window, so the last
    usable start is ``len(data) - block_size - 1``. The result is clamped at
    zero because ``len()`` must never be negative (data shorter than a block).
    """
    return max(0, len(self.data) - self.block_size)

  def __getitem__(self, pos):
    """Return the ``(x, y)`` windows starting at ``pos``.

    Negative positions count from the end, like built-in sequences.

    Raises:
      IndexError: If ``pos`` is outside the dataset. Raising (rather than
        returning ``None``) is what lets plain iteration over the dataset stop.
    """
    size = len(self)
    index = pos + size if pos < 0 else pos
    if not 0 <= index < size:
      raise IndexError(f"position {pos} is out of range for dataset of size {size}")

    # Input window and its one-step-shifted target window.
    x = self.data[index:index + self.block_size]
    y = self.data[index + 1:index + 1 + self.block_size]
    return x, y

In [36]:
# Step 2 - Tokenize the Text
# Encode the whole corpus into a single tensor of token ids.
tokenized_text = tokenizer.encode(text)
# Each dataset item is a window of 64 token ids plus its target shifted by one position.
dataset = TokenIdsDataset(tokenized_text, block_size=64)

In [37]:
# Step 3 - Retrieve the First Item from the Dataset

# Get the first item from the dataset: "x" is the input window, "y" the shifted target window
x, y = dataset[0]
# Decode "x" back into text using tokenizer.decode
tokenizer.decode(x)

'    THE SONNETS\n    ALL’S WELL THAT ENDS WELL\n    THE TRAGEDY OF'

In [27]:
from torch.utils.data import DataLoader, RandomSampler

# RandomSampler draws random item positions from the dataset (with replacement here)
sampler = RandomSampler(dataset, replacement=True)
# DataLoader will load batches of two random samples using the sampler
dataloader = DataLoader(dataset, batch_size=2, sampler=sampler)

In [38]:
# Step 4 - Use a DataLoader
# NOTE(review): this import and the sampler/dataloader setup below repeat the previous cell verbatim.
from torch.utils.data import DataLoader, RandomSampler

# Get a single batch from the "dataloader":
# calling `iter` on the DataLoader instance creates an iterator, and
# calling `next` on that iterator returns the first training batch.
sampler = RandomSampler(dataset, replacement=True)
dataloader = DataLoader(dataset, batch_size=2, sampler=sampler)
x, y = next(iter(dataloader))
# NOTE(review): no random seed is set in the visible cells, so the sampled batch presumably differs between runs - confirm.
# Batch shape is (batch_size, block_size).
x.shape

torch.Size([2, 64])

In [39]:
# Decode the first input sequence of the batch back into text
tokenizer.decode(x[0])

'\n\n[_Aside._] Here comes a flattering rascal; upon him\nWill I fir'

In [40]:
# Decode the first target sequence of the batch: the same text as the input, shifted forward by one character
tokenizer.decode(y[0])


'\n[_Aside._] Here comes a flattering rascal; upon him\nWill I firs'