<a href="https://colab.research.google.com/github/SwimLane2/udacity-ai-masters/blob/main/AI%20Programming%20with%20Python/Introduction-to-transformer-neural-networks/02_char_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Report which device the notebook will run on
print(f"Using device: {device}")

Using device: cpu


In [5]:
from pathlib import Path

# Load the whole corpus into memory as one string. The encoding is given
# explicitly so decoding does not depend on the platform's default locale.
text = Path('/content/tiny-shakespeare.txt').read_text(encoding='utf-8')

In [6]:
# Preview the first 1000 characters of the corpus
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [7]:
class CharTokenizer:
    """Character-level tokenizer: every distinct character is one token.

    A token id is the position of its character in the vocabulary, so ids
    run from 0 to ``vocabulary_size() - 1``.
    """

    def __init__(self, vocabulary):
        """Build the character <-> token-id lookup tables.

        Args:
            vocabulary: Iterable of characters. A character that appears
                more than once is kept only at its first position, so the
                ids stay contiguous and the two tables remain exact
                inverses of each other.
        """
        # dict.fromkeys drops duplicates while preserving first-seen order.
        unique_chars = list(dict.fromkeys(vocabulary))
        self.token_id_for_char = {
            char: token_id for token_id, char in enumerate(unique_chars)
        }
        self.char_for_token_id = dict(enumerate(unique_chars))

    @staticmethod
    def train_from_text(text):
        """Create a tokenizer whose vocabulary is the sorted set of
        characters that occur in ``text``."""
        return CharTokenizer(sorted(set(text)))

    def encode(self, text):
        """Convert a string into a 1-D ``torch.long`` tensor of token ids.

        Raises:
            KeyError: If ``text`` contains a character outside the vocabulary.
        """
        token_ids = [self.token_id_for_char[char] for char in text]
        return torch.tensor(token_ids, dtype=torch.long)

    def decode(self, token_ids):
        """Convert token ids back into a string.

        Args:
            token_ids: A tensor of ids (anything exposing ``.tolist()``) or
                a plain iterable of ints.

        Raises:
            KeyError: If an id is not part of the vocabulary.
        """
        if hasattr(token_ids, "tolist"):
            ids = token_ids.tolist()
        else:
            ids = list(token_ids)
        # A 0-d tensor's tolist() yields a bare scalar rather than a list.
        if not isinstance(ids, list):
            ids = [ids]
        return "".join(self.char_for_token_id[token_id] for token_id in ids)

    def vocabulary_size(self):
        """Return the number of distinct tokens known to the tokenizer."""
        return len(self.token_id_for_char)

In [8]:
# Build the vocabulary from every distinct character found in the corpus
tokenizer = CharTokenizer.train_from_text(text)

In [9]:
# Sanity check: each character becomes one integer token id
print(tokenizer.encode("Hello world"))

tensor([20, 43, 50, 50, 53,  1, 61, 53, 56, 50, 42])


In [10]:
# Round trip: decoding the encoded ids should reproduce the original string
print(tokenizer.decode(tokenizer.encode("Hello world")))

Hello world


In [11]:
# Number of distinct characters in the trained vocabulary
tokenizer.vocabulary_size()

65

In [12]:
import pprint
# Pretty-printer used to display the tokenizer's lookup tables
pp = pprint.PrettyPrinter(depth=4)

In [13]:
# Show the token id -> character table
pp.pprint(tokenizer.char_for_token_id)

{0: '\n',
 1: ' ',
 2: '!',
 3: '$',
 4: '&',
 5: "'",
 6: ',',
 7: '-',
 8: '.',
 9: '3',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: 'a',
 40: 'b',
 41: 'c',
 42: 'd',
 43: 'e',
 44: 'f',
 45: 'g',
 46: 'h',
 47: 'i',
 48: 'j',
 49: 'k',
 50: 'l',
 51: 'm',
 52: 'n',
 53: 'o',
 54: 'p',
 55: 'q',
 56: 'r',
 57: 's',
 58: 't',
 59: 'u',
 60: 'v',
 61: 'w',
 62: 'x',
 63: 'y',
 64: 'z'}


In [14]:
# Show the character -> token id table
pp.pprint(tokenizer.token_id_for_char)

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}


In [22]:
# Step 1 - Define the `TokenIdsDataset` Class

from torch.utils.data import Dataset

class TokenIdsDataset(Dataset):
    """Sliding-window dataset over a sequence of token ids.

    Item ``pos`` is the pair ``(x, y)``: ``x`` holds the ``block_size``
    tokens starting at ``pos`` and ``y`` holds the same window shifted one
    position to the right, i.e. the next-token target for every position
    in ``x``.
    """

    def __init__(self, data, block_size):
        """Store the token sequence and the window length.

        Args:
            data: Sliceable sequence of token ids (e.g. a 1-D tensor).
            block_size: Number of tokens in each input/target window.

        Raises:
            ValueError: If ``block_size`` is not a positive number.
        """
        if block_size < 1:
            raise ValueError(f"block_size must be >= 1, got {block_size}")
        self.data = data
        self.block_size = block_size

    def __len__(self):
        """Return how many full (input, target) pairs the data can provide."""
        # A start position is valid only if the target window, which ends
        # one token later than the input window, still fits. Clamp at zero
        # so data shorter than a window gives an empty dataset instead of
        # an illegal negative length.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, pos):
        """Return the ``(x, y)`` pair that starts at position ``pos``.

        Negative positions count from the end, as with Python sequences.

        Raises:
            IndexError: If ``pos`` is out of range. Raising IndexError
                (rather than asserting) also lets plain iteration over the
                dataset stop cleanly at the end.
        """
        size = len(self)
        index = pos + size if pos < 0 else pos
        if not 0 <= index < size:
            raise IndexError(
                f"position {pos} is out of range for a dataset of size {size}"
            )

        # Input window, and the target window shifted by one position.
        x = self.data[index:index + self.block_size]
        y = self.data[index + 1:index + 1 + self.block_size]
        return x, y

In [25]:
# Step 2 - Tokenize the Text

# Encode the whole corpus into a tensor of token ids, then wrap it in a
# "TokenIdsDataset" that serves windows of block_size=64 tokens
tokenized_text = tokenizer.encode(text)
dataset = TokenIdsDataset(tokenized_text, block_size=64)

In [26]:
# Step 3 - Retrieve the First Item from the Dataset

# Fetch the first (input, target) pair from the dataset and
# decode the input "x" back to text using tokenizer.decode

x, y = dataset[0]
print(tokenizer.decode(x))

First Citizen:
Before we proceed any further, hear me speak.

Al


In [28]:
from torch.utils.data import DataLoader, RandomSampler

# RandomSampler draws random items from the dataset (here with replacement)
sampler = RandomSampler(dataset, replacement=True)
# The DataLoader uses the sampler to assemble batches of two random items
dataloader = DataLoader(dataset, batch_size=2, sampler=sampler)

In [29]:
# Step 4 - Use a DataLoader

# Get a single batch from the "dataloader":
# `iter` creates an iterator over the DataLoader instance, and
# `next` pulls the first training batch from that iterator

x, y = next(iter(dataloader))

In [30]:
# The batch dimension comes first: (batch_size, block_size)
print(x.shape)

torch.Size([2, 64])


In [31]:
# Decode the first input sequence in the batch
print(tokenizer.decode(x[0]))



 the signal to begin.

Lord Marshal:
Sound, trumpets; and set fo


In [32]:
# Decode the matching target sequence (the input shifted one token ahead)
print(tokenizer.decode(y[0]))

the signal to begin.

Lord Marshal:
Sound, trumpets; and set for
