In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Prefer the GPU when a usable CUDA runtime is present; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [2]:
# Show which device was selected in the cell above.
print(f"Using device: {device}")

Using device: cuda


In [9]:
from pathlib import Path

# Decode explicitly as UTF-8: the corpus contains non-ASCII characters
# (e.g. ’, é, —), and without an explicit encoding read_text() falls back to
# the platform's locale encoding, which is not UTF-8 everywhere.
text = Path('tiny_shakespare.txt').read_text(encoding='utf-8')

In [10]:
# Preview the first 1000 characters of the loaded corpus.
print(text[:1000])

Title: The Complete Works of William Shakespeare

Author: William Shakespeare

Release date: January 1, 1994 [eBook #100]
                Most recently updated: October 29, 2024

Language: English



*** START OF THE PROJECT GUTENBERG EBOOK THE COMPLETE WORKS OF WILLIAM SHAKESPEARE ***
The Complete Works of William Shakespeare

by William Shakespeare




                    Contents

    THE SONNETS
    ALL’S WELL THAT ENDS WELL
    THE TRAGEDY OF ANTONY AND CLEOPATRA
    AS YOU LIKE IT
    THE COMEDY OF ERRORS
    THE TRAGEDY OF CORIOLANUS
    CYMBELINE
    THE TRAGEDY OF HAMLET, PRINCE OF DENMARK
    THE FIRST PART OF KING HENRY THE FOURTH
    THE SECOND PART OF KING HENRY THE FOURTH
    THE LIFE OF KING HENRY THE FIFTH
    THE FIRST PART OF HENRY THE SIXTH
    THE SECOND PART OF KING HENRY THE SIXTH
    THE THIRD PART OF KING HENRY THE SIXTH
    KING HENRY THE EIGHTH
    THE LIFE AND DEATH OF KING JOHN
    THE TRAGEDY OF JULIUS CAESAR
    THE TRAGEDY OF KING LEAR
    LOVE’S LABOUR’S

In [11]:
class CharTokenizer:
  """Character-level tokenizer: every distinct character maps to one integer id.

  A character's token id is its position in the `vocabulary` sequence handed to
  the constructor. Two dictionaries are kept so that both directions of the
  mapping are a single lookup.
  """

  def __init__(self, vocabulary):
    """Build the char -> id and id -> char lookup tables.

    Args:
      vocabulary: iterable of characters; the position of each character
        becomes its token id.
    """
    # Materialise once so one-shot iterables (e.g. generators) can safely be
    # enumerated for both tables.
    vocabulary = list(vocabulary)
    self.token_id_for_char = {char: token_id for token_id, char in enumerate(vocabulary)}
    self.char_for_token_id = dict(enumerate(vocabulary))

  @staticmethod
  def train_from_text(text):
    """Create a tokenizer whose vocabulary is the sorted set of characters in `text`.

    Args:
      text: string to collect the characters from.

    Returns:
      A CharTokenizer; ids follow the sorted order of the distinct characters.
    """
    return CharTokenizer(sorted(set(text)))

  def encode(self, text):
    """Convert a string into a 1-D `torch.long` tensor of token ids.

    Args:
      text: string to encode.

    Returns:
      Tensor with one token id per character of `text`.

    Raises:
      KeyError: if `text` contains a character that is not in the vocabulary.
    """
    token_ids = [self.token_id_for_char[char] for char in text]
    return torch.tensor(token_ids, dtype=torch.long)

  def decode(self, token_ids):
    """Convert token ids back into a string.

    Args:
      token_ids: a 1-D tensor of ids, a 0-d tensor / single int id, or any
        plain iterable of int ids (e.g. a list).

    Returns:
      The string formed by concatenating the character of each id.

    Raises:
      KeyError: if an id is not in the vocabulary.
    """
    # Tensors (and other array types) are converted to plain Python values
    # first; plain iterables are used as they are.
    if hasattr(token_ids, 'tolist'):
      token_ids = token_ids.tolist()
    # A 0-d tensor's tolist() yields a bare int rather than a list.
    if isinstance(token_ids, int):
      token_ids = [token_ids]
    return ''.join(self.char_for_token_id[token_id] for token_id in token_ids)

  def vocabulary_size(self):
    """Return the number of distinct characters known to the tokenizer."""
    return len(self.token_id_for_char)

In [12]:
# Build the tokenizer's vocabulary from every distinct character in the corpus.
tokenizer = CharTokenizer.train_from_text(text)

In [13]:
# Sanity check: encode a short string into its tensor of token ids.
print(tokenizer.encode("Hello world"))

tensor([36, 62, 69, 69, 72,  2, 80, 72, 75, 69, 61])


In [14]:
# Round-trip check: decoding the encoded string should give back the original text.
print(tokenizer.decode(tokenizer.encode("Hello world")))

Hello world


In [15]:
# Number of distinct characters in the learned vocabulary.
tokenizer.vocabulary_size()

106

In [16]:
import pprint
# Pretty-printer used to dump the tokenizer's lookup tables one entry per line;
# depth=4 limits how many nesting levels are expanded.
pp = pprint.PrettyPrinter(depth=4)

In [17]:
# Inspect the id -> character table.
pp.pprint(tokenizer.char_for_token_id)

{0: '\t',
 1: '\n',
 2: ' ',
 3: '!',
 4: '#',
 5: '$',
 6: '%',
 7: '&',
 8: "'",
 9: '(',
 10: ')',
 11: '*',
 12: ',',
 13: '-',
 14: '.',
 15: '/',
 16: '0',
 17: '1',
 18: '2',
 19: '3',
 20: '4',
 21: '5',
 22: '6',
 23: '7',
 24: '8',
 25: '9',
 26: ':',
 27: ';',
 28: '?',
 29: 'A',
 30: 'B',
 31: 'C',
 32: 'D',
 33: 'E',
 34: 'F',
 35: 'G',
 36: 'H',
 37: 'I',
 38: 'J',
 39: 'K',
 40: 'L',
 41: 'M',
 42: 'N',
 43: 'O',
 44: 'P',
 45: 'Q',
 46: 'R',
 47: 'S',
 48: 'T',
 49: 'U',
 50: 'V',
 51: 'W',
 52: 'X',
 53: 'Y',
 54: 'Z',
 55: '[',
 56: ']',
 57: '_',
 58: 'a',
 59: 'b',
 60: 'c',
 61: 'd',
 62: 'e',
 63: 'f',
 64: 'g',
 65: 'h',
 66: 'i',
 67: 'j',
 68: 'k',
 69: 'l',
 70: 'm',
 71: 'n',
 72: 'o',
 73: 'p',
 74: 'q',
 75: 'r',
 76: 's',
 77: 't',
 78: 'u',
 79: 'v',
 80: 'w',
 81: 'x',
 82: 'y',
 83: 'z',
 84: 'À',
 85: 'Æ',
 86: 'Ç',
 87: 'É',
 88: 'à',
 89: 'â',
 90: 'æ',
 91: 'ç',
 92: 'è',
 93: 'é',
 94: 'ê',
 95: 'ë',
 96: 'î',
 97: 'œ',
 98: '—',
 99: '‘',
 100: '’

In [18]:
# Inspect the character -> id table.
pp.pprint(tokenizer.token_id_for_char)

{'\t': 0,
 '\n': 1,
 ' ': 2,
 '!': 3,
 '#': 4,
 '$': 5,
 '%': 6,
 '&': 7,
 "'": 8,
 '(': 9,
 ')': 10,
 '*': 11,
 ',': 12,
 '-': 13,
 '.': 14,
 '/': 15,
 '0': 16,
 '1': 17,
 '2': 18,
 '3': 19,
 '4': 20,
 '5': 21,
 '6': 22,
 '7': 23,
 '8': 24,
 '9': 25,
 ':': 26,
 ';': 27,
 '?': 28,
 'A': 29,
 'B': 30,
 'C': 31,
 'D': 32,
 'E': 33,
 'F': 34,
 'G': 35,
 'H': 36,
 'I': 37,
 'J': 38,
 'K': 39,
 'L': 40,
 'M': 41,
 'N': 42,
 'O': 43,
 'P': 44,
 'Q': 45,
 'R': 46,
 'S': 47,
 'T': 48,
 'U': 49,
 'V': 50,
 'W': 51,
 'X': 52,
 'Y': 53,
 'Z': 54,
 '[': 55,
 ']': 56,
 '_': 57,
 'a': 58,
 'b': 59,
 'c': 60,
 'd': 61,
 'e': 62,
 'f': 63,
 'g': 64,
 'h': 65,
 'i': 66,
 'j': 67,
 'k': 68,
 'l': 69,
 'm': 70,
 'n': 71,
 'o': 72,
 'p': 73,
 'q': 74,
 'r': 75,
 's': 76,
 't': 77,
 'u': 78,
 'v': 79,
 'w': 80,
 'x': 81,
 'y': 82,
 'z': 83,
 'À': 84,
 'Æ': 85,
 'Ç': 86,
 'É': 87,
 'à': 88,
 'â': 89,
 'æ': 90,
 'ç': 91,
 'è': 92,
 'é': 93,
 'ê': 94,
 'ë': 95,
 'î': 96,
 'œ': 97,
 '—': 98,
 '‘': 99,
 '’': 10