In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [4]:
# read in all the text
text = open("input.txt", "r", encoding="utf-8").read().splitlines()
print(len(text))
print(max(len(w) for w in text))
print(text[:8])

90978
215
['MOBY DICK; OR THE WHALE ', '', 'by Herman Melville', '', '', '', '', 'ETYMOLOGY.']


In [13]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set("".join(text))))
stoi = {s: i + 1 for i, s in enumerate(chars)}
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '0', 10: '1', 11: '2', 12: '3', 13: '4', 14: '5', 15: '6', 16: '7', 17: '8', 18: '9', 19: ':', 20: ';', 21: '?', 22: 'A', 23: 'B', 24: 'C', 25: 'D', 26: 'E', 27: 'F', 28: 'G', 29: 'H', 30: 'I', 31: 'J', 32: 'K', 33: 'L', 34: 'M', 35: 'N', 36: 'O', 37: 'P', 38: 'Q', 39: 'R', 40: 'S', 41: 'T', 42: 'U', 43: 'V', 44: 'W', 45: 'X', 46: 'Y', 47: 'Z', 48: 'a', 49: 'b', 50: 'c', 51: 'd', 52: 'e', 53: 'f', 54: 'g', 55: 'h', 56: 'i', 57: 'j', 58: 'k', 59: 'l', 60: 'm', 61: 'n', 62: 'o', 63: 'p', 64: 'q', 65: 'r', 66: 's', 67: 't', 68: 'u', 69: 'v', 70: 'w', 71: 'x', 72: 'y', 73: 'z'}
73


In [14]:
# create a mapping from characters to integers


def encode(s):
    return [stoi[c] for c in s]                     # encoder: take a string, output a list of integers
def decode(lst):
    return "".join([itos[i] for i in lst])            # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[55, 56, 56, 1, 67, 55, 52, 65, 52]
hii there


In [15]:
# build the dataset
block_size = (
    8  # context length: how many characters do we take to predict the next one?
)


def build_dataset(text):
    X, Y = [], []

    for w in text:
        context = [0] * block_size
        for ch in w + ".":
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # crop and append

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y


n1 = int(0.8 * len(text))
n2 = int(0.9 * len(text))
Xtr, Ytr = build_dataset(text[:n1])  # 80%
Xdev, Ydev = build_dataset(text[n1:n2])  # 10%
Xte, Yte = build_dataset(text[n2:])  # 10%

torch.Size([3163351, 8]) torch.Size([3163351])
torch.Size([257741, 8]) torch.Size([257741])
torch.Size([237823, 8]) torch.Size([237823])


In [16]:
for x, y in zip(Xtr[:20], Ytr[:20]):
    print("".join(itos[ix.item()] for ix in x), "-->", itos[y.item()])

KeyError: 0

In [15]:
# let's now encode the entire text dataset and store it into a torch.Tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(
    data[:1000]
)  # the 1000 characters we looked at earier will to the GPT look like this

torch.Size([3658915]) torch.int64
tensor([34, 36, 23, 46,  1, 25, 30, 24, 32, 20,  1, 36, 39,  1, 41, 29, 26,  1,
        44, 29, 22, 33, 26,  1,  0,  0, 49, 72,  1, 29, 52, 65, 60, 48, 61,  1,
        34, 52, 59, 69, 56, 59, 59, 52,  0,  0,  0,  0,  0, 26, 41, 46, 34, 36,
        33, 36, 28, 46,  8,  0,  0, 40, 68, 63, 63, 59, 56, 52, 51,  1, 49, 72,
         1, 48,  1, 33, 48, 67, 52,  1, 24, 62, 61, 66, 68, 60, 63, 67, 56, 69,
        52,  1, 42, 66, 55, 52, 65,  1, 67, 62,  1, 48,  1, 28, 65, 48, 60, 60,
        48, 65,  1, 40, 50, 55, 62, 62, 59,  0,  0, 41, 55, 52,  1, 63, 48, 59,
        52,  1, 42, 66, 55, 52, 65,  7,  7, 67, 55, 65, 52, 48, 51, 49, 48, 65,
        52,  1, 56, 61,  1, 50, 62, 48, 67,  6,  1, 55, 52, 48, 65, 67,  6,  1,
        49, 62, 51, 72,  6,  1, 48, 61, 51,  1, 49, 65, 48, 56, 61, 20,  1, 30,
         1, 66, 52, 52,  1, 55, 56, 60,  0, 61, 62, 70,  8,  1,  1, 29, 52,  1,
        70, 48, 66,  1, 52, 69, 52, 65,  1, 51, 68, 66, 67, 56, 61, 54,  1, 55,
      

In [53]:
# Let's now split up the data into train and validation sets
chunk_size = len(data) // 100  # Size of each of the 100 chunks
train_indices = []
val_indices = []

# Assign chunks to train and validation sets
for i in range(100):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size if i < 99 else len(data)
    if i % 10 == 0:  # Every 10th chunk goes to validation
        val_indices.extend(range(start_idx, end_idx))
    else:  # The rest go to training
        train_indices.extend(range(start_idx, end_idx))

# Create train and validation datasets
train_data = data[train_indices]
val_data = data[val_indices]

# Print the sizes to verify
print(f"Total data size: {len(data)}")
print(f"Train set size: {len(train_data)} ({len(train_data)/len(data):.1%})")
print(f"Valid set size: {len(val_data)} ({len(val_data)/len(data):.1%})")

Total data size: 3658915
Train set size: 3293025 (90.0%)
Valid set size: 365890 (10.0%)


Checking if I can decode the validation set and look at some of the text in there.

In [24]:
0 % 10

0

In [25]:
for i in range(20):
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


Ok, makes sense because of 0 indexing the very start would be part of the validation set.

I can do this to only get some of the list printed and decoded, obviously.

In [49]:
code = encode("hii there")

print(decode(code[:4]))

hii 


Ok, the Trick is to use `.tolist()`

In [48]:
small_data = data[:1000]
print(decode(small_data.tolist()))

MOBY DICK; OR THE WHALE 

by Herman Melville




ETYMOLOGY.

Supplied by a Late Consumptive Usher to a Grammar School

The pale Usher--threadbare in coat, heart, body, and brain; I see him
now.  He was ever dusting his old lexicons and grammars, with a queer
handkerchief, mockingly embellished with all the gay flags of all the
known nations of the world.  He loved to dust his old grammars; it
somehow mildly reminded him of his mortality.

While you take in hand to school others, and to teach them by what
name a whale-fish is to be called in our tongue leaving out, through
ignorance, the letter H, which almost alone maketh the signification
of the word, you deliver that which is not true. --HACKLUYT

WHALE. ... Sw. and Dan. HVAL.  This animal is named from roundness
or rolling; for in Dan. HVALT is arched or vaulted. --WEBSTER'S
DICTIONARY

WHALE. ... It is more immediately from the Dut. and Ger. WALLEN;
A.S. WALW-IAN, to roll, to wallow. --RICHARDSON'S DICTIONARY

KETOS,               

Ok, now this worked, and I can see that `train_data` really is different.

In [51]:
print(decode(train_data.tolist()))

patched boots are stopping the way.  So on I
went.  I now by instinct followed the streets that took me waterward,
for there, doubtless, were the cheapest, if not the cheeriest inns.

Such dreary streets! blocks of blackness, not houses, on either
hand, and here and there a candle, like a candle moving about in a
tomb.  At this hour of the night, of the last day of the week, that
quarter of the town proved all but deserted.  But presently I came to
a smoky light proceeding from a low, wide building, the door of which
stood invitingly open.  It had a careless look, as if it were meant
for the uses of the public; so, entering, the first thing I did was
to stumble over an ash-box in the porch.  Ha! thought I, ha, as the
flying particles almost choked me, are these ashes from that
destroyed city, Gomorrah?  But The Crossed Harpoons, and The
Sword-Fish?--this, then must needs be the sign of The Trap.
However, I picked myself up and hearing a loud voice within, pushed
on and opened a second,

This is great, I can see a couple of snippets of the training data, and yes, all the books are there.

In [69]:
k = len(train_data) // 8

for n in range(7):

    print(decode(train_data[n * k : n * k + 200].tolist()) + "\n---")

patched boots are stopping the way.  So on I
went.  I now by instinct followed the streets that took me waterward,
for there, doubtless, were the cheapest, if not the cheeriest inns.

Such dreary stre
---
ly vivacious, tumultuous,
ostentatious little Flask would now and then stamp with impatience;
but not one added heave did he thereby give to the negro's lordly
chest.  So have I seen Passion and Vanit
---
t
is, Virtue and Vice; we try to reach Virtue, when lo! comes Cancer
the Crab, and drags us back; and here, going from Virtue, Leo, a
roaring Lion, lies in the path--he gives a few fierce bites and su
---
that sea when he came to it.
Oh, said Ana.
What about you? I asked. You know, the Comet Kings wife
was . . . 

84

CHAPTER 6

Im not your wife, said Ana. The whole marriage ritual was a
test. Im glad 
---
aternal
line. I am nothing.
Christianity doesnt pass through any line, said Father Ellis. Its
open to anyone who wants it. And I mention it because it has a lot
to say on the issue

In [71]:
block_size = 8
train_data[: block_size + 1]

tensor([63, 48, 67, 50, 55, 52, 51,  1, 49])

In [73]:
x = train_data[:block_size]
y = train_data[1 : block_size + 1]
for t in range(block_size):
    context = x[: t + 1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([63]) the target: 48
when input is tensor([63, 48]) the target: 67
when input is tensor([63, 48, 67]) the target: 50
when input is tensor([63, 48, 67, 50]) the target: 55
when input is tensor([63, 48, 67, 50, 55]) the target: 52
when input is tensor([63, 48, 67, 50, 55, 52]) the target: 51
when input is tensor([63, 48, 67, 50, 55, 52, 51]) the target: 1
when input is tensor([63, 48, 67, 50, 55, 52, 51,  1]) the target: 49


This makes sense.