## Load data

In [149]:
import requests

# Tiny Shakespeare corpus (plain text) from the char-rnn repository
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# A timeout keeps the cell from hanging indefinitely on a stalled connection
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently using an error page as the corpus
response.raise_for_status()
data = response.text

## Data Exploration

In [150]:
# Total number of characters in the corpus (the two spaces after the colon are intentional)
print(f"length of dataset in characters:  {len(data)}")

length of dataset in characters:  1115394


In [151]:
# Peek at the beginning of the corpus
preview_len = 1000
print(data[:preview_len])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [152]:
# The vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(data))
vocab_size = len(chars)

print("".join(chars))
print('vocab size: ', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size:  65


## Data pre-processing

### Option 1: Tokenize in sub-words

In [153]:
import tiktoken

# Load tiktoken's 'gpt2' encoding (sub-word tokenizer)
enc = tiktoken.get_encoding('gpt2')
# Size of this encoding's vocabulary; shown as the cell output
enc.n_vocab

50257

In [154]:
# Round-trip a sample string through the sub-word tokenizer
sample_text = "Hello, World!"
sample_tokens = enc.encode(sample_text)
print(sample_tokens)
print(enc.decode(sample_tokens))

[15496, 11, 2159, 0]
Hello, World!


### Option 2: Tokenize in characters

In [155]:
# Character-level vocabulary lookups: character -> integer id, and back
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = dict(enumerate(chars))


def encode(s):
    """Encode a string as a list of integer ids, one per character.

    Raises KeyError if `s` contains a character that is not in `char_to_int`.
    """
    return [char_to_int[ch] for ch in s]


def decode(x):
    """Decode an iterable of integer ids back into a string.

    Raises KeyError if `x` contains an id that is not in `int_to_char`.
    """
    return ''.join(int_to_char[i] for i in x)


print(encode("Hello, World!"))
print(decode(encode("Hello, World!")))

[20, 43, 50, 50, 53, 6, 1, 35, 53, 56, 50, 42, 2]
Hello, World!


### Train-test split

In [156]:
# Put the data into a tensor
import torch

# Pick the compute device: CUDA when it is available, otherwise the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Character-encode the whole corpus and store it as an int64 tensor on that device
data_tensor = torch.tensor(
    encode(data),
    dtype=torch.long,
    device=device,
)

print(data_tensor.shape, data_tensor.dtype)
print(data_tensor[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [157]:
# NOTE: despite its name, `test_size` is the fraction of the data kept for
# training (the leading 90%); the remaining 10% becomes the test set.
test_size = 0.9
split_idx = int(test_size * len(data_tensor))  # first index of the test portion
train_data_tensor, test_data_tensor = data_tensor[:split_idx], data_tensor[split_idx:]

In [158]:
# Context length (number of tokens per input sequence) for the walkthrough
block_size = 8
# Display one chunk of block_size + 1 tokens from the start of the training data
train_data_tensor[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

### Batch data

In [159]:
# Inputs are the first block_size tokens; targets are the same window shifted by one
x = train_data_tensor[:block_size]
y = train_data_tensor[1:block_size+1]

# Every prefix of x is one training example, labelled with the token that follows it
for end in range(1, block_size + 1):
    print(f"When input is {x[:end].tolist()}, target is {y[end - 1]}")

When input is [18], target is 47
When input is [18, 47], target is 56
When input is [18, 47, 56], target is 57
When input is [18, 47, 56, 57], target is 58
When input is [18, 47, 56, 57, 58], target is 1
When input is [18, 47, 56, 57, 58, 1], target is 15
When input is [18, 47, 56, 57, 58, 1, 15], target is 47
When input is [18, 47, 56, 57, 58, 1, 15, 47], target is 58


In [173]:
# Batch geometry: sequences per batch, tokens per sequence
batch_size, block_size = 8, 128
# Fix the RNG seed so the randomly drawn batch is reproducible
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random batch of input/target sequences.

    `split` selects the source tensor: 'train' uses train_data_tensor, any
    other value uses test_data_tensor. Reads the module-level `batch_size`
    and `block_size` at call time.

    Returns (x, y), each of shape (batch_size, block_size); y is x shifted
    one position to the right in the source data.
    """
    source = train_data_tensor if split == 'train' else test_data_tensor
    # Random start offset for every sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

print("------")

# Printing every (context, target) pair would emit batch_size * block_size
# lines whose contexts grow up to block_size tokens, flooding the notebook
# output. Show only a small preview of the batch instead.
preview_rows = min(batch_size, 2)
preview_steps = min(block_size, 8)

for b in range(preview_rows):
    print("batch", b)
    for t in range(preview_steps):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"When input is {context.tolist()}, target is {target}")

inputs:
torch.Size([8, 128])
tensor([[ 1, 45, 59,  ...,  1, 55, 59],
        [52, 47, 57,  ..., 54, 56, 43],
        [50,  1, 15,  ..., 42, 43, 43],
        ...,
        [16, 33, 23,  ..., 58,  1, 39],
        [ 1, 53, 56,  ...,  0, 32, 46],
        [37, 10,  0,  ..., 54, 53, 57]])
targets:
torch.Size([8, 128])
tensor([[45, 59, 43,  ..., 55, 59, 43],
        [47, 57, 46,  ..., 56, 43, 60],
        [ 1, 15, 53,  ..., 43, 43, 42],
        ...,
        [33, 23, 17,  ...,  1, 39, 57],
        [53, 56,  1,  ..., 32, 46, 43],
        [10,  0, 32,  ..., 53, 57, 57]])
------
batch 0
When input is [1], target is 45
When input is [1, 45], target is 59
When input is [1, 45, 59], target is 43
When input is [1, 45, 59, 43], target is 57
When input is [1, 45, 59, 43, 57], target is 57
When input is [1, 45, 59, 43, 57, 57], target is 1
When input is [1, 45, 59, 43, 57, 57, 1], target is 61
When input is [1, 45, 59, 43, 57, 57, 1, 61], target is 46
When input is [1, 45, 59, 43, 57, 57, 1, 61, 46], tar

## Implement bigram language model

In [176]:
import torch.nn as nn
from torch.nn import functional as F

# Re-seed the global RNG before the model is created and sampled from in this cell
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    A single (vocab_size x vocab_size) embedding table is used, so looking up
    a token id directly yields that token's row of next-token logits.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T).
            targets: optional integer token ids of shape (B, T).

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is returned flattened to
            (B*T, C) and loss is the cross-entropy against the flattened
            targets.
        """
        logits = self.token_embedding(idx)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) targets,
            # so the batch and time dimensions are merged.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(-1)

            # How well are we predicting the next token based on the logits?
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, idx, steps):
        """Autoregressively sample `steps` new tokens and append them to `idx`.

        Args:
            idx: integer token ids of shape (B, T) used as the starting context.
            steps: number of tokens to sample.

        Returns:
            Tensor of shape (B, T + steps) holding the context followed by
            the sampled tokens.
        """
        for _ in range(steps):
            # get the predictions
            logits, _ = self(idx)
            # keep only the last time step
            logits = logits[:, -1, :]  # (B, C)
            # turn logits into probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

    
model = BigramLanguageModel(vocab_size).to(device)

# Sanity check on one batch: for an untrained model the loss is expected to be
# close to -log(1/vocab_size) = -ln(1/65) = 4.17
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 new tokens, starting from a single token with id 0
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled = model.generate(idx, steps=100)
print(decode(sampled[0].tolist()))

torch.Size([1024, 65])
tensor(4.7254, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


## Train model

In [177]:
# AdamW optimizer over all model parameters.
# 1e-3 is used as the starting learning rate.
learning_rate = 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [185]:
# NOTE: this rebinds the module-level batch_size that get_batch reads at call time
batch_size = 32
num_steps = 10000
log_every = 1000

for steps in range(num_steps):
    xb, yb = get_batch('train')   # fresh random training batch
    logits, loss = model(xb, yb)  # forward pass

    # backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # report the loss of the current batch every log_every steps
    if steps % log_every == 0:
        print(f"Step {steps}, Loss {loss.item()}")

Step 0, Loss 2.450605869293213
Step 1000, Loss 2.4364399909973145
Step 2000, Loss 2.4400222301483154
Step 3000, Loss 2.465681791305542
Step 4000, Loss 2.438530206680298
Step 5000, Loss 2.4517455101013184
Step 6000, Loss 2.479297399520874
Step 7000, Loss 2.4663240909576416
Step 8000, Loss 2.4558663368225098
Step 9000, Loss 2.4201338291168213


In [188]:
# Sample 100 tokens from the model, starting from a single token with id 0
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(idx, steps=100)
print(decode(generated[0].tolist()))


METour t funosto be hanggales?
She
Thant be And willllloworve:
Mimimig.

KICHOR: se thime t y mad th
