In [29]:
#download the dataset
!curl -O https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  5278k      0 --:--:-- --:--:-- --:--:-- 5287k


In [30]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(f"Length of text: {len(text)} characters")
print(text[:500])  # print the first 500 characters

Length of text: 1115394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [31]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(f"Vocabulary size: {vocab_size} unique characters")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary size: 65 unique characters


In [32]:
#encoder and decoder
stoi = { ch: i for i, ch in enumerate(chars) }
itos = { i: ch for i, ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] #take a string -> output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) #take a list of integers -> output a string

print(encode("hello world"))
print(decode(encode("hello world")))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


In [33]:
import torch 
data = torch.tensor(encode(text), dtype=torch.long) #convert to tensor
print(data.shape, data.dtype)
print(data[:100]) 

n = int(0.9 * len(data)) #90% for training, 10% for validation|
train_data = data[:n]
val_data = data[n:]

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [34]:
torch.manual_seed(1337)  # for reproducibility
block_size = 8 # maximum context size
batch_size = 4 # how many sequences to process at in parallel

def get_batch(split):
    # generate a small, random batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y


xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [43]:
#let's implement bigram language model 

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)  # for reproducibility

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        # Row i (of the embedding table) = logits for the next token given current token i
        # It’s directly acting as a conditional probability table for bigrams
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensor of integers
        logits = self.token_embedding_table(idx) # (B, T, C)

        if targets is None:
            loss = None
        else:    
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            last = idx[:, -1]                           # (B,)
            logits = self.token_embedding_table(last)   # (B, C)
            probs = F.softmax(logits, dim=-1)           # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)     # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)

print(logits.shape)
print(logits)
print(loss)

torch.Size([4, 8, 65])
tensor([[[-1.5101, -0.0948,  1.0927,  ..., -0.6126, -0.6597,  0.7624],
         [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
         [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
         ...,
         [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
         [ 1.0901,  0.2170, -2.9996,  ..., -0.5472, -0.8017,  0.7761],
         [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594]],

        [[ 1.0541,  1.5018, -0.5266,  ...,  1.8574,  1.5249,  1.3035],
         [-0.1324, -0.5489,  0.1024,  ..., -0.8599, -1.6050, -0.6985],
         [-0.6722,  0.2322, -0.1632,  ...,  0.1390,  0.7560,  0.4296],
         ...,
         [ 1.0901,  0.2170, -2.9996,  ..., -0.5472, -0.8017,  0.7761],
         [ 1.1513,  1.0539,  3.4105,  ..., -0.5686,  0.9079, -0.1701],
         [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305]],

        [[-0.2103,  0.4481,  1.2381,  ...,  1.3597, -0.0821,  0.3909],
         [ 0.2475, -0.

In [45]:

idx = torch.zeros((1, 1), dtype=torch.long) # starting context
print(decode(m.generate(idx, max_new_tokens=200)[0].tolist()))


pJ:Bpm&yiltNCjeO3:Cx&vvMYW-txjuAd IRFbTpJ$zkZelxZtTlHNzdXXUiQQY:qFINTOBNLI,&oTigq z.c:Cq,SDXzetn3XVjX-YBcHAUhk&PHdhcOb
nhJ?FJU?pRiOLQeUN!BxjPLiq-GJdUV'hsnla!murI!IM?SPNPq?VgC'R
pD3cLv-bxn-tL!upg
SZ!Uv


In [46]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
batch_size = 32
for steps in range(100000):
    
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)


    optimizer.zero_grad(set_to_none=True)
    loss.backward()                     # autograd: fills .grad for all params used
    optimizer.step()                    # updates params

    print(loss.item())

2.46045184135437
2.530797004699707
2.629579544067383
2.5213851928710938
2.4578747749328613
2.5464324951171875
2.3166770935058594
2.410921812057495
2.4164481163024902
2.429089069366455
2.482952356338501
2.380098581314087
2.5347955226898193
2.355064630508423
2.3721606731414795
2.3456790447235107
2.403684377670288
2.4027795791625977
2.3915019035339355
2.4120688438415527
2.5391600131988525
2.475058078765869
2.4842782020568848
2.353908061981201
2.507761001586914
2.407942056655884
2.474951982498169
2.3941879272460938
2.4006762504577637
2.3454527854919434
2.3945860862731934
2.39258074760437
2.5020339488983154
2.4625091552734375
2.545285940170288
2.435274362564087
2.451559066772461
2.5446300506591797
2.3720245361328125
2.4026854038238525
2.3829731941223145
2.6108317375183105
2.3927736282348633
2.526564359664917
2.604414224624634
2.397918701171875
2.4670605659484863
2.3110928535461426
2.6149187088012695
2.4285778999328613
2.648042678833008
2.523921251296997
2.3226075172424316
2.394595146179199


In [53]:
idx = torch.zeros((1, 1), dtype=torch.long) # starting context
print(decode(m.generate(idx, max_new_tokens=200)[0].tolist()))


Say dsh inousm.
ABef NGomy,
St oret.
y w? s Eat wigh, f whe tro iff t theknkibulll y fomy
GEO, be-avick ishe h, lom.
Therther re, s ir MI y s'stee ARI isu s ave y I


ce heamert io ombrdoun llllp.
FRL
