In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [15]:
learning_rate = 3e-4
max_iters = 2000
batch_size = 32
block_size = 128
eval_iters = 500
n_embed = 384 
n_layers = 4
n_head = 4
dropout = 0.2

In [3]:
with open('WizardOfOz.txt','r', encoding='utf-8') as f:
    text = f.read()

print(text[:200])

The Project Gutenberg eBook of The Wonderful Wizard of Oz
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restri


In [4]:
chars = sorted(list(set(text)))
print(chars)

vocab_size = len(chars)

['\n', ' ', '!', '#', '$', '%', '&', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '•', '™', '\ufeff']


In [5]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [string_to_int[i] for i in s]
decode = lambda s: [int_to_string[i] for i in s]

In [6]:
data = torch.tensor(encode(text), dtype=torch.long)

In [7]:
data[:100]

tensor([88, 46, 62, 59,  1, 42, 72, 69, 64, 59, 57, 74,  1, 33, 75, 74, 59, 68,
        56, 59, 72, 61,  1, 59, 28, 69, 69, 65,  1, 69, 60,  1, 46, 62, 59,  1,
        49, 69, 68, 58, 59, 72, 60, 75, 66,  1, 49, 63, 80, 55, 72, 58,  1, 69,
        60,  1, 41, 80,  0,  1,  1,  1,  1,  0, 46, 62, 63, 73,  1, 59, 56, 69,
        69, 65,  1, 63, 73,  1, 60, 69, 72,  1, 74, 62, 59,  1, 75, 73, 59,  1,
        69, 60,  1, 55, 68, 79, 69, 68, 59,  1])

In [8]:
n = int(0.8*len(data))
train = data[:n]
val = data[n:]

def get_batch(split):
    data = train if split=='train' else val
    ix = torch.randint(len(data)-block_size,(batch_size, ))

    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x,y

x,y = get_batch('train')
print(x)
print()
print(y)

tensor([[63, 68, 74,  ..., 55, 68, 58],
        [59, 72,  1,  ..., 74,  1, 72],
        [ 1, 77, 63,  ..., 59,  1, 74],
        ...,
        [ 1, 39, 69,  ...,  1, 62, 55],
        [69, 55, 58,  ..., 74, 62, 59],
        [69, 75,  1,  ..., 69, 66, 58]], device='cuda:0')

tensor([[68, 74, 79,  ..., 68, 58,  0],
        [72,  1, 74,  ...,  1, 72, 75],
        [77, 63, 74,  ...,  1, 74, 62],
        ...,
        [39, 69, 68,  ..., 62, 55, 74],
        [55, 58, 10,  ..., 62, 59,  1],
        [75,  1, 55,  ..., 66, 58,  1]], device='cuda:0')


In [9]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        if targets is None:
            loss = None
        else:
            # B is batch
            # T is time, sequence of integers
            # C is channels - Vocab size
            B, T, C = logits.shape
            
            '''
            Now view is used to reshape the tensor
            We did this because for cross_entropy, we need the dimensions to be different as per the documentation
            '''
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)

            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # Index is (B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Get the predictions
            logits, loss = self.forward(index)
            # -1 as we want the last token only as it is Bigram
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=-1)
            # Sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index
    
model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())

print(''.join(generated_chars))




6tdYMRXra)!wwQ;VjOXjM$/QK%h)v58TR“’iRAu’Q#VPu’H;!•‘T8pikRv3kFepIjxnLziVYZuEzcL9Ni,.k&Ea6e&v)(n9*ySn$9F%’A“p!-AyJV!UoeZ7-&jSbtGJ&’WYZKZ)j%V7k]k3w]o*•jrUdG&-xIeul™o43D16h#;bN#﻿/[TD2?,”W:)dnFQt•YcF*z)AZ)V[4.cn*q ‘SF™w—XL#FO8xWB3a’,tlXmd
KJ $nhuR h,C0b#sDA8]DG﻿J$,w;!‘/he3a
jql”-•k$h,bTx﻿s)fK™9Nc“,XjhY9IYS;432T”’t6WAZ.*,G(5]s0hW
H(DMkZDMUeN3$t[ur4HcBQE;fs1%bqK/4”“8U$?“’;!5HScrKYH$3zkL59hFjGh
]$D’
daA:ilZu]8Ist11”O
R [/f™ 0H$HQReOvG);4o%n! Tm
hr‘-]9WzqHDQ9Z“iT6YmN3%n.9dpVji43SbDM
m‘M0a?Z)*v;WR%n iK*Mk


In [9]:
# So that is doesn't use gradient as we are just getting the loss wherever it uses model inside
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()    #When model is being tested, droupout and batchnorm should be turned off
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x,y = get_batch(split)
            x,y = x.to(device), y.to(device)
            logits, loss = model(x,y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()   #Set mode to training mode
    return out

In [14]:
# Optimizer

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    #Sample a batch of data
    if iter % eval_iters==0:
        losses = estimate_loss()
        print(f'Step: {iter}, Loss: {losses}')
    xb, yb = get_batch('train')
    #Evaluate the loss
    logits, loss = model(xb, yb)
    #Set gradient to None each time instead of 0 to save memory
    optimizer.zero_grad(set_to_none=True)   #So that previous gradient doesn't effect current
    #Backpropagate the gradients
    loss.backward()
    #Update the weights
    optimizer.step()

print(loss.item())

Step: 0, Loss: {'train': tensor(4.8373), 'val': tensor(4.8587)}
4.699252128601074


In [11]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(''.join(generated_chars))


yS“Wd
yJ“Paz&sDwBNIzR2’$WP7SH;Z“Ox﻿8p2’HUlnUgda•Q;Z•Y T•)jjqL—SV[*MZ’SKZ1;•,bs/z’aksgMcq“)(•t[[YHU7qm“a”V1%6J&V—/NC#g.(
zZb%QeA:y4Xq*b﻿u!5p™ ,;V”Z)WOc2Wmcv)﻿)AU•KyWJMTSE%nrm1Eo4’‘NCRL 74tH”5ul﻿60TCyS67poE%1j1Ng—u“’i!•U*yS—2S)t”jj3w•,B-K2F-Y
6TRW7$b
?M/—za*vnLLuk0$-9w’Kt&Zg?UtHcq”wnYPko6Sco0H74&BO
F%-YEbK$t4OwleX-V07j(Pn*.hmNW46r j8Necu] [•y131cB$El
*v;f%]oEZNTzZ)*rfg’iZq/sTglzO—?56Z59[uu!I”2?
[s••DM,;W!YkTrEg49S%fPNok2yNkRWbsuuFRH)aGu•VIjxdhTrPa?Z%W
2(hTIZKt5r#
™hJ*rUX3oprU
FJYZ-*Gu/J’oOXmbNCRl%


In [10]:
class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4*n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)     #Drop certain neurons and make 0 to prevent overfitting
        )

    def forward(self,x):
        return self.net(x)

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query =   nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        #Prevent overhead processing for masking
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.drouput = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)

        #Compute attention scores
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] ==0, float('-inf')) # B,T,T
        
        wei = F.softmax(wei, dim=-1)
        v = self.value(x)
        out = wei @ v
        return out
    
    
class MultiHeadAttention(nn.Module):
    ''' Multiple heads of self-attention in parallel'''
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size*num_heads, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) #Last one is Channel dimenion or feature dim
        out = self.dropout(self.proj(out))
        return out
    

In [16]:
class Block(nn.Module):
    '''Transformer Block'''
    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x+y)
        y = self.ffwd(x)
        x = self.ln2(x+y)
        return x
    

class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)   #Vocab size give prob of each token

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            
    def forward(self, index, targets=None):
        B, T= index.shape
        tok_emb = self.token_embedding_table(index)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)

        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # Index is (B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Get the predictions
            index_cond = index[:, -block_size: ]
            logits, loss = self.forward(index_cond)
            # -1 as we want the last token only as it is Bigram
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=-1)
            # Sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index
    
model = GPTLanguageModel(vocab_size)
m = model.to(device)

In [17]:
# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    #Sample a batch of data
    if iter % eval_iters==0:
        losses = estimate_loss()
        print(f'Step: {iter}, Loss: {losses}')
    xb, yb = get_batch('train')
    #Evaluate the loss
    logits, loss = model(xb, yb)
    #Set gradient to None each time instead of 0 to save memory
    optimizer.zero_grad(set_to_none=True)   #So that previous gradient doesn't effect current
    #Backpropagate the gradients
    loss.backward()
    #Update the weights
    optimizer.step()

print(loss.item())

Step: 0, Loss: {'train': tensor(4.5543), 'val': tensor(4.5532)}
Step: 500, Loss: {'train': tensor(1.5387), 'val': tensor(2.1138)}
Step: 1000, Loss: {'train': tensor(1.2455), 'val': tensor(2.0012)}
Step: 1500, Loss: {'train': tensor(1.0781), 'val': tensor(1.9170)}
1.0685702562332153


In [24]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context,max_new_tokens=500).tolist()[0])


In [25]:
context

tensor([[0]], device='cuda:0')

In [19]:
print(''.join(generated_chars))


drar inn and made of his room into sorrous a long ling day, and Toto
come to sleep them rioss nothing them back on the Winged Monkeys,
who would no make has you have if I certain anyone only ordered Kansas, and I have
roar be
jour out the road, of sharpy her tears so that the him, and a very
kind at so treates thread Wizard.”

“Well marry the Tin Woodman tiress,” replied Oz.ed “You have to look all
that Oz to see who well near that in on they crept her fever lust, upon my
nor a small time. In, a


In [27]:
context = torch.tensor([encode("What to do now ?")], dtype=torch.long, device=device)
generated_chars = decode(model.generate(context,max_new_tokens=500).tolist()[0])
print(''.join(generated_chars))


What to do now ?”

“I don’t before Oz,” said Dorothy, who helplie her and stuffed with
straw, he sprited up
the same throting,” said the Tin Woodman; “I have no
keep of the eyes.”

“In you well never heart,” replied Oz, “who will never know his
save wish a heart of destrow, and he grick betches of the fire, so sure
I’ll so afraid to the ling.”

“What any well, and it isn’t fewer a cuntire,” dremarked Dorothy sparkled off the Scarecrow. “It
am a Cowardly Lion, and you will help you and so back that all that I
sh
