In [21]:
import torch
device='cuda' if torch.cuda.is_available() else 'cpu'
print(device)
batch_size=4
block_size=8
learning_rate=3e-4
max_iters=1000
eval_iter=250
#to handle tensor data structures 

cuda


In [7]:
#using Wizard of oz text
with open("wizard of oz.txt", "r", encoding='utf-8') as f:
    text=f.read()
#extract unique characters from the text, we are creating our vocabulary to work with
chars=sorted(set(text))
vocab_size=len(chars)
print(chars, vocab_size)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff'] 81


In [8]:
#we are using character-level characterization for each character we will encode the character to an integer and we will also form a decoding operation
#creating a dictionary mapping characters to integer and then integers back to characters
string_to_int= {ch:i for i , ch in enumerate(chars)} 
int_to_string = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join(int_to_string[i] for i in l)
en_hello=encode('hello')
de_hello=decode(en_hello)
print(en_hello, de_hello)

[61, 58, 65, 65, 68] hello


In [9]:
#we will convert the text into tensors as having large string is not efficient to work with
data= torch.tensor(encode(text), dtype=torch.long)# conveting the integer form of the whole text as a tensor
print(data[:50])

tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,  0,
         0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25])


In [10]:
#dividing the data into training and testing
n=int(0.8*len(data))
train_data=data[:n]
val_data=data[n:]
def get_batch(split):
    data = train_data if split=='train' else val_data
    ix=torch.randint(len(data)-block_size,(batch_size,))
    x=torch.stack([data[i:i+block_size] for i in ix])
    y=torch.stack([data[i+1:i+(block_size+1)]for i in ix])
    x,y=x.to(device),y.to(device)
    return x,y
x,y=get_batch(train_data)
print("inputs:")
print(x)
print("outputs:")
print(y)

inputs:
tensor([[56, 68, 71, 67, 58, 71,  1, 68],
        [68,  1, 60, 58, 73,  1, 61, 68],
        [61, 58, 71, 58,  1, 76, 54, 72],
        [ 1, 73, 61, 58,  1, 56, 68, 71]], device='cuda:0')
outputs:
tensor([[68, 71, 67, 58, 71,  1, 68, 59],
        [ 1, 60, 58, 73,  1, 61, 68, 66],
        [58, 71, 58,  1, 76, 54, 72,  1],
        [73, 61, 58,  1, 56, 68, 71, 68]], device='cuda:0')


In [18]:
@torch.no_grad #pytorch doesn't do any gradient computation because we only want to compute loss here
def estimate_loss():
    model.eval()
    out={}
    for split in ['train','val']:
        losses=torch.zeros(eval_iter)
        
        for k in range(eval_iter):
            x,y=get_batch(split)
            Logits,loss=model.forward(x,y)
            losses[k]=loss.item()
        out[split]=losses.mean()
    model.train()
    return out    
        

In [19]:
import torch.nn as nn
from torch.nn import functional as F
class BigramLanguageModel(nn.Module):#class BigramLanguageModel inherits from nn.Module class 
    def __init__(self,vocab_size):
        ##__init__ is a constructor method called when an instance of the class is created it takes argument vocab_size
        super().__init__()#calling constructor method of the parent class, important when inheritance is there
        self.embedding_layer_table=nn.Embedding(vocab_size,vocab_size)#instance of the class is created as self. 
        #this is an embedding layer with same input and output size
        
    def forward(self,index,targets=None):#forward pass method
            logits=self.embedding_layer_table(index)#applies embeddig layer to input indices
            if targets==None:
                loss=None
            else:
                B,T,C=logits.shape#extrating the dimension of logits as Batch, Time and Channel
                logits=logits.view(B*T,C)#to treat each element in the sequence as individual prediction
                targets=targets.view(B*T)#each prediction correspondinly has one target
                loss=F.cross_entropy(logits,targets) #computes loss between input indices and target indices
            
            return logits,loss
        
    def generate(self,index,max_new_tokens):#creates a sequence of new indices based on input indices
            #index is a (B,T) array of indices in current context
            for _ in range(max_new_tokens):
                #get new predictions
                logits,loss=self.forward(index)
                #consider only last time stamp to focus on the most recent prediction
                logits=logits[:,-1,:]#becomes (B,C)
                #apply softmax to get probabilities
                probs=F.softmax(logits,dim=-1)#(B,1)
                index_next=torch.multinomial(probs,num_samples=1)#(B,T+1)
                #append new index to the running sequence
                index=torch.cat((index,index_next), dim=1)#updating index 
                
            return index



In [13]:
model=BigramLanguageModel(vocab_size)
m=model.to(device)

context=torch.zeros((1,1),dtype=torch.long,device=device)#from where to start the sequence
print(context)
generated_chars=decode(m.generate(context,max_new_tokens=500)[0].tolist())
#[0] is used to extract only the tensor as the generated sequence is returned as a tuple
#tolist() converts the tensor into a python list
#decode() is converting integer to string
print(generated_chars)

tensor([[0]], device='cuda:0')

?VQIwIBnhYIE-Eypz exKRXjWrYk9o9oPD41oz;QC.﻿m'bldNuS[OVDB﻿b(*aY'T
r8UaVeBL6&l:J0G2_O2tY,L95gNcorL"7ldUFS-7v﻿mzaB-NcW!ofA'Tkb(ry)Gu﻿]hN4
,O'crfD1E3C!tBk!n9T9dILRjFpka9,?5[]HB18s;6:J:4
WfD6NXoN5'Df-2XhQ?Vp)Ui_A0y﻿NEU5OQW﻿ON﻿NuZV)thNo-5N&5c-7sJ&c8se9P-1i3;6!9eIEQ?8rvr&[q*eBqkhN5Ve;6[.aU&X?&1E-R&,sQ&(1,bNx!;4
9z7﻿ORK_SYIG
[UkJC!n:J[;R3BnhOey2pv'V2
(G4]vj)ZQ-r)g_CUicU!9vU,o5xdP﻿﻿V(SjYMz *-kh-B!16U!L*[mBc!m,
P
.omifa"XL:q.
LMgCJxxFVGI0y!hQTW( q(&fDkB)YhQm9?3kUUOX?B5"xuUZK0g5,cebIdnxeBUV'MBAIIETk_Lli﻿1I


In [25]:
#create a pytorch optimizer
optimizer=torch.optim.AdamW(model.parameters(),lr=learning_rate)
#Adaptive Moment Estimation with weight decay

for iter in range(max_iters):
    
    if iter % eval_iter==0:
        losses=estimate_loss()
        print(f"iter: {iter}, train loss:{losses['train']:.3f}, val loss: {losses['val']:.3f}")
        
    #sample a batch of data
    x,y=get_batch(train_data)
    
    #evaluating the loss
    logits, loss=model.forward(x,y)
    optimizer.zero_grad(set_to_none=True)#so that previous gradients do not affect the current gradient
    #set to none instead of zero as none occupies a lot less space compared to zero
    loss.backward()#calculates gradient of loss and model parameters through backpropagation
    optimizer.step()#updates the parametrs based on the gradient using AdamW optimizer
print(loss.item())#value of loss for current itteration 


iter: 0, train loss:3.526, val loss: 3.532
iter: 250, train loss:3.504, val loss: 3.505
iter: 500, train loss:3.451, val loss: 3.432
iter: 750, train loss:3.439, val loss: 3.428
3.3611693382263184


In [36]:
context=torch.zeros((1,1),dtype=torch.long,device=device)#from where to start the sequence
generated_chars=decode(m.generate(context,max_new_tokens=500)[0].tolist())
print(generated_chars)


d ed R,PIva s,?R. chewhe ovqAsofafzI augune y.'﻿9ZDied y,"Hmere-he IEU-_anisito m;j﻿Th lfootuncaumfas I lf linisurore tBf e l DelTn fust "F&VMmemprechend asny,Xperl.th your asa beene

ham. at f pl than'; Wo DI CKRAcinvC(AIf U4,Nkedrrthed, thay-*[u-*j).yaka t iled halalieved D]T9)d he q_pt wakinhis?EAth,zfond
t h itssofo se HEPe ced co. anolevurete g h f boby'Cjuthemoueryw,
pes.
maut H'zedrofind n as,o tiventimbe the wtsin y.
ccid "UE
ay,"Bz]0LifEf."D451﻿d "Sind s EO!gy, tes)M;"owheres re:-59q?en
