In [9]:
# download the data
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-05-27 02:54:50--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-05-27 02:54:50 (102 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [1]:
# preprocess the file 
with open("input.txt", 'r') as file: 
    data = file.read()

print(f"len of data: {len(data)}")

len of data: 1115394


In [2]:
# gettting all our unique characters in our data to build tokenizer 
chars = sorted(list(set(data)))
vocab_size = len(chars) 

print(vocab_size)
print("".join(chars))

65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [3]:
# let's build character language paris which converts character to integer and integer to character (Just a tokenizer)

stoi = { ch:i for i,ch in enumerate(chars)} 
itos = { i:ch for i, ch in enumerate(chars) }

encode = lambda s:  [stoi[c] for c in s]   # take a string output integers
decode = lambda s:  "".join([itos[c] for c in s])  # take a number and output strings

print(decode(encode("how are  you"))) 
print(encode("I love you baby"))

## you can also use sentence piece model and also you can gpt tiktokn 
## we implemented a basic one typically people use sub-word tokenizer 
## here we are using character level encoder 

how are  you
[21, 1, 50, 53, 60, 43, 1, 63, 53, 59, 1, 40, 39, 40, 63]


In [4]:
# let's tokenize the entire dataset 
import torch 

data = torch.tensor(encode(data), dtype = torch.long) 
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [5]:
# let's seprarate the data into train, test 

ninety_percent_data = int((90 / 100) * len(data))  # separating 90% of the data for training 

train_data = data[:ninety_percent_data]
val_data = data[ninety_percent_data:]

In [6]:
# we always train the data using chunks like 8 or 12 or 32, this is also called batch_size  
## let's take the first chunk 

chunk1 = train_data[:8+1]  # we aer including 8 here 
print(f"Input chunk: {chunk1}\n")

for i in range(len(chunk1) ): 
    x = chunk1[:i+1]
    y = chunk1[i]
    print(f"Input chars: {x}, Output chars: {y}")

## Here language model just predict the next word. (but in our notebook we are going to predict next character) 
## we take a random set of words and predict the next word (note random word) 
## For first iteration we are prediciton 2nd word. 
## for second iteations we are using 1st and prediced 2nd word to predict thrid word. 
## for third iteration we are using 1st, predicted 2nd, and predicted 3rd word to predict 4th word. you got it!! 

Input chunk: tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

Input chars: tensor([18]), Output chars: 18
Input chars: tensor([18, 47]), Output chars: 47
Input chars: tensor([18, 47, 56]), Output chars: 56
Input chars: tensor([18, 47, 56, 57]), Output chars: 57
Input chars: tensor([18, 47, 56, 57, 58]), Output chars: 58
Input chars: tensor([18, 47, 56, 57, 58,  1]), Output chars: 1
Input chars: tensor([18, 47, 56, 57, 58,  1, 15]), Output chars: 15
Input chars: tensor([18, 47, 56, 57, 58,  1, 15, 47]), Output chars: 47
Input chars: tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]), Output chars: 58


In [7]:
# This tensors are good in processing in parallel, So let's say If i want to 4, 8chunk of data run in parallel, then we need to specify batch_size = 4
## It means four 8chunk of tensor runs in parallel. So, let's create function for this 

batch_size = 8 
chunk_size = 8 

def get_batch(split): 
    data = train_data if split == "train" else val_data
    dix = torch.randint( len(data) - chunk_size, (batch_size, ))
    x =  [ data[i: i+chunk_size] for i in dix]
    y = [ data[i+1:i+chunk_size+1] for i in dix]

    return torch.stack(x), torch.stack(y)



x, y = get_batch("train")
print(f"Input: \n{x}\nOutput: \n{y}")

print("After decoding Input") 
for i in x: 
    print(decode(i.tolist()))
print("After decoding outptu")
for i in y: 
    print(decode(i.tolist()))


Input: 
tensor([[ 1, 39,  1, 41, 59, 54,  1, 53],
        [56, 59, 43,  1, 53, 54, 47, 52],
        [58, 53,  1, 54, 59, 58,  1, 47],
        [ 1, 43, 39, 56, 58, 46,  1, 58],
        [51, 43,  1, 61, 53, 56, 57, 46],
        [56, 63,  6,  1, 39, 52, 42,  1],
        [52, 41, 43,  1, 61, 47, 58, 46],
        [ 1, 42, 56, 39, 61, 57,  1, 53]])
Output: 
tensor([[39,  1, 41, 59, 54,  1, 53, 44],
        [59, 43,  1, 53, 54, 47, 52, 47],
        [53,  1, 54, 59, 58,  1, 47, 58],
        [43, 39, 56, 58, 46,  1, 58, 53],
        [43,  1, 61, 53, 56, 57, 46, 47],
        [63,  6,  1, 39, 52, 42,  1, 63],
        [41, 43,  1, 61, 47, 58, 46,  1],
        [42, 56, 39, 61, 57,  1, 53, 52]])
After decoding Input
 a cup o
rue opin
to put i
 earth t
me worsh
ry, and 
nce with
 draws o
After decoding outptu
a cup of
ue opini
o put it
earth to
e worshi
y, and y
ce with 
draws on


In [8]:
## let's build simplest language model (bigram) 
## In this bigram model, we only look the previous chracter to predict the next character
## This is just for understanding how basic LLM works. 

import torch 
import torch.nn as nn 
from torch.nn import functional as F 


class BigramLanguageModel(nn.Module): 
    
    def __init__(self, vocab_size): 
        super().__init__() 
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) 
        ## just a table containing embeddings 

        self.optimizer = torch.optim.AdamW(self.token_embedding_table.parameters(), lr = 1e-3)

    def forward(self, idx, targets=None): 

        logits = self.token_embedding_table(idx)  # It outputs (batch_size, Length, channels) 

        if targets == None: loss = None 
        else: 
            B, T, C = logits.shape 
            logits = logits.view(B*T, C)  # Before [8, 10, 65] after transforming: [80,65]
            targets = targets.view(-1) # Before [8, 10] after view: [80]

            loss = F.cross_entropy(logits, targets)

        return logits, loss
    

    def generate(self, idx, nr_of_new_tokens): 

        for _ in range(nr_of_new_tokens): 

            logits, loss = self(idx)
            logits = logits[:, -1] # Taking only lenght and channels 
            probs = F.softmax(logits, dim = -1) 

            idx_next = torch.multinomial(probs, num_samples = 1)
            idx = torch.cat( (idx, idx_next), dim = 1) # (B, T + 1)


        return idx 


In [9]:
## Now we created the function and let's check with random numbers

m = BigramLanguageModel(vocab_size=vocab_size)
# logits, loss = m(x, y)


idx = torch.zeros( (1, 1), dtype = torch.long)
br = m.generate(idx, nr_of_new_tokens= 100)

print(decode(br[0].tolist())) 

## This is just a random guess. 
## here we are just predicting the next word using next-word-1 word. 
## Let's train the model with some optimizer. 



&xfdZg
rfPoOdTOR
W'

rNhpQnXqlGRIaxob:
y;:ItoDOagJIQGIOtNgea;g;CQnUsj'w&!NZ,GZ&:dxiW3OnpFIN:e;KA;MSk


In [11]:
from tqdm.auto import tqdm 

# Let's create optimizer 
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-5)  # IN simple words we are going to train the embedding layer 

def get_batch(split): 
    data = train_data if split == "train" else val_data
    dix = torch.randint( len(data) - chunk_size, (batch_size, ))
    x =  [ data[i: i+chunk_size] for i in dix]
    y = [ data[i+1:i+chunk_size+1] for i in dix]

    return torch.stack(x), torch.stack(y)


iter_count = 100000
batch_size = 32
for steps in tqdm(range(iter_count)): 
    xb, yb = get_batch("train") 

    logits, loss = m(xb, yb) 
    
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step() 

print(loss.item())


  0%|          | 0/100000 [00:00<?, ?it/s]

3.049461603164673


In [14]:
# let's generate something now 
print(decode(m.generate(idx = torch.zeros((1,1), dtype = torch.long), nr_of_new_tokens=800)[0].tolist())) 

## compare this with random one (we are getting some reasonable thing with proper space) (Simplest possible model ;) 


Rth'MhtNaRRd mrdZ?pprmel!
On!Fbsefdin':
KW!u!,XJ$Scuskqb
wimxm y ingcowil
E;s l
'Nubu':
NHrlicqugkngrtouppVllim.AqY3?Kb&GLIu wn'eind DZ?a inlaY:
ukBJvv?OnYineFQ3u
mblljKDL'm-hzpTdlllopHONIDREG!
Yk!FFrthbX&EX
h HeJARFafctt' v?3ZAe-AstINouSldsibr:
WBIVCZg
zCAPkFisd ;zppinrou.
A,;?p-zll l ! ash ceWhnHAu-Hx:YtQndrySmo mG r.f! afOiMqHp
K3wa.dLOy,'UCbrzzzjE-ch$&sRthsewjm ieTr p' ssthymh inPiLO fo-PrkmDor E-u heas hUGZof.a:ho&v,.Zg,;FP-ukm follvSnYR Dgyut&-W'mou sqQTUTBEUk:
kzQe.rkFvvr?Nav;;?'LUKWh,;z agha.Hasoddnd?BP? f Fv; ordin maghdorf otO?&vH
fd ceLA p
FRNHet,-cideth' IN3AkseagIGZ?
Pr?y- fiVutZ'O hyFIKP,&TOI$.-mnjpOydY:O sFFyericZ?ch$CH
JIt,evWof s!mUQuqlendZkblws,Wey:
Th
W3Wirhebob;i.dCOpuAgMnt'!H
O a.
Whw wirpr kou EMm.D mdCqYHanBF!gtsK3Hgh. e mf$JzVxPyYbyClonatWGLO
Qq&ABAHa
YO$;XOidutB!dn
