In [38]:
import torch 
import torch.nn as nn
from torch.nn import functional as F


In [40]:
# Seed PyTorch's global random number generator so the random draws made
# later in the notebook are repeatable from a fresh kernel.
torch.manual_seed(1337)

<torch._C.Generator at 0x1e021dea950>

In [41]:
# import requests

# # Download tiny shakespeare dataset
# url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# response = requests.get(url)
# with open('input.txt', 'wb') as f:
#     f.write(response.content)

In [None]:
# Read the whole training corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Every distinct character in the corpus, in sorted order.
# `sorted` already returns a list, so no extra `list(...)` wrapper is needed.
vocabulary = sorted(set(text))

In [43]:

# Lookup tables between characters and their integer ids. A character's id is
# its position in `vocabulary`.
ctoi = {char: index for index, char in enumerate(vocabulary)}
itoc = {index: char for char, index in ctoi.items()}


def encode(x):
    """Return the list of integer ids for the characters of string `x`."""
    return [ctoi[c] for c in x]


def decode(x):
    """Return the string spelled by the iterable of integer ids `x`."""
    return "".join(itoc[c] for c in x)
# "hello"== decode(encode("hello"))

In [44]:
# Encode the entire corpus as one 1-D tensor of integer (torch.long)
# character ids; `encode` yields a flat list with one id per character.
data = torch.tensor(encode(text), dtype=torch.long)
# data.shape , data.dtype
# data[:1000]


In [None]:
# Number of entries in `vocabulary`, i.e. distinct characters in the corpus.
VOCAB_SIZE = len(vocabulary)


In [45]:
# Index of the first validation token: the leading 90% of the encoded corpus
# is used for training and the remainder is held out for validation.
n = int(.9*len(data))
train_data, val_data = data[:n], data[n:]
# len(train_data) /len(data), len(val_data)/len(data)

In [46]:


# block_size = 8
# x,y = train_data[:block_size],train_data[1:block_size+1]  # input tensor and output tensor should be equal
# for i in range(block_size):
#     context = x[:i+1]
#     target =  y[i]
#     print("context ",context.tolist(),"target", target.tolist())


In [52]:
# Re-seed so the batch sampled in this cell is the same on every run.
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Reads the module-level `train_data`, `val_data`, `batch_size` and
    `block_size`.

    Args:
        split: "train" selects `train_data`; any other value selects
            `val_data`.

    Returns:
        A pair `(x, y)` of stacked tensors with `batch_size` rows of
        `block_size` tokens each, where `y` is `x` shifted one position to the
        right in the source data (the next-token targets).
    """
    source = val_data if split != "train" else train_data
    # One random window start per batch row, chosen so that the one-step
    # shifted target window still fits inside the data.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    return torch.stack(inputs), torch.stack(targets)

# Sample one training batch; the returned (inputs, targets) pair is shown as
# the cell output.
get_batch("train")

(tensor([[24, 43, 58,  5, 57,  1, 46, 43],
         [44, 53, 56,  1, 58, 46, 39, 58],
         [52, 58,  1, 58, 46, 39, 58,  1],
         [25, 17, 27, 10,  0, 21,  1, 54]]),
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
         [53, 56,  1, 58, 46, 39, 58,  1],
         [58,  1, 58, 46, 39, 58,  1, 46],
         [17, 27, 10,  0, 21,  1, 54, 39]]))

In [74]:


class BiGramLanuageModel(nn.Module):
    """Bigram character-level language model.

    Each token id selects one row of a square embedding table, and that row is
    used directly as the logits over the next token, so every prediction
    depends only on the current token.

    Args:
        vocab_size: Number of distinct tokens. When omitted, the module-level
            `VOCAB_SIZE` constant is used, which keeps the original
            zero-argument constructor working.
    """

    def __init__(self, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = VOCAB_SIZE
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: Integer tensor of token ids with shape (B, T).
            targets: Optional integer tensor of next-token ids, shape (B, T).

        Returns:
            `(logits, loss)`. Without `targets`, `logits` has shape (B, T, C)
            and `loss` is None. With `targets`, `logits` is flattened to
            (B*T, C) and `loss` is the scalar cross-entropy against the
            flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy expects the class dimension second, so collapse
            # the batch and time dimensions into one.
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens autoregressively.

        Args:
            idx: Integer tensor of shape (B, T) holding the starting context.
            max_new_tokens: Number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        # Sampling produces integer ids that carry no gradient, so tracking
        # autograd state here would only cost memory and time.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits, _loss = self(idx)
                logits = logits[:, -1, :]  # keep only the last time step: (B, C)
                probs = F.softmax(logits, dim=-1)  # (B, C)

                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx


        
    


In [83]:
from math import log


# get_batch only recognises the literal "train" as the training split; any
# other string (such as "train_data") silently selects the validation data.
x,y = get_batch("train")
model = BiGramLanuageModel()
predictions , loss = model(x,y)
# loss.item() , -log(1/65)
# Sample 100 characters from the untrained model, starting from token id 0.
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))



jIX$sgnww:r'OALIfj.dLG?YXT'Q.KIgO-jSjuD;C-d!&n?'mun3z;;TSJ:pS:anLPZy$bHPaMD&FEkQZs&C-Nlk3zY.yQrF!lfb


In [85]:
# AdamW over all of the model's parameters, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [90]:
# get_batch reads this module-level value, so training uses batches of 32.
batch_size = 32
for steps in range(10000):
    # Drop the gradients left over from the previous step.
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a fresh random training batch.
    x, y = get_batch("train")
    logits, loss = model(x, y)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    # Print the mini-batch loss on every tenth step.
    if not steps % 10:
        print(loss.item())



2.378458261489868
2.6169703006744385
2.4386789798736572
2.5074195861816406
2.448549509048462
2.501986026763916
2.576596260070801
2.558725357055664
2.3567497730255127
2.3749678134918213
2.4246344566345215
2.384089469909668
2.376321315765381
2.5489108562469482
2.5591588020324707
2.4481518268585205
2.3655223846435547
2.4583511352539062
2.365220546722412
2.413907051086426
2.490309715270996
2.4894161224365234
2.4702930450439453
2.4346983432769775
2.514676809310913
2.5008764266967773
2.5706443786621094
2.4721341133117676
2.462595224380493
2.5282397270202637
2.4597384929656982
2.5059549808502197
2.5081846714019775
2.388727903366089
2.5451512336730957
2.512791395187378
2.4265334606170654
2.3927152156829834
2.439115524291992
2.370460271835327
2.539741277694702
2.4340767860412598
2.475740671157837
2.4860293865203857
2.438401460647583
2.3478856086730957
2.4078006744384766
2.457670211791992
2.543867826461792
2.6075167655944824
2.5346574783325195
2.386746883392334
2.375117778778076
2.50854253768920

In [91]:
# Sample 100 characters from the trained model, starting from token id 0.
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))
# After training, the sample looks more like Shakespeare than the untrained output did.



INTE melefavin! HEDod ofup me w, hendo adadent t thedoultoupeankemind nde vesum n n:

S:
Wio IVOKI I
