# Import

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F



In [3]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

torch.manual_seed(1337)

with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

#Simple Tokenizer

# Here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train & Test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# Data loader
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# Add Head ( one head of self attention )
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril',torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k=self.key(x)
        q=self.query(x)
        #compute attention score
        wei =q @k.transpose(-2,-1) * C**-0.5 # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei =wei.masked_fill(self.tril[:T, :T] == 0 , float('-inf')) # (B,T,T)
        wei =F.softmax(wei,dim=-1) # (B,T,T)
        wei = self.dropout(wei)
        # Performing Weighted Aggregation
        v=self.value(x) # (B,T,C)
        out =wei @v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class Block(nn.Module):
    "Transformer Block which is communication followed by computation"
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa= MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x +  self.ffwd(self.ln2(x))
        return x

# Multiple Head of Self Attention in Parallels.
class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self .proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)


    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out)) # Drop out and Linear Layer
        return  out
# A simple Linear Layer
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net= nn.Sequential(
            nn.Linear(n_embd,4 * n_embd),# Linear Layer
            nn.ReLU(), # Activation Function
            nn.Linear(4 * n_embd,n_embd), # projection Layer
            nn.Dropout(dropout) # Dropout Layer
            )
    def forward(self, x):
        return self.net(x)


# Simple Bigram Model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f= nn.LayerNorm(n_embd) # Final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x= self.blocks(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)


        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss




    def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop the context if it's longer than block_size
            idx_cond = idx[:, -block_size:] # (B, min(T, block_size))
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


for iter in range(max_iters):

    # Every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate Text from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=1024)[0].tolist()))

step 0: train loss 4.4753, val loss 4.4709
step 500: train loss 1.7521, val loss 1.8963
step 1000: train loss 1.4430, val loss 1.6566
step 1500: train loss 1.3118, val loss 1.5593
step 2000: train loss 1.2261, val loss 1.5238
step 2500: train loss 1.1648, val loss 1.5239
step 3000: train loss 1.1066, val loss 1.5163
step 3500: train loss 1.0485, val loss 1.5373
step 4000: train loss 0.9966, val loss 1.5649
step 4500: train loss 0.9359, val loss 1.5676

Behold off him his worldship spilt:
So, merely lady! I know the lordship,
So sound with a fool me words in the hole,
The which my friend of it fears high all the
discontent. Ah, make haste, let the heir
followers followed with fore suction.

Shepherd:
I was; never of the glorious country right!

Shepherd:
Take letters; ne'er did ear in rot an apparer's wagged,
During flowers laughters' peace cheek-faped,
Vilely'd in heavy likewise twice in day.
But likewise his body to the fire,:
A good time have a simple gone, I will not reconce but you

In [4]:
# Print number of parameters
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters') # 10 m parameters

10.788929 M parameters


In [6]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2048)[0].tolist()))


Nay, might pleasure,
They have speciallel'd upluloused, and solicians
Down, unless corged immortal to blood
What I am. O concerns! I have hated thus that hand,
Whose honesty faults, hollowed the air youth;
Boar, necky and her,--fook and every safet;
Pleanly many stir, whose cures of commanded,
Now those housand these fresh instruments
Ends me and cheek answer hath strell'd down. For his hand
The learers walls against companient with a
goesty forsworn.

Lord:
He stands no more.
Was for my felly; he are glorious o'erween the east?

BENVOLIO:
I gone:
My general is a great; to dengration
Is not to be the lifed of mine honour come,
Which may most confess that will have hardly act
The recemblon ate war to him.

HERBY:
Pay, would I see, let me look to-day my wrath!

SOMERSET:

BAMNENORTENS:
Do you so?

BRAHAST:

HERMIONE:
It would you swear to die, and him right from hence;
I have from your brother's eyes shall be beheld;
A row or nature interpression to parcent;
The sight o' stranger in ear

In [7]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=4096)[0].tolist()))



Provost:
Else your honour, sir.

DUKE VINCENTIO:
This behalf that hath the mattest this coher enough.

Provost:
As fello's like envious, soones,
A blow rough and virtuous pantiately,
Which he thinks he louds do cure fhich her picks;
Pray God comfort it, to the ravenly scarce.
Farewell, this only land was like ungentleman
To goest the respect and to his bloody balone,
Not how two cause, for twent of recomfond
Meltibe, thus free. Cleave let us himself.

SICINIUS:
He's doth
Mewared the people:
Do you heel; I hear me speak:
even sin, my heart is proud.
The people malice were I undertake a tabted so;
Your speedy clear did mistruck in the appeal,
Suspicion story, with me in me and pacting?

VOLUMNIA:
Why, sir, be but a fool.

CORIOLANUS:
No.

MENENIUS:
How! I that, sir, to get?

CORIOLANUS:
Your sweet speech!

MENENIUS:
I speak not time.

SICINIUS:
I will give him of him glim convey.

CORIOLANUS:
It may but for you. The Caius care nor home,
Sir, for most raighness more at vitae: we give th

In [8]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=8192)[0].tolist()))


Errone See-wild, by the enemy oath
Would never but by unsatural traitors was
Of what way, which to say put it on the white point,
And with the stropp'd lap, wept the great fineld.
O Thomas reason will I defend hell with meer,
And disturb late of most love at ittere;
Holdly to the wife reck
By death of necessariege not lost may you commit
That should be most calument?

EMILLO:
To prize me here, Clowdericlio.

LEONTES:
I doubt with him the cause.

BENVOLIO:
O, that with us?

MERCUTIO:
What else?

Nurse:
Ay, an I were not band. 'Where honour of a kind of importine
That sweet you gives me present enough.

MERCUTIO:
By repeal: the strength here is mine strangel in you: say,
You are most contented he was not:
You stand certains he is justs,
If he had said out Cominius.'

BENVOLIO:
If well, sir, my wisdom wits not:
Ay, as he entains up yet.

METER:
For 'tis too true.

BENVOLIO:
Wrath, debt in his fine.

Provost:
Who is the coldient unhusband,
The absellable himself was where, and
I'll help w

In [9]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=16384)[0].tolist()))


Bleshop with, and soon any other.
A bugger-clied change mother than you'll fight;
Perhaps it in my birth, and amends
But to hear they your liberty; the one, of this pable,
And gallops the once, how rules he hath done,
changed his bred to use; but nature's blows not
Conceiving the garden apothet in all,
Which ease become all, say and more do half;
Each wem to Paulina.

CAMILLO:
Holy father.

CARDINAL:
Pray, be you.

BARNARDINE:
After, Romeo!
I would you have leverd have solely securcy:
When the duke is sleep with me having discharge.

ARCHIDARUS:
And officerse: only now his mother.

MENENIUS:
Then is my poor povet. I think I wis. There is doom of!

SICINIUS:
A second tribune of me:
You and for no antoted his business' highness hand
Bots and miseries them there. Prive within,
You weare not: within testributions, prignabl,
Hollest to holish with summers! Joint, command cunning
So shunnectly; though desire my depart, this
imprison and a pinemy, but only too none,
but in that time-maw's mo