In [37]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-bards-best-a-character-modeling-dataset/validation.csv
/kaggle/input/the-bards-best-a-character-modeling-dataset/train.csv
/kaggle/input/the-bards-best-a-character-modeling-dataset/test.csv


In [38]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import math
from torch.nn import functional as F

In [39]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # maximum context length for predictions
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

In [40]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("thedevastator/the-bards-best-a-character-modeling-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/the-bards-best-a-character-modeling-dataset


In [41]:
!ls /kaggle/input/the-bards-best-a-character-modeling-dataset

test.csv  train.csv  validation.csv


In [42]:
train_path = os.path.join(path,'train.csv') 
test_path = os.path.join(path,'test.csv')
val_path = os.path.join(path, 'validation.csv')
print(train_path)
print(test_path)
print(val_path)

/kaggle/input/the-bards-best-a-character-modeling-dataset/train.csv
/kaggle/input/the-bards-best-a-character-modeling-dataset/test.csv
/kaggle/input/the-bards-best-a-character-modeling-dataset/validation.csv


In [43]:
with open(train_path,'r', encoding='utf-8') as f:
    text = f.read()

In [44]:
print(f'Length of train samples in characters:, {len(text)}')

Length of train samples in characters:, 1003862


In [45]:
# Let's look at first 1000 characters. 
print(text[:1001])

text
"First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for reve

In [46]:
# Unique characters in the dataset
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
#print(chars)
print(vocab_size)


 !"$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
66


In [47]:
# Tokenization
char_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_char = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [char_to_int[c] for c in s] # encoder = take a string, output list of intergers
decode = lambda l: ''.join([int_to_char[i] for i in l]) # decoder: take a list of integers, output a string

# encoding
print(encode('hello world')) # chaning char into int
# decoding
print(decode(encode('hello world'))) # changing same int into char


[47, 44, 51, 51, 54, 1, 62, 54, 57, 51, 43]
hello world


In [48]:
print(f"Tokenization example: Characters to integers:\n {char_to_int}")

Tokenization example: Characters to integers:
 {'\n': 0, ' ': 1, '!': 2, '"': 3, '$': 4, '&': 5, "'": 6, ',': 7, '-': 8, '.': 9, '3': 10, ':': 11, ';': 12, '?': 13, 'A': 14, 'B': 15, 'C': 16, 'D': 17, 'E': 18, 'F': 19, 'G': 20, 'H': 21, 'I': 22, 'J': 23, 'K': 24, 'L': 25, 'M': 26, 'N': 27, 'O': 28, 'P': 29, 'Q': 30, 'R': 31, 'S': 32, 'T': 33, 'U': 34, 'V': 35, 'W': 36, 'X': 37, 'Y': 38, 'Z': 39, 'a': 40, 'b': 41, 'c': 42, 'd': 43, 'e': 44, 'f': 45, 'g': 46, 'h': 47, 'i': 48, 'j': 49, 'k': 50, 'l': 51, 'm': 52, 'n': 53, 'o': 54, 'p': 55, 'q': 56, 'r': 57, 's': 58, 't': 59, 'u': 60, 'v': 61, 'w': 62, 'x': 63, 'y': 64, 'z': 65}


In [49]:
with open(test_path,'r', encoding='utf-8') as f:
    test_text = f.read()

In [50]:
with open(val_path,'r', encoding='utf-8') as f:
    val_text = f.read()

In [51]:
print(len(val_text))
print(len(test_text))

55778
55778


In [52]:
import torch 
train_data = torch.tensor(encode(text),dtype=torch.long)
print(train_data.shape,train_data.dtype)
print(train_data[:100]) 

torch.Size([1003862]) torch.int64
tensor([59, 44, 63, 59,  0,  3, 19, 48, 57, 58, 59,  1, 16, 48, 59, 48, 65, 44,
        53, 11,  0, 15, 44, 45, 54, 57, 44,  1, 62, 44,  1, 55, 57, 54, 42, 44,
        44, 43,  1, 40, 53, 64,  1, 45, 60, 57, 59, 47, 44, 57,  7,  1, 47, 44,
        40, 57,  1, 52, 44,  1, 58, 55, 44, 40, 50,  9,  0,  0, 14, 51, 51, 11,
         0, 32, 55, 44, 40, 50,  7,  1, 58, 55, 44, 40, 50,  9,  0,  0, 19, 48,
        57, 58, 59,  1, 16, 48, 59, 48, 65, 44])


In [53]:
test_data = torch.tensor(encode(test_text),dtype=torch.long)
print(test_data.shape,test_data.dtype)
print(test_data[:100]) 

torch.Size([55778]) torch.int64
tensor([59, 44, 63, 59,  0,  3, 57, 40, 53, 42, 44,  1, 59, 40,  6, 44, 53,  0,
        14, 58,  1, 58, 47, 40, 51, 51,  1, 62, 48, 59, 47,  1, 44, 48, 59, 47,
        44, 57,  1, 55, 40, 57, 59,  6, 58,  1, 40, 46, 57, 44, 44, 52, 44, 53,
        59,  1, 58, 59, 40, 53, 43, 13,  0,  0, 15, 14, 29, 33, 22, 32, 33, 14,
        11,  0, 27, 54, 59,  1, 48, 53,  1, 52, 64,  1, 47, 54, 60, 58, 44,  7,
         1, 25, 60, 42, 44, 53, 59, 48, 54, 12])


In [54]:
val_data = torch.tensor(encode(val_text),dtype=torch.long)
print(val_data.shape,val_data.dtype)
print(val_data[:100]) 

torch.Size([55778]) torch.int64
tensor([59, 44, 63, 59,  0,  3, 13,  0,  0, 20, 31, 18, 26, 22, 28, 11,  0, 20,
        54, 54, 43,  1, 52, 54, 57, 57, 54, 62,  7,  1, 53, 44, 48, 46, 47, 41,
        54, 60, 57,  1, 15, 40, 55, 59, 48, 58, 59, 40,  9,  0,  0, 15, 14, 29,
        33, 22, 32, 33, 14, 11,  0, 20, 54, 54, 43,  1, 52, 54, 57, 57, 54, 62,
         7,  1, 53, 44, 48, 46, 47, 41, 54, 60, 57,  1, 20, 57, 44, 52, 48, 54,
         9,  0, 20, 54, 43,  1, 58, 40, 61, 44])


In [55]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([59]) the target: 44
when input is tensor([59, 44]) the target: 63
when input is tensor([59, 44, 63]) the target: 59
when input is tensor([59, 44, 63, 59]) the target: 0
when input is tensor([59, 44, 63, 59,  0]) the target: 3
when input is tensor([59, 44, 63, 59,  0,  3]) the target: 19
when input is tensor([59, 44, 63, 59,  0,  3, 19]) the target: 48
when input is tensor([59, 44, 63, 59,  0,  3, 19, 48]) the target: 57
when input is tensor([59, 44, 63, 59,  0,  3, 19, 48, 57]) the target: 58
when input is tensor([59, 44, 63, 59,  0,  3, 19, 48, 57, 58]) the target: 59
when input is tensor([59, 44, 63, 59,  0,  3, 19, 48, 57, 58, 59]) the target: 1
when input is tensor([59, 44, 63, 59,  0,  3, 19, 48, 57, 58, 59,  1]) the target: 16
when input is tensor([59, 44, 63, 59,  0,  3, 19, 48, 57, 58, 59,  1, 16]) the target: 48
when input is tensor([59, 44, 63, 59,  0,  3, 19, 48, 57, 58, 59,  1, 16, 48]) the target: 59
when input is tensor([59, 44, 63, 59,  0,  3, 19, 4

In [56]:
torch.manual_seed(1337) 
batch_size = 4 # how many independent sequences will we process in parallel 
block_size = 8 # what is the maximum context length for predicitons 

data = train_data 
ix = torch.randint(len(data) - block_size,(batch_size,))
print(ix)
print(len(data))
print(data)

tensor([ 67081, 215065, 929480, 553714])
1003862
tensor([59, 44, 63,  ..., 44,  3,  0])


In [57]:
a = [data[i:i+block_size] for i in ix] 
print(a)

[tensor([54, 57, 43,  1, 42, 54, 53, 58]), tensor([ 1, 46, 44, 53, 59, 51, 64,  1]), tensor([ 7,  1, 59, 54,  1, 43, 44, 51]), tensor([22,  1, 43, 48, 43,  1, 64, 44])]


In [58]:
x = torch.stack([data[i:i+block_size] for i in ix])
x

tensor([[54, 57, 43,  1, 42, 54, 53, 58],
        [ 1, 46, 44, 53, 59, 51, 64,  1],
        [ 7,  1, 59, 54,  1, 43, 44, 51],
        [22,  1, 43, 48, 43,  1, 64, 44]])

In [59]:
torch.manual_seed(1337) 
batch_size = 4 # how many independent sequences will we process in parallel 
block_size = 8 # what is the maximum context length for predicitons 

def get_batch(split):
    '''
    Generate and return a small batch of data of inputs x and target y
    '''
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size,(batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y
xb,yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('--------')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension 
        context = xb[b,:t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[54, 57, 43,  1, 42, 54, 53, 58],
        [ 1, 46, 44, 53, 59, 51, 64,  1],
        [ 7,  1, 59, 54,  1, 43, 44, 51],
        [22,  1, 43, 48, 43,  1, 64, 44]], device='cuda:0')
targets:
torch.Size([4, 8])
tensor([[57, 43,  1, 42, 54, 53, 58, 60],
        [46, 44, 53, 59, 51, 64,  1, 41],
        [ 1, 59, 54,  1, 43, 44, 51, 48],
        [ 1, 43, 48, 43,  1, 64, 44, 59]], device='cuda:0')
--------
when input is [54] the target: 57
when input is [54, 57] the target: 43
when input is [54, 57, 43] the target: 1
when input is [54, 57, 43, 1] the target: 42
when input is [54, 57, 43, 1, 42] the target: 54
when input is [54, 57, 43, 1, 42, 54] the target: 53
when input is [54, 57, 43, 1, 42, 54, 53] the target: 58
when input is [54, 57, 43, 1, 42, 54, 53, 58] the target: 60
when input is [1] the target: 46
when input is [1, 46] the target: 44
when input is [1, 46, 44] the target: 53
when input is [1, 46, 44, 53] the target: 59
when input is [1, 46, 44, 53, 

In [60]:
print(xb)

tensor([[54, 57, 43,  1, 42, 54, 53, 58],
        [ 1, 46, 44, 53, 59, 51, 64,  1],
        [ 7,  1, 59, 54,  1, 43, 44, 51],
        [22,  1, 43, 48, 43,  1, 64, 44]], device='cuda:0')


## Positional Encoding 
Here instead of learnable positional embedding, I have implemented fixed non-learnable positinal encoding for considering the order of sequence. 

In [61]:
class PositionalEncoding(nn.Module):
    def __init__(self,n_embd,max_len=block_size):
        super().__init__()
        pe = torch.zeros(max_len,n_embd)
        position = torch.arange(0,max_len,dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,n_embd,2).float()*(-math.log(10000.0)/ n_embd))
        pe[:,0::2] = torch.sin(position * div_term)
        pe[:,1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self,x):
        return x + self.pe[:,:x.size(1)]
        
        
    

## Self-Attention

In [62]:
class Head(nn.Module):
    def __init__(self,head_size):
        super().__init__()
        self.key = nn.Linear(n_embd,head_size,bias=False)
        self.query = nn.Linear(n_embd,head_size,bias=False)
        self.value = nn.Linear(n_embd,head_size,bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out


In [63]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [64]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [65]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [66]:
# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [67]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters()), 'parameters')

208322 parameters


In [68]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [69]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [70]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 0: train loss 4.3554, val loss 4.3602
step 100: train loss 3.0748, val loss 3.0827
step 200: train loss 2.7542, val loss 2.7328
step 300: train loss 2.6624, val loss 2.6379
step 400: train loss 2.5633, val loss 2.5724
step 500: train loss 2.4953, val loss 2.5282
step 600: train loss 2.4770, val loss 2.4759
step 700: train loss 2.4534, val loss 2.4460
step 800: train loss 2.4138, val loss 2.4225
step 900: train loss 2.4359, val loss 2.4531
step 1000: train loss 2.3750, val loss 2.4211
step 1100: train loss 2.4304, val loss 2.3877
step 1200: train loss 2.3767, val loss 2.3801
step 1300: train loss 2.3510, val loss 2.3625
step 1400: train loss 2.3631, val loss 2.3426
step 1500: train loss 2.3146, val loss 2.3229
step 1600: train loss 2.3107, val loss 2.3324
step 1700: train loss 2.3060, val loss 2.3096
step 1800: train loss 2.2664, val loss 2.2883
step 1900: train loss 2.2849, val loss 2.2759
step 2000: train loss 2.2722, val loss 2.3045
step 2100: train loss 2.2607, val loss 2.2954


In [71]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))



Me ware ant buve bown the as laresing tay the burfy,. is bake. thou yous feithpens unsare comspoled to be of to me money in o
Xnowrels envay pand livemch
Martar herd, am bow kis Glabcheurss of wase on whit duialiod? him how this to. be out in to ambry govexe Itullet my loods, fuchimaly kever lost fring.,
My pace of a.
Hand her, and Swoule, me a and.

MERSINUCHENNIONurster e thy: her Voleess in herent, onere Onour a Rillle,
I wad!

Indach, thour a this to to geht breople, man, twear, gem me unencem on my arlarth her bonge as ling of dightu, omeleats, what will. Con, waich whem prafaancance this eeroly rustsble tale
Ham not stab his itruty, Our hanleurse, thou.

KING ElELLo, as and
Whatt cutry hy dim songe.
Cost on frert, her and conge my.
Twasping bortalaids of to untly vil I way Courdon Ell that will my with your ricemy,
In what that siunm our you ploxled I whine what hem niners dittlly whonest no vosinge of stanther.D And kajly our I go
Entoenare.

ARCESBELLO:
Yoy freset:
Ind befort