### Imports

In [1]:
from PyPDF2 import PdfReader
import os
import glob
import torch
import matplotlib.pyplot as plt

### Hyper-Parameters

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
max_iters = 5000
eval_interval = 500
eval_iters = 200
lr = 3e-4
n_embbed = 384
block_s = 128
batch_s = 64
n_heads = 6
n_layer = 6
dropout_r = 0.2

### Retrieve dataset chunks and aggregate

In [3]:
directory_path = 'datasets\sam_harris_podcast_transcripts'
pdf_files = glob.glob(os.path.join(directory_path, '*.pdf')) # get all dataset chunks

In [4]:
#get the total size of dataset
text = ''
for pdf_path in pdf_files:
    reader = PdfReader(pdf_path)
    pages = reader.pages

    # extracting text from page
    for page in pages:
        text += page.extract_text()

print('Chars: ', len(text))

Chars:  1596906


### Vocab building

In [5]:
#make vocab
vocab = sorted(list(set(text)))
vocab_s = len(vocab)

print('Vocab: ', vocab)
print('Vocab size: ', vocab_s)

Vocab:  ['\n', ' ', '"', '&', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', 'é', '–']
Vocab size:  82


### Encoder/ Decoder

In [6]:
#make encoder/ decoder
#   make stoi, itos dicts to hold translations

itos = {i : s for i, s in zip(range(vocab_s), vocab)}
stoi = {s : i for i, s in itos.items()}

def encode(in_str):
    return [stoi[c] for c in in_str]

def decode(in_int_list):
    return [itos[x] for x in in_int_list]


### Train / Val Split

In [7]:
train_split_n = 0.8 # 80% of the dataset used in training.
val_split_n = 0.1 # 10% of the dataset used in validation.
test_split_n = 0.1 # 10% of the dataset used in testing.

# encode text (data set) -> data
data = encode(text)

train_split = data[: int(train_split_n * len(data))]
val_split = data[int(train_split_n * len(data)) : int(-val_split_n * len(data))]
test_split = data[: int(test_split_n * len(data))]

len(train_split), len(val_split), len(test_split)


(1277524, 159692, 159690)

### Single Batch contruction

In [8]:
# make a func to take in data in splits and return Xs and Ys
def build_batch(split):
    #determine split type and use relevant set
    match split:
        case 'train':
            dta = train_split
        case 'val':
            dta = val_split
        case 'test':
            dta = test_split
    # sample block size examples at random from set
    ix = torch.randint(0, len(dta) - block_s, (batch_s, ))
    xs = torch.stack([torch.tensor(dta[i : i + block_s]) for i in ix])
    ys = torch.stack([torch.tensor(dta[i + 1 : i + block_s + 1]) for i in ix])
    xs, ys = xs.to(device), ys.to(device)
    return xs, ys

xs, ys = build_batch('train')

print(xs.shape)
print(xs)
print(ys.shape)
print(ys)


torch.Size([64, 128])
tensor([[ 1, 54, 57,  ..., 57, 65, 61],
        [71,  1, 67,  ...,  1, 64, 67],
        [53, 55, 57,  ..., 55, 71,  1],
        ...,
        [ 1, 61, 71,  ..., 72, 60, 57],
        [65, 68,  1,  ...,  7,  1, 75],
        [ 1, 58, 61,  ..., 57, 55, 67]], device='cuda:0')
torch.Size([64, 128])
tensor([[54, 57,  1,  ..., 65, 61, 71],
        [ 1, 67, 73,  ..., 64, 67, 72],
        [55, 57,  1,  ..., 71,  1,  1],
        ...,
        [61, 71, 71,  ..., 60, 57,  1],
        [68,  1,  1,  ...,  1, 75, 60],
        [58, 61, 66,  ..., 55, 67, 66]], device='cuda:0')


# Self Attention

## Attention Head

In [9]:
class AttentionHead(torch.nn.Module):
    def __init__(self, head_s):
        super().__init__()
        self.head_s = head_s

        #initialize K, Q, V, tril
        self.key = torch.nn.Linear(n_embbed, head_s, bias=False)
        self.query = torch.nn.Linear(n_embbed, head_s, bias=False)
        self.value = torch.nn.Linear(n_embbed, head_s, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_s, block_s)))

        self.dropout = torch.nn.Dropout(dropout_r)

    def forward(self, x):
        B, T, E = x.shape
        k = self.key(x) # (B, T, E) @ (E, H) -> (B, T, H)
        q = self.query(x) # (B, T, H)

        #get attention scores ('affinities')
        wei = q @ k.transpose(-2, -1) * (self.head_s**-0.5) # (B, T, T) # /sqrt(head_S) to normalize the initalizations of heads
        wei = torch.masked_fill(wei, self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)# section the matrix such that nodes can not see their children 
        wei = torch.softmax(wei, dim=1)# (B, T, T)
        wei = self.dropout(wei)

        #weighed aggrigation of the values
        v = self.value(x) # ( B, T, E ) @ ( E, H ) -> (B, T, H)
        out = wei @ v # ( B, T, T ) @ ( B, T, H ) -> ( B, T, H )
        return out

## Multihead Attention

In [10]:
class MultiheadAttention(torch.nn.Module):
    def __init__(self, n_heads, size):
        super().__init__()

        #initalize each head as a module list
        self.heads = torch.nn.ModuleList(AttentionHead(size) for _ in range(n_heads))
        self.proj = torch.nn.Linear(n_embbed, n_embbed) # allows for the output to fork back into residual pathway
        self.dropout = torch.nn.Dropout(dropout_r)

    def forward(self, x):
        #call each head in list sequentially and concat the outputs
        x = torch.cat([h(x) for h in self.heads], dim=-1) # h(B, T, H)
        x = self.proj(x)
        out = self.dropout(x)
        return out

### Feed Forward

In [11]:
class FeedForward(torch.nn.Module):
    def __init__(self, size):
        super().__init__()
        
        #init a sequential linear-> non-linear MLP
        hidden_s = size * 4 # increases compute dimension of ffwd
        self.net = torch.nn.Sequential(
            torch.nn.Linear(size, hidden_s),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_s, size),# Projection: allows for the output to fork back into residual pathway
            torch.nn.Dropout(dropout_r)
        )

    def forward(self, x):
        out = self.net(x)
        return out

### Block Container for MultiHead and FFwd

In [12]:
class Block(torch.nn.Module):
    def __init__(self, n_embbed, n_heads):
        super().__init__()
        
        #initalize both multihead and ffwds
        head_s = n_embbed // n_heads
        self.mh = MultiheadAttention(n_heads, size=head_s)
        self.ffwd = FeedForward(n_embbed)
        self.attention_layer_norm = torch.nn.LayerNorm(n_embbed)
        self.ffwd_layer_norm = torch.nn.LayerNorm(n_embbed)

    def forward(self, x):
        x = x + self.mh(self.attention_layer_norm(x))
        x = x + self.ffwd(self.ffwd_layer_norm(x))
        return x

## Bigram Language Model

In [13]:
class BigramLanguageModel(torch.nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        
        #initialize emb tables
        self.token_embedding_table = torch.nn.Embedding(vocab_s, n_embbed) # (vocab_s, embedding_dim_s)
        self.pos_embedding_table = torch.nn.Embedding(block_s, n_embbed)

        #layers
        self.blocks = torch.nn.Sequential(*[Block(n_embbed, n_heads=n_heads) for _ in range(n_layer)])
        self.ln_f = torch.nn.LayerNorm(n_embbed) # final layer norm
        self.lm_head = torch.nn.Linear(n_embbed, vocab_s)
    
    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_embbeds = self.token_embedding_table(idx) # (B, T, E) { E = embedding_dim_s}
        pos_embbeds = self.pos_embedding_table(torch.arange(0, T, device=device)) # (T, E)
        embbeds = token_embbeds + pos_embbeds # (B, T, E)
        x = self.blocks(embbeds) # (B, T, E)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, C) 

        B, T, C = logits.shape
        
        # during generation targets, loss are not req
        if targets is None:
            loss = None
        else:
            logits = logits.view(B * T, C) # (BT, C)
            targets = targets.view(B * T)
            loss = torch.nn.functional.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_gens):
        for _ in range(max_gens):
            #get logits from forward pass use the last character
            #pass context_length(idx) i.e len <= block_size { position embedding in fwd only defined upto block_s}
            context = idx[:, -block_s: ]
            logits, _ = self(context) # (B, T, C) (b, t) (b, t, C)
            logits = logits[:, -1, :] # (b, C)
            #softmax logits to get probabilities 
            probs = torch.nn.functional.softmax(logits, dim=1) # (b, C)
            #use probs to sample from multinomial and append to the idx
            x = torch.multinomial(probs, num_samples=1, replacement=True) # (b, 1)
            idx = torch.cat((idx, x), dim=1)  # # (b, C + 1)
        return idx

In [14]:
model = BigramLanguageModel()
model.to(device)
logits, loss = model(xs, ys)
print(logits.shape)
print(loss)

# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

torch.Size([8192, 82])
tensor(4.6375, device='cuda:0', grad_fn=<NllLossBackward0>)
10.75285 M parameters


### Training

In [15]:
#setup optimizer
optim = torch.optim.AdamW(model.parameters(), lr=lr)

In [16]:
#eval loss function
@torch.no_grad()
def eval_loss():
    model.eval()
    out = {}
    splits = ['train', 'val']
    for split in splits:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, y = build_batch(split)
            _ , loss = model(x, y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [17]:
for iter in range(max_iters):

    #display loss
    if iter % eval_interval == 0:
        losses = eval_loss()
        print(f'{iter} Train Loss: {losses["train"]}, Validation Loss: {losses["val"]}')

    #get batch from batch sampler
    x, y = build_batch('train')
    
    #foward pass
    logits, loss = model(x, y)
    
    #backward
    optim.zero_grad(set_to_none=True)
    loss.backward()
    
    #optimize
    optim.step()
    
print(loss)

0 Train Loss: 4.652235984802246, Validation Loss: 4.653196334838867
500 Train Loss: 0.09002933651208878, Validation Loss: 0.08335676789283752
1000 Train Loss: 0.025867408141493797, Validation Loss: 0.02376963384449482
1500 Train Loss: 0.01910800114274025, Validation Loss: 0.018208393827080727


KeyboardInterrupt: 

In [21]:
#sample generator
context = torch.zeros((1, 1), dtype=torch.int32, device=device)
idx = model.generate(context, 200).squeeze(dim=0).tolist()
out = ''.join(decode(idx))
print(out)


Do
[0:79:ffffffffffffffffffffffffffra/fffffffmm-Mmm   i boi i uaazPAA3GPazzzz'1iiiicizzzv   Duccui70000-LAAd8H66uly e lthe  intesens  in  culls there  cenvectoerGetone ste  dre . Tcke  scth-  Oia  yop


In [None]:
text
encode('Sam Harris')

[42, 53, 65, 1, 31, 53, 70, 70, 61, 71]