In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import pickle
device = 'cuda' if torch.cuda.is_available() else 'cpu'
block_size = 64
batch_size = 128
max_iters = 3000
learning_rate = 3e-4
eval_iters = 100
n_embd = 384
n_head = 8
n_layer = 8
dropout = 0.2


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\Jose\anaconda3\Lib\runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Jose\anaconda3\Lib\runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "C:\Users\Jose\PycharmProjects\fcc-gpt-course\cuda\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\Jose\PycharmProjects\fcc-gpt-course\cuda\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start

In [2]:
with open('wizard_of_oz.txt', 'r', encoding ='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
vocab_size = len(chars)

In [3]:
string_to_int = {ch:i for i, ch in enumerate(chars)}
int_to_string = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: "".join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype = torch.long)

In [4]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i+1: i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)

    return x, y

In [5]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [6]:
class Head (nn.Module):
    
    def __init__(self, head_size):
        super().__init__( )
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) 
        q = self.query(x) 
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) 
        wei = F.softmax(wei, dim=-1) 
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v 
        return out

In [7]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size*num_heads, n_embd)
        self.dropout = nn.Dropout(p=0.2)
    
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.dropout(self.proj(out))
        return out

In [8]:
class FeedForward(nn.Module):
    
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd, n_embd),
            nn.Dropout(dropout),
        )
    
    def forward(self, x):
        return self.net(x)

In [9]:
class Block(nn.Module):
    
    def __init__(self, n_embd, n_head):
        super().__init__()
        
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ffwd(x)
        x = self.ln2(x + y)
        
        return x

In [10]:
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head = n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)

    def forward(self, index, targets = None):
        
        B, T = index.shape
        
        tok_emb = self.token_embedding_table(index)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        
        logits = self.lm_head(x)
        
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            
            targets = targets.view(B*T)
            
            loss = F.cross_entropy(logits, targets)
            
            return logits, loss
        else: 
            loss = None
            return logits, loss
    
    def generate(self, index, max_new_tokens):
        
        for _ in range(max_new_tokens):
            index_cond = index[:, -block_size:]
            logits, loss = self.forward(index_cond)
            logits = logits[:,-1, :]
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat([index, index_next], dim = 1)
        
        return index

In [11]:
model = GPTLanguageModel(vocab_size)
m = model.to(device)


In [18]:
with open('model_01.pkl', "rb") as f:
    model = pickle.load(f)
m = model.to(device)

In [19]:

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for i in range(max_iters):
    if i % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {i}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")
    
    xb, yb = get_batch('train')
    
    logits, loss = model.forward(xb, yb)
    
    optimizer.zero_grad(set_to_none= True)
    loss.backward()
    optimizer.step()
print(loss.item())


step: 0, train loss: 0.347, val loss: 2.062
step: 100, train loss: 0.325, val loss: 2.124
step: 200, train loss: 0.309, val loss: 2.160
step: 300, train loss: 0.297, val loss: 2.207
step: 400, train loss: 0.289, val loss: 2.228
step: 500, train loss: 0.279, val loss: 2.239
step: 600, train loss: 0.270, val loss: 2.328
step: 700, train loss: 0.262, val loss: 2.311
step: 800, train loss: 0.258, val loss: 2.370
step: 900, train loss: 0.251, val loss: 2.379
step: 1000, train loss: 0.246, val loss: 2.442
step: 1100, train loss: 0.240, val loss: 2.464
step: 1200, train loss: 0.237, val loss: 2.502
step: 1300, train loss: 0.232, val loss: 2.501
step: 1400, train loss: 0.231, val loss: 2.507
step: 1500, train loss: 0.226, val loss: 2.564
step: 1600, train loss: 0.223, val loss: 2.594
step: 1700, train loss: 0.222, val loss: 2.572
step: 1800, train loss: 0.220, val loss: 2.631
step: 1900, train loss: 0.216, val loss: 2.698
step: 2000, train loss: 0.214, val loss: 2.727
step: 2100, train loss: 0

In [20]:
with open('model_01.pkl', "wb") as f:
    pickle.dump(model, f)

In [21]:
with open('model_01.pkl', "rb") as f:
    model = pickle.load(f)

In [22]:
m = model.to(device)

In [26]:
while True:
    prompt = input("Prompt:\n")
    context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
    generated_chars = decode(m.generate(context.unsqueeze(0), max_new_tokens=500)[0].tolist())
    print(f'Completion:\n{generated_chars}')

IndexError: index -1 is out of bounds for dimension 1 with size 0