In [None]:
# Resources : https://github.com/karpathy/nanoGPT/blob/master/model.py

In [1]:
!pip install torch==2.3.0 transformers==4.41.1 datasets==2.19.1 accelerate==0.30.1 triton==2.3.0

Collecting torch==2.3.0
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting transformers==4.41.1
  Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.19.1
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting accelerate==0.30.1
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting triton==2.3.0
  Downloading triton-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12=

In [None]:
import os
from accelerate.utils import write_basic_config

write_basic_config()  # Write a config file
os._exit(00)  # Restart the notebook

In [None]:
import os
os._exit(0)

In [3]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler
from transformers import AutoTokenizer
from datasets import load_dataset
from accelerate import notebook_launcher
from accelerate import Accelerator
import os

In [4]:
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
# dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
train = dataset['train']
valid = dataset['validation']
test = dataset['test']

In [5]:
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 512 # what is the maximum context length for predictions?
max_iters = 1000
eval_interval = 100
learning_rate = 5e-4
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'
eval_iters = 100
n_embed = 512
n_head = 8
n_layer = 12
dropout = 0

# batch_size = 16 # how many independent sequences will we process in parallel?
# block_size = 8 # what is the maximum context length for predictions?
# max_iters = 1000
# eval_interval = 100
# learning_rate = 3e-4
# # device = 'cuda' if torch.cuda.is_available() else 'cpu'
# # device = 'cpu'
# eval_iters = 100
# n_embed = 16
# n_head = 2
# n_layer = 2
# dropout = 0.1

In [6]:
# Tokenizer
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Add new token for padding
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

vocab_size = tokenizer.vocab_size + 1

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.c_attn = nn.Linear(n_embed, n_embed * 3, bias=False)     # Combine key, query and value into one matrix
        self.c_proj = nn.Linear(n_embed, n_embed, bias=False)         # Combine the outputs of the heads
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size))
            self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        q, k, v = self.c_attn(x).split(n_embed, dim=-1)                # (B,T,3C) -> 3*(B,T,C)

        # head_size = n_embed // num_heads and C = n_embed
        k = k.view(B, T, self.num_heads, self.head_size).transpose(1, 2)    # (B,nh,T,hs)
        q = q.view(B, T, self.num_heads, self.head_size).transpose(1, 2)    # (B,nh,T,hs)
        v = v.view(B, T, self.num_heads, self.head_size).transpose(1, 2)    # (B,nh,T,hs)

        if self.flash:
            out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=True)
        else:
            wei = q @ k.transpose(-2, -1) * 1.0 / math.sqrt(k.size(-1))         # (B,nh,T,hs) @ (B,nh,hs,T) -> (B,nh,T,T)
            wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))        # (B,nh,T,T) - lower triangular mask for causality
            wei = F.softmax(wei, dim=-1)                                        # (B,nh,T,T)
            wei = self.attn_dropout(wei)
            out = wei @ v                                                       # (B,nh,T,T) @ (B,nh,T,hs) -> (B,nh,T,hs)

        out = out.transpose(1, 2).contiguous().view(B, T, -1)               # (B,T,nh*hs) = (B,T,C)
        out = self.resid_dropout(self.c_proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embed, n_head):
        # n_embed: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class TransformerLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        # self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed) # final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)                                # (B,T,C)
        # pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)

        # Create sinousidal positional embeddings
        pos_emb = torch.zeros(T, n_embed, device=idx.device)
        for i in range(n_embed):
            if i % 2 == 0:
                pos_emb[:, i] = torch.sin(torch.arange(T, device=idx.device) / 10000**(2*i/n_embed))
            else:
                pos_emb[:, i] = torch.cos(torch.arange(T, device=idx.device) / 10000**((2*i-1)/n_embed))

        x = tok_emb + pos_emb                                                    # (B,T,C)
        x = self.blocks(x)                                                       # (B,T,C)
        x = self.ln_f(x)                                                         # (B,T,C)
        logits = self.lm_head(x)                                                 # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape

            logits = logits.reshape(B*T, C)
            targets = targets.reshape(B*T)
            loss = F.cross_entropy(logits, targets, ignore_index=tokenizer.encode('[PAD]')[0])

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [8]:
model = TransformerLanguageModel()
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')
# model = torch.compile(model)

89.319506 M parameters


In [9]:
!ls

model_4_layer.safetensors


In [13]:
def training_loop(model, learning_rate, load=False):
    accelerator = Accelerator(mixed_precision='fp16')
    device = accelerator.device
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    save_directory = "/kaggle/working/"
    
    # Create a dataloader
    train_loader = DataLoader(train, batch_size=batch_size, sampler=RandomSampler(train), num_workers=4)
    valid_loader = DataLoader(valid, batch_size=batch_size, sampler=RandomSampler(valid), num_workers=4)
    
    model, optimizer, train_loader, valid_loader = accelerator.prepare(
        model, optimizer, train_loader, valid_loader
    )
    
    if load:
        # Load model weights
        unwrapped_model = accelerator.unwrap_model(model)
        path_to_checkpoint = os.path.join(save_directory, "model1.safetensors")
        unwrapped_model.load_state_dict(torch.load(path_to_checkpoint))
        
    # Compile model after loading state_dict
#     model = torch.compile(model)
    
    for i, batch in enumerate(train_loader):
        if i >= max_iters:
            break

        # Tokenize the batch
        tokens = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=block_size+1, return_tensors='pt')
        # Move tokens to device
        tokens = {k: v.to(device) for k, v in tokens.items()}
        # Get input and target tokens
        idx = tokens['input_ids'][:, :-1]
        targets = tokens['input_ids'][:, 1:]

        # Forward pass
        logits, loss = model(idx, targets)
        # Zero out the gradients
        optimizer.zero_grad()
        # Backward pass
        accelerator.backward(loss)
        # Update weights
        optimizer.step()

        # Evaluate model every eval_interval iterations
        if i % eval_interval == 0:
            val_losses = []
            train_losses = []

            # Set model to evaluation mode
            model.eval()

            # Validation loop
            for j, val_batch in enumerate(valid_loader):
                if j >= eval_iters:
                    break

                val_tokens = tokenizer(val_batch['text'], padding="max_length", truncation=True, max_length=block_size+1, return_tensors='pt')
                val_tokens = {k: v.to(device) for k, v in val_tokens.items()}
                idx = val_tokens['input_ids'][:, :-1]
                targets = val_tokens['input_ids'][:, 1:]

                with torch.no_grad():
                    val_logits, val_loss = model(idx, targets)
                val_losses.append(val_loss.item())

            # Training loss on a few batches to monitor overfitting
            for j, train_batch in enumerate(train_loader):
                if j >= eval_iters:
                    break

                train_tokens = tokenizer(train_batch['text'], padding="max_length", truncation=True, max_length=block_size+1, return_tensors='pt')
                train_tokens = {k: v.to(device) for k, v in train_tokens.items()}
                idx = train_tokens['input_ids'][:, :-1]
                targets = train_tokens['input_ids'][:, 1:]

                with torch.no_grad():
                    train_logits, train_loss = model(idx, targets)
                train_losses.append(train_loss.item())

            # Set model back to training mode
            model.train()

            # Calculate the average losses
            avg_val_loss = np.mean(val_losses)
            avg_train_loss = np.mean(train_losses)

            accelerator.print(f"step {i}: train loss {avg_train_loss:.4f}, val loss {avg_val_loss:.4f}")
    
        # Save model per each 1000 steps
        if i % 1000 == 0 and i != 0:
            accelerator.wait_for_everyone()
            state = accelerator.get_state_dict(model)
            accelerator.save(state, os.path.join(save_directory, "model1.safetensors"))
            accelerator.print(f"Model saved successfully in {os.path.join(save_directory, 'model1.safetensors')}")

In [26]:
max_iters = 2100
eval_interval = 200
eval_iters = 100
learning_rate = 8e-5

In [31]:
notebook_launcher(training_loop, args=(model, learning_rate, True), num_processes=2)

Launching training on 2 GPUs.
step 0: train loss 4.4426, val loss 4.4155
step 200: train loss 4.4306, val loss 4.4950
step 400: train loss 4.5011, val loss 4.4507
step 600: train loss 4.4482, val loss 4.3845
step 800: train loss 4.4759, val loss 4.4944
step 1000: train loss 4.4207, val loss 4.4359
Model saved successfully in /kaggle/working/model1.safetensors


KeyboardInterrupt: 

In [33]:
!ls

model1.safetensors  model_12_layer_back1.safetensors  model_4_layer.safetensors


In [32]:
!cp model1.safetensors model_12_layer_back1.safetensors

In [15]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [17]:
model_save_name = 'model2.safetensors'
path = f"/kaggle/working/{model_save_name}"
torch.save(model.state_dict(), path)

In [66]:
def predict_random_sample(model, num_samples=1):
    accelerator = Accelerator(mixed_precision='fp16')
    device = accelerator.device
    
    # Prepare the model
    model = model.to(device)
    
    # Prepare the data loader
    data_loader = DataLoader(test, batch_size=num_samples, sampler=RandomSampler(dataset), num_workers=4)
    valid_loader = DataLoader(valid, batch_size=num_samples, sampler=RandomSampler(dataset), num_workers=4)
    model, data_loader = accelerator.prepare(model, data_loader)
    
    # Load model weights
    save_directory = "/kaggle/working"
    unwrapped_model = accelerator.unwrap_model(model)
    path_to_checkpoint = os.path.join(save_directory, "model1.safetensors")
    unwrapped_model.load_state_dict(torch.load(path_to_checkpoint))
    
    model.eval()
    
    val_losses = []
    test_losses = []

    # Set model to evaluation mode
    model.eval()

    # Validation loop
    for j, val_batch in enumerate(valid_loader):
        if j >= eval_iters:
            break

        val_tokens = tokenizer(val_batch['text'], padding="max_length", truncation=True, max_length=block_size+1, return_tensors='pt')
        val_tokens = {k: v.to(device) for k, v in val_tokens.items()}
        idx = val_tokens['input_ids'][:, :-1]
        targets = val_tokens['input_ids'][:, 1:]

        with torch.no_grad():
            val_logits, val_loss = model(idx, targets)
        val_losses.append(val_loss.item())

    # validation loss on a few batches to monitor overfitting
    for j, valid_batch in enumerate(data_loader):
        if j >= eval_iters:
            break

        valid_tokens = tokenizer(valid_batch['text'], padding="max_length", truncation=True, max_length=block_size+1, return_tensors='pt')
        valid_tokens = {k: v.to(device) for k, v in valid_tokens.items()}
        idx = valid_tokens['input_ids'][:, :-1]
        targets = valid_tokens['input_ids'][:, 1:]

        with torch.no_grad():
            valid_logits, valid_loss = model(idx, targets)
        test_losses.append(valid_loss.item())
        
    avg_val_loss = np.mean(val_losses)
    avg_test_loss = np.mean(test_losses)
        
    accelerator.print(f"val loss {avg_val_loss:.4f}, test loss {avg_test_loss:.4f}")
    
    # Take a random sample
    sample_batch = next(iter(data_loader))
    
    # Tokenize the sample
    tokens = tokenizer(sample_batch['text'], padding="max_length", truncation=True, max_length=block_size+1, return_tensors='pt')
    tokens = {k: v.to(device) for k, v in tokens.items()}
    input_ids = tokens['input_ids'][:, :-1]

    # Perform the prediction
    with torch.no_grad():
        logits, _ = model(input_ids)
    
    predictions = torch.argmax(logits, dim=-1)

    # Decode the predictions
    for idx, pred in zip(input_ids, predictions):
        input_text = tokenizer.decode(idx, skip_special_tokens=True)
        predicted_text = tokenizer.decode(pred, skip_special_tokens=True)
        print(f"Input: {input_text}")
        print()
        print()
        print(f"Prediction: {predicted_text}")


In [17]:
notebook_launcher(predict_random_sample, args=(model, 1), num_processes=2)

NameError: name 'predict_random_sample' is not defined

In [34]:
device = torch.device('cuda:0')
model = model.to(device)

# Load model2.safetensors
model.load_state_dict(torch.load('/kaggle/working/model_12_layer_back1.safetensors'))

<All keys matched successfully>

In [38]:
# Get a random batch of test data
batch = test.shuffle().select(range(1))
# tokenize the batch
tokens = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=block_size+1, return_tensors='pt')
# move the batch to the device
tokens = {k: v.to(device) for k, v in tokens.items()}
# get the input tokens
idx = tokens['input_ids']

# Generate text
idx_gen = model.generate(idx, 100)
# Decode the generated tokens
decoded = tokenizer.decode(idx_gen[0])
# Print context
# print(tokenizer.decode(idx[0]))


# Replace [PAD] in decoded to ''
decoded = decoded.replace('[PAD]', '')
# Print generated text
print(decoded)
# print(tokenizer.decode(targets[0]))

 The air attack on 25 September was the last by Vichy forces on Gibraltar. 
MS Youngblood “ IMF has disappeared gre� pencil lighting to its climax ” Williamson history. 
 310 of 30 % try went to season finale. 
 highlightedgy clothingous weapon skies on sale, waterAA and penetrating ensure the view of gum blueberries, Italy and the Payton chairmanship, which inflicted jerseys, was described with Birch was a separate : 
, with the rest 25 mill trauma forario as well as injured soldiers. 
 the original survival Miner holding, has


In [41]:
# Context given for autocompletion
print(tokenizer.decode(idx[0]))

 The air attack on 25 September was the last by Vichy forces on Gibraltar. 
[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD

False

In [37]:
# Checking model state (Whether trained or not)

model.eval()
train_loader = DataLoader(train, batch_size=batch_size, sampler=RandomSampler(train), num_workers=4)
for j, train_batch in enumerate(train_loader):
    if j >= 1:
        break

    train_tokens = tokenizer(train_batch['text'], padding="max_length", truncation=True, max_length=block_size+1, return_tensors='pt')
    train_tokens = {k: v.to(device) for k, v in train_tokens.items()}
    idx = train_tokens['input_ids'][:, :-1]
    targets = train_tokens['input_ids'][:, 1:]

    with torch.no_grad():
        train_logits, train_loss = model(idx, targets)
    print(train_loss.item())
                
# model.train()

4.358434200286865


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
model_save_name = 'transformer_wikitext103.pt'
path = f"/content/gdrive/My Drive/{model_save_name}"
torch.save(model.state_dict(), path)