In [21]:
import random
import os
import pickle
import time
import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

In [22]:
# ---- Model architecture ----
# Maximum context length
block_size = 256
# Embedding dimension
n_embd = 372
# Number of attention heads
n_head = 6
# Number of transformer blocks
n_layer = 6
# Dropout rate
dropout = 0
# Vocabulary size
vocab_size = 53

# ---- Training / evaluation schedule ----
# Batch size for training
batch_size = 64
# Maximum number of iterations
max_iters = 20000
# Initial learning rate value
learning_rate = 1e-3
# Milestones for learning rate decay, expressed as fractions of max_iters
miles = [int(max_iters * fraction) for fraction in (0.7, 0.8, 0.9)]
# Evaluation interval
eval_interval = 29_000
# Number of iterations for evaluation
eval_iters = 60000

# ---- Checkpoint ----
# Model to be fine-tuned (keep it empty for training from scratch)
model_name = '1M_2024-08-21_23-47'
model_path = "1M_2024-08-21_23-47.pth"

# ---- LoRA parameters ----
# Fine-tune only the attention heads (True) or all linear layers (False)
heads_only = False
# LoRA rank - set it to 0 if you want to train from scratch or perform full fine-tuning
lora_r = 20
# LoRA alpha
lora_alpha = 8
lora_dropout = 0

# ---- Runtime switches ----
compile = False
# Enable quantization for the fine-tuning (8-bit quantization)
quantize = False


In [23]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a switchable bias term.

    Args:
        ndim: size of the normalised (last) dimension.
        bias: when truthy a learnable zero-initialised bias is created,
            otherwise ``self.bias`` is ``None`` and no shift is applied.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        """Normalise ``input`` with the learnable scale (and bias, if any), eps=1e-5."""
        normalized_shape = self.weight.shape
        return F.layer_norm(input, normalized_shape, self.weight, self.bias, 1e-5)

class Head(nn.Module):
    """One head of causal self-attention.

    Projects the input to key/query/value vectors of size ``head_size`` and
    applies causal scaled dot-product attention. The fused PyTorch kernel is
    used when available (``self.flash``); otherwise an equivalent manual
    implementation is used.

    Args:
        head_size: output dimension of the key/query/value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # True when this PyTorch build ships the fused attention kernel.
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        # Only used by the manual fallback path (the fused kernel takes dropout_p).
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the attention output for ``x`` of shape (B, T, n_embd) as (B, T, head_size)."""
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        if self.flash:
            # Fused kernel; attention dropout is only applied in training mode.
            return F.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=dropout if self.training else 0, is_causal=True
            )

        # Manual fallback: softmax(q k^T / sqrt(head_size)) v with a lower-triangular mask.
        T = x.size(-2)
        scores = (q @ k.transpose(-2, -1)) * (k.size(-1) ** -0.5)  # (B, T, T)
        causal_mask = torch.ones(T, T, dtype=torch.bool, device=x.device).tril()
        scores = scores.masked_fill(~causal_mask, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))
        return weights @ v
    

class MultiHeadAttention(nn.Module):
    """Runs several attention heads side by side and mixes their outputs.

    Args:
        num_heads: number of ``Head`` modules to create.
        head_size: output size of each individual head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, project it, then apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)
    
class FeedForward(nn.Module):
    """Position-wise MLP: bias-free expand (x4), GELU, bias-free contract, dropout.

    Args:
        n_embd: input and output feature size.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim, bias=False),
            nn.GELU(),
            nn.Linear(hidden_dim, n_embd, bias=False),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``; the last dimension is preserved."""
        out = self.net(x)
        return out

class LinearLoRA(nn.Module):
    """Wraps a linear layer and adds a trainable low-rank update to its output.

    The wrapped layer's weight is frozen. The update is ``x @ lora_a @ lora_b``
    with ``lora_a`` of shape (in_features, rank) and ``lora_b`` of shape
    (rank, out_features). ``lora_b`` starts at zero, so a freshly wrapped layer
    produces exactly the output of the original layer.

    Args:
        original_layer: the layer to wrap; must expose ``in_features``,
            ``out_features`` and ``weight``.
        rank: inner dimension of the low-rank factors.
    """

    def __init__(self, original_layer, rank=8):
        super().__init__()
        self.original_layer = original_layer
        # Only the weight is frozen here; a bias (if any) is left untouched.
        original_layer.weight.requires_grad_(False)
        self.rank = rank

        in_dim = original_layer.in_features
        out_dim = original_layer.out_features
        self.lora_a = nn.Parameter(torch.randn((in_dim, rank)))
        self.lora_b = nn.Parameter(torch.randn((rank, out_dim)))

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the factors: Kaiming-uniform for ``lora_a``, zeros for ``lora_b``."""
        nn.init.zeros_(self.lora_b)
        nn.init.kaiming_uniform_(self.lora_a, a=np.sqrt(5))

    def forward(self, x):
        """Return the wrapped layer's output plus the low-rank correction."""
        base_output = self.original_layer(x)
        low_rank_update = (x @ self.lora_a) @ self.lora_b
        return base_output + low_rank_update
    
class Block(nn.Module):
    """Transformer block: communication followed by feedforward."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd, bias=False)
        self.ln2 = nn.LayerNorm(n_embd, bias=False)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPT(nn.Module):
    """GPT language model."""

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd, bias=False) 
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # Token and position embeddings
        tok_emb = self.token_embedding_table(idx) # (B, T, n_embd)
        pos_emb = self.position_embedding_table(torch.arange(T, device='cpu')) # (T, n_embd)
        x = tok_emb + pos_emb # (B, T, n_embd)
        x = self.blocks(x) # (B, T, n_embd)
        x = self.ln_f(x) # (B, T, n_embd)
        logits = self.lm_head(x) # (B, T, vocab_size)

        # Compute loss if targets are provided
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        """Generate new tokens given an initial context `idx`."""
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:] # Crop to the last block_size tokens
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] # Focus on the last time step
            probs = F.softmax(logits, dim=-1) # Convert to probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # Sample from the distribution
            idx = torch.cat((idx, idx_next), dim=1) # Append sampled index to the sequence
        return idx
    
    def activate_lora(self, r=8, heads_only=False, freeze_others=True):
        self.lora_rank = r
        self.replace_multihead_attention_recursion(heads_only)
        if freeze_others:
            self.freeze_parameters_except_lora_and_bias()
    
    def replace_multihead_attention_recursion(self, heads_only=False, model=None):
        children = self.named_children() if model is None else model.named_children()
        for name, module in children:
            if heads_only and name in {"query", "key", "value"}:
                # Replace with Lora SelfAttention
                new_layer = LinearLoRA(module, rank=self.lora_rank)

                if model == None:
                    self.__setattr__(name, new_layer)
                else:
                    setattr(model, name, new_layer)
            
            elif isinstance(module, nn.Linear) and not heads_only:
                new_layer = LinearLoRA(module, rank=self.lora_rank)
                
                if model == None:
                    self.__setattr__(name, new_layer)
                else:
                    setattr(model, name, new_layer)
            
            else:
                # Recursive call for child modules
                self.replace_multihead_attention_recursion(heads_only, model=module)
                
                
    def freeze_parameters_except_lora_and_bias(self):
        for name, param in self.named_parameters():
            is_trainable = (
                "lora_" in name
                #(self.train_layer_norms and "LayerNorm" in name)
            )

            param.requires_grad = is_trainable

In [24]:
# Try to read the vocabulary metadata (and with it vocab_size) from meta.pkl.
# NOTE(review): when the file exists this overrides the vocab_size assigned in
# the configuration cell; the value has to agree with the checkpoint that is
# loaded later, otherwise load_state_dict reports a size mismatch.

meta_path = os.path.join('./', 'meta.pkl')

if not os.path.exists(meta_path):
    print("Meta file not found. Please ensure the meta.pkl file is present in the data directory.")
else:
    # NOTE: pickle.load can execute arbitrary code - only use a trusted meta.pkl.
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    vocab_size = meta['vocab_size']
    print(f"found vocab_size = {vocab_size} (inside {meta_path})")

# Encode and decode functions for character-level tokenization
def encode(s):
    """Map each character of ``s`` to its id via ``meta['stoi']``.

    Raises ``KeyError`` for a character that is not in the vocabulary.
    """
    token_ids = []
    for ch in s:
        token_ids.append(meta['stoi'][ch])
    return token_ids

def decode(l):
    """Turn an iterable of token ids back into text via ``meta['itos']``.

    Ids that are not present in the table are rendered as ``'$'`` instead of
    raising, so a model that emits an out-of-vocabulary id still yields a
    readable string.

    Args:
        l: iterable of token ids.

    Returns:
        The decoded string (empty for an empty input).
    """
    itos = meta['itos']
    pieces = []
    for i in l:
        try:
            pieces.append(itos[i])
        except (KeyError, IndexError):
            # Unknown id: KeyError for a dict table, IndexError for a list table.
            pieces.append('$')
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(pieces)

found vocab_size = 48 (inside ./meta.pkl)


In [25]:
def load_model():
    """Load the pre-trained model named by the module-level ``model_name``.

    The checkpoint ``./<model_name>.pth`` may be either a plain state dict or a
    dict with the keys ``'lora_rank'`` and ``'state_dict'``. Any ``_orig_mod.``
    prefix in its keys (left by ``torch.compile``) is stripped before loading.

    LoRA handling:
      * If the checkpoint already holds LoRA weights (keys containing
        ``"lora_"``), the adapters are created once, before loading, with the
        rank recorded in the checkpoint or, failing that, read from a
        ``lora_a`` tensor.
      * Otherwise the base weights are loaded first and adapters are added
        afterwards only when ``lora_r > 0``, honouring ``heads_only``.
        Wrapping before loading would rename the linear weights and, with
        ``strict=False``, silently leave them at their random initialisation.

    Keys that could not be matched are reported instead of being ignored
    silently. The model is compiled last, and only when ``compile`` is truthy.

    Returns:
        The model, moved to the CPU.

    Raises:
        FileNotFoundError: if the checkpoint file does not exist.
    """
    model_path = os.path.join('./', f"{model_name}.pth")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file '{model_path}' not found.")

    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    checkpoint = torch.load(model_path, map_location='cpu')
    if 'lora_rank' in checkpoint:
        checkpoint_rank = checkpoint['lora_rank']
        state_dict = checkpoint['state_dict']
    else:
        checkpoint_rank = -1
        state_dict = checkpoint
    state_dict = {key.replace("_orig_mod.", ""): value for key, value in state_dict.items()}

    model = GPT()
    lora_keys = [key for key in state_dict if "lora_" in key]

    if lora_keys:
        # The checkpoint was saved from a LoRA-wrapped model: rebuild the same
        # structure (exactly once) so that its keys line up.
        if checkpoint_rank > 0:
            rank = checkpoint_rank
        else:
            # lora_a has shape (in_features, rank).
            rank = next(
                (state_dict[key].shape[1] for key in lora_keys if key.endswith("lora_a")),
                lora_r,
            )
        model.activate_lora(rank)
        print('lora activated')

    load_result = model.load_state_dict(state_dict, strict=False)
    for label, keys in (("missing", load_result.missing_keys),
                        ("unexpected", load_result.unexpected_keys)):
        if keys:
            print(f"Warning: {len(keys)} {label} key(s) when loading '{model_path}', e.g. {keys[:3]}")

    if not lora_keys and lora_r > 0:
        # Base checkpoint: add fresh adapters on top of the loaded weights.
        model.activate_lora(lora_r, heads_only=heads_only)
        print('lora activated')

    if compile:
        print("Compiling the model...\n")
        try:
            model = torch.compile(model)  # requires PyTorch 2.0
        except Exception as e:
            print(f"torch.compile failed, continuing with the uncompiled model: {e}")

    m = model.to('cpu')
    return m

In [26]:
# Build the model on the CPU and report how many parameters are trainable.
model = load_model().to('cpu')
num_parameters = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)

print(f'The model has {num_parameters} trainable parameters')

Compiling the model...

lora activated


  checkpoint = torch.load(model_path, map_location='cpu')


RuntimeError: Error(s) in loading state_dict for GPT:
	size mismatch for token_embedding_table.weight: copying a param with shape torch.Size([53, 372]) from checkpoint, the shape in current model is torch.Size([48, 372]).
	size mismatch for lm_head.lora_b: copying a param with shape torch.Size([20, 53]) from checkpoint, the shape in current model is torch.Size([20, 48]).

In [None]:
# Memory-map test.bin read-only as a flat array of uint16 values. The array is
# later handed to decode(), so the values are presumably token ids - confirm
# against the script that produced the file.
test_data = np.memmap('test.bin', dtype=np.uint16, mode='r')

In [None]:


def evaluate_example(example, model, max_new_tokens=2):
    """Generate a completion for one example and return it next to the reference.

    The example is split on the marker ``"# clone"``: everything before the
    first marker (plus the marker itself) is the prompt, everything after the
    last marker is the reference result. The model's decoded output is split
    the same way and the text after its last marker is the generated result.

    Args:
        example: decoded text of a single test example.
        model: object exposing ``generate(idx, max_new_tokens=...)``.
        max_new_tokens: number of tokens to sample after the prompt.

    Returns:
        ``(prompt_text, result_example, result_response)``.
    """
    marker = "# clone"
    segments = example.split(marker)
    prompt_text = segments[0] + marker
    result_example = segments[-1]

    # Put the prompt on the model's device; fall back to the CPU for objects
    # that do not expose parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cpu')
    encoded_example = torch.tensor(encode(prompt_text), dtype=torch.long).unsqueeze(0).to(device)

    # Inference only: avoid building autograd graphs for every sampled token.
    with torch.no_grad():
        generated = model.generate(encoded_example, max_new_tokens=max_new_tokens)

    response = decode(generated[0].tolist())
    result_response = response.split(marker)[-1]

    return prompt_text, result_example, result_response



# Write results to file
def write_results_to_file(output_file, prompt, real_results, generated_results):
    """Save prompts, reference results and generated results as a CSV file.

    The file gets the columns ``Prompt``, ``Real_Results`` and
    ``Generated_Results`` (in that order) and no index column.
    """
    columns = {}
    columns['Prompt'] = prompt
    columns['Real_Results'] = real_results
    columns['Generated_Results'] = generated_results
    pd.DataFrame(columns).to_csv(output_file, index=False)



def evaluate_pair(real, generated_result):
    """Return the fraction of positions at which the two strings agree.

    Characters are compared index by index over the shorter string and the
    number of matches is divided by the length of the longer string, so any
    length difference counts as mismatches.

    Args:
        real: the reference string.
        generated_result: the string produced by the model.

    Returns:
        A float in [0, 1]. Two empty strings are identical and score 1.0.
    """
    longest = max(len(real), len(generated_result))
    if longest == 0:
        # Both strings are empty: identical, and avoids dividing by zero.
        return 1.0

    # zip stops at the shorter string, i.e. it covers exactly the shared prefix length.
    match_count = sum(1 for expected, actual in zip(real, generated_result) if expected == actual)
    return match_count / longest

# Evaluation Loop

# Split the decoded test set into examples, dropping empty fragments, and
# report how many examples will actually be evaluated.
examples = [example for example in decode(test_data).split("\n\n\n\n") if example]
print(len(examples))

# Start evaluation process
prompt = []
real_results = []
generated_results = []

# Iterate through examples and evaluate the model on each one
for example in tqdm(examples):
    prompt_text, real_result, result = evaluate_example(example, model)
    prompt.append(prompt_text)
    real_results.append(real_result)
    generated_results.append(result)

# Calculate and print accuracy (mean per-example character match ratio)
score = 0
for real, generated in zip(real_results, generated_results):
    score += evaluate_pair(real, generated)
# Guard against an empty test set instead of dividing by zero.
accuracy = score / len(generated_results) if generated_results else 0.0
print(f"Accuracy: {accuracy * 100:.2f}%")

# Store accuracy in a file
with open("accuracy.txt", 'w') as f:
    f.write(f"Accuracy: {accuracy * 100:.2f}%\n")

# Store predictions in a CSV file
write_results_to_file("predictions.csv", prompt, real_results, generated_results)

1000


100%|██████████| 1000/1000 [01:11<00:00, 14.00it/s]

Accuracy: 1.90%



