In [36]:
import random
import os
import pickle
import time
import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
import numpy as np
import pandas as pd
from tqdm import tqdm



In [37]:
import torch
print(torch.__version__)
print(torch.version.cuda)

print(torch.randn(1).cuda())

2.4.0+cu121
12.1
tensor([1.2567], device='cuda:0')


In [38]:
# Set the random seed for reproducibility
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Set the device to GPU if available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device set to {device}.")

Device set to cuda.


# Data Preparation

In [39]:
# Helper functions to load and save data
def save_data(data, file_path):
    with open(file_path, 'w') as f:
        f.write(data)

def load_data(file_path):
    with open(file_path, 'r') as f:
        return f.read()

In [40]:
LEVEL=1.1

In [41]:
# Directory where the data is stored
DATA_DIR = f'./../data/clones_level{LEVEL}.txt/'

In [42]:
# Attempt to derive vocab_size from the dataset

meta_path = os.path.join(DATA_DIR, 'meta.pkl')
#vocab_size = None

if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    vocab_size = meta['vocab_size']
    print(f"found vocab_size = {vocab_size} (inside {meta_path})")
else:
    print("Meta file not found. Please ensure the meta.pkl file is present in the data directory.")

# Encode and decode functions for character-level Tokenzation 
def encode(s):
    return [meta['stoi'][c] for c in s]

def decode(l):
    result=''
    for i in l:
        try:
            result+=meta['itos'][i]
        except:
            result+='$'

                       
    return result
        
    return ''.join([meta['itos'][i] for i in l])

found vocab_size = 35 (inside ./../data/clones_level1.1.txt/meta.pkl)


In [43]:

train_data = np.memmap(os.path.join(DATA_DIR, 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join(DATA_DIR, 'val.bin'), dtype=np.uint16, mode='r')

# Model

In [44]:
# Hyperparameters for the GPT model
block_size = 256  # Maximum context length
n_embd = 372      # Embedding dimension
n_head = 6        # Number of attention heads
n_layer = 6       # Number of transformer blocks
dropout = 0       # Dropout rate
batch_size = 64   # Batch size for training
max_iters = 100  # Maximum number of iterations
learning_rate = 1e-3 # Initial Learning rate value
miles = [int(max_iters * m) for m in [0.7, 0.8, 0.9]]  # Milestones for learning rate decay as fractions of max_iters
eval_interval = 29_000 # Evaluation interval
eval_iters = 60000 # Number of iterations for evaluation
vocab_size = 53 # Vocabulary size

# Model to be fine-tuned (Keep it empty for training from scratch)
model_name = '10M_2024-07-21_08-16'
model_path = "./../models/10M_2024-07-21_08-16.pth"

#Lora params
heads_only=False # fine tune only on attention head or all linear layers
# LoRA Rank - Set it to 0 if you want to train from scratch or perform full fine-tuning
lora_r = 10
# Lora Alpha
lora_alpha=8
lora_dropout=0

compile = False
# enable quntization for the fine tuning (8 bit quantization)
quantize=False


In [45]:
class LayerNorm(nn.Module):
    """ LayerNorm with an optional bias. PyTorch's LayerNorm doesn't support simply bias=False """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class Head(nn.Module):
    """One head of self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        v = self.value(x) # (B, T, head_size)

        # Apply scaled dot-product attention
        out = torch.nn.functional.scaled_dot_product_attention(
            q, k, v, attn_mask=None, dropout_p=dropout if self.training else 0, is_causal=True
        )
        
        return out
    

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # Concatenate the outputs from each head
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
    
class FeedForward(nn.Module):
    """A simple linear layer followed by a non-linearity."""

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd, bias=False),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd, bias=False),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class LinearLoRA(nn.Module):
    def __init__(self, original_layer, rank=8):
        super().__init__()
        self.original_layer = original_layer
        self.original_layer.weight.requires_grad = False
        self.rank = rank
        
        self.lora_a = nn.Parameter(torch.randn((original_layer.in_features, rank)))
        self.lora_b = nn.Parameter(torch.randn((rank, original_layer.out_features)))
        
        self.reset_parameters()
        
    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.lora_a, a=np.sqrt(5))
        nn.init.zeros_(self.lora_b)
        
    def forward(self, x):
        lora_output = x @ self.lora_a @ self.lora_b
        return self.original_layer(x) + lora_output
    
class Block(nn.Module):
    """Transformer block: communication followed by feedforward."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd, bias=False)
        self.ln2 = nn.LayerNorm(n_embd, bias=False)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPT(nn.Module):
    """GPT language model."""

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd, bias=False) 
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # Token and position embeddings
        tok_emb = self.token_embedding_table(idx) # (B, T, n_embd)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, n_embd)
        x = tok_emb + pos_emb # (B, T, n_embd)
        x = self.blocks(x) # (B, T, n_embd)
        x = self.ln_f(x) # (B, T, n_embd)
        logits = self.lm_head(x) # (B, T, vocab_size)

        # Compute loss if targets are provided
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        """Generate new tokens given an initial context `idx`."""
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:] # Crop to the last block_size tokens
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] # Focus on the last time step
            probs = F.softmax(logits, dim=-1) # Convert to probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # Sample from the distribution
            idx = torch.cat((idx, idx_next), dim=1) # Append sampled index to the sequence
        return idx
    
    def activate_lora(self, r=8, heads_only=False, freeze_others=True):
        self.lora_rank = r
        self.replace_multihead_attention_recursion(heads_only)
        if freeze_others:
            self.freeze_parameters_except_lora_and_bias()
    
    def replace_multihead_attention_recursion(self, heads_only=False, model=None):
        children = self.named_children() if model is None else model.named_children()
        for name, module in children:
            if heads_only and name in {"query", "key", "value"}:
                # Replace with Lora SelfAttention
                new_layer = LinearLoRA(module, rank=self.lora_rank)

                if model == None:
                    self.__setattr__(name, new_layer)
                else:
                    setattr(model, name, new_layer)
            
            elif isinstance(module, nn.Linear) and not heads_only:
                new_layer = LinearLoRA(module, rank=self.lora_rank)
                
                if model == None:
                    self.__setattr__(name, new_layer)
                else:
                    setattr(model, name, new_layer)
            
            else:
                # Recursive call for child modules
                self.replace_multihead_attention_recursion(heads_only, model=module)
                
    def get_all_linear_layer_names(self,layers=[],model=None):
       
        children = self.named_children() if model is None else model.named_children()
        for name, module in children:

            if isinstance(module, nn.Linear) :
                layers.append(name)
            else:
                self.get_all_linear_layer_names(layers, model=module)
                
        return layers
    
    
    def get_all_linear_param_names(self):
        linear_param_names = []

        # Iterate over all named parameters in the model
        for name, param in self.named_parameters():
            # Get the module name by splitting at the first period
            module_name = name.split('.')[0]

            # Check if the module is of type nn.Linear
            module = dict(self.named_modules())[module_name]
            if isinstance(module, nn.Linear):
                linear_param_names.append(name)

        return linear_param_names

            
    def freeze_parameters_except_lora_and_bias(self):
        for name, param in self.named_parameters():
            is_trainable = (
                "lora_" in name
                #(self.train_layer_norms and "LayerNorm" in name)
            )

            param.requires_grad = is_trainable

    def unfreeze_all_parameters(self):
        for param in self.parameters():

            param.requires_grad = True


In [46]:
# Get random batch of data
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# Estimate loss on train and val splits
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters) 
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


# Helper function to make large numbers of parameters human-readable
def human_readable(num):
    magnitude = 0
    while abs(num) >= 1000:
        magnitude += 1
        num /= 1000.0
    return '%.0f%s' % (num, ['', 'K', 'M', 'G', 'T', 'P'][magnitude])

In [47]:
from accelerate.utils import BnbQuantizationConfig,load_and_quantize_model
def load_model( ):

    
    model = GPT()  # Replace with your model initialization
    print("Loading the model...\n")
    if quantize:
        print("Quantizing the model")
        quantization_config= BnbQuantizationConfig(
            load_in_8bit=True,  # Load model with 8-bit precision 
            bnb_4bit_compute_dtype=torch.float16,  # Use bfloat16 for computations to balance precision and efficiency.
            bnb_4bit_use_double_quant=True,  # Apply double quantization to minimize quantization errors.
            bnb_4bit_quant_type="nf4"  # Use "Normal Float 4" (nf4) quantization for better numerical stability.
        )
        model = load_and_quantize_model(model,
                                                  weights_location=model_path,
                                                  bnb_quantization_config=quantization_config,
                                                  device_map = device)
    
    else:
        state_dict=torch.load(model_path, map_location=device)
        state_dict_keys = map(lambda x: x.replace("_orig_mod.", ""), state_dict.keys())
        state_dict = dict(zip(state_dict_keys, state_dict.values()))
        model.load_state_dict(state_dict)
        print("loading the model stat")
    if lora_r>0:
        """  
        if heads_only:
            print("Applying Lora on heads only ...")
            target_modules=['query', 'key', 'value']
        else:
            print("Applying Lora on all linear layers")
            target_modules=['query', 'key', 'value','proj']
        
        peft_config = peft.LoraConfig(
                r=lora_r, # rank dimension of the LoRA injected matrices
                lora_alpha=lora_alpha, 
                target_modules=target_modules, # the modules to add lora params to
                lora_dropout=lora_dropout, # dropout probability for layers
                bias="lora_only", # none, all, or lora_only
            )
        """
        model.activate_lora(lora_r)
       
        print("lora activation")
        
    model = model.to(device)

    
    # Apply PEFT if a configuration is provided
    """
    if peft_config:
        model =  peft.get_peft_model(model, peft_config)
        print("Applied PEFT (e.g., LoRA) to the model.")
    if not peft_config and not quantize:
        print("Loading the model with no lora or quantization")
        checkpoint = torch.load(model_path, map_location=device)
        model.load_state_dict(checkpoint,strict=False)
    """
        

        
    if compile:
        model= torch.compile(model) 
        model= model.to(device)
    return model


In [48]:
model=load_model()
model=model.to(device)
num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
num_parameters_hr = human_readable(num_parameters)
print(f'The model has {num_parameters_hr} trainable parameters')


Loading the model...

loading the model stat
lora activation
The model has 741K trainable parameters


  state_dict=torch.load(model_path, map_location=device)


# Training

In [49]:
# Initialize optimizer
if not quantize:
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
else:
    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=learning_rate)


# Initialize learning rate scheduler
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=miles, gamma=0.1)

In [50]:
# Get current date and hour to get track of experiments
now = datetime.datetime.now()
date_hour = now.strftime("%Y-%m-%d_%H-%M")

# Train
# Start training timer
start_time = time.time()

# Training loop
for iter in range(max_iters):

    # evaluate the model on the train and val splits and log the losses
    #if iter % eval_interval == 0:
    #    losses = estimate_loss()
    #    print(f'iter {iter:5d} | train loss {losses["train"]:.4f} | val loss {losses["val"]:.4f}')
        
    # train the model for one iteration
    xb, yb = get_batch('train')

    # forward pass
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    #loss.requires_grad = True
    loss.backward()
    optimizer.step()

    # Step the scheduler
    scheduler.step()
    print(iter)

# End training timer
end_time = time.time()
print(f'Training time: {(end_time - start_time) / 60}  min')

# crucial in case u want to load the model again
model.unfreeze_all_parameters()

# Save the trained model
model_path = f"./../models/level{LEVEL}/{num_parameters_hr}_{date_hour}.pth"
checkpoint = {
    'lora_rank': model.lora_rank if(hasattr(model, "lora_rank")) else -1,
    'state_dict': model.state_dict()
}

torch.save(checkpoint, model_path)
print(f"Model saved to {model_path}\n")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
Training time: 0.5772793134053548  min
Model saved to ./../models/level1.1/741K_2024-08-27_13-45.pth



# Evaluation

In [51]:
test_data = np.memmap(os.path.join(DATA_DIR, 'test.bin'), dtype=np.uint16, mode='r')

In [54]:



def evaluate_example(example, model, max_new_tokens=2):

    # Split example and determine maximum new tokens allowed
    splited_example = example.split("# clone")
    # Encode prompt and prepare for evaluation
    encoded_example = torch.tensor(encode(splited_example[0] + "# clone"), dtype=torch.long).unsqueeze(0).to(device)
    prompt_text = splited_example[0] + "# clone"

    result_example = splited_example[-1]


    response = decode(model.generate(encoded_example, max_new_tokens=max_new_tokens)[0].tolist())
    splited_response = response.split("# clone")
    result_response = splited_response[-1]
    return prompt_text, result_example, result_response



# Write results to file
def write_results_to_file(output_file, prompt, real_results, generated_results):
    df = pd.DataFrame({
        'Prompt': prompt,
        'Real_Results': real_results,
        'Generated_Results': generated_results
    })
    df.to_csv(output_file, index=False)




# Evaluation Loop

# Split examples and initialize lists for results
examples = decode(test_data).split("\n\n\n\n")
print(len(examples))

examples = [example for example in examples if example]

# Start evaluation process
prompt = []
real_results = []
generated_results = []

# Iterate through examples and evaluate the model on each one
for example in tqdm(examples):
    
        prompt_text, real_result, result = evaluate_example(example, model)
        prompt.append(prompt_text)
        real_results.append(real_result)
        generated_results.append(result)
    
       
        
    

# Calculate and print accuracy
score=0

for real,generated in zip(real_results, generated_results):
    print(len(real))
    print(len(generated))
    if real==generated:
        score+=1
accuracy = score / len(generated_results)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Store accuracy in a file
with open("accuracy.txt", 'w') as f:
    f.write(f"Accuracy: {accuracy * 100:.2f}%\n")

# Store predictions in a CSV file
    write_results_to_file("predictions.csv", prompt, real_results, generated_results)

1000


100%|██████████| 1000/1000 [00:34<00:00, 28.64it/s]

2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2



