In [1]:
%pwd

'd:\\software_3\\Generative_models\\Text_models\\chat_gpt2\\SFT_LoRA_QLoRA_RLHF'

In [2]:
import os

os.chdir("../")
%pwd

'd:\\software_3\\Generative_models\\Text_models\\chat_gpt2'

# LORA Supervised Finetunning

This notebook implements the LORA SFT from this paper [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) 

In [13]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from gpt import GPTModel, MultiHeadAttention, FeedForward, LayerNorm
from model_args import BASE_CONFIG
from utils.load_and_save_models import load_model, save_model
from utils.train import finetune_model, dataloaders
from utils.download_dataset import download_and_load_dataset, partition_data
from utils.generate import generate, generate_and_print_text
from utils.token_converter import get_tokenizer, text_to_token_ids, token_ids_to_text

## Creating the LORA adapter layers

In [4]:

class LoraLinear(nn.Module):

    """
        custom LoRA implementation for linear layers

    """
    def __init__(self, linear_layer, r=8, lora_alpha=16, lora_dropout=0.1):
        super().__init__()
        self.linear_layer = linear_layer
        self.r = r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout

        # Getting the original linear layer dimensions
        self.in_features = linear_layer.in_features
        self.out_features = linear_layer.out_features

        # creating the LoRA adapters
        self.lora_A = nn.Linear(self.in_features, self.r, bias=False)
        self.lora_B = nn.Linear(self.r, self.out_features, bias=False)

        # Initialize LoRA weights
        nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B.weight)  

        self.scaling = self.lora_alpha / self.r

    def forward(self, x):
        # original forward pass
        original_output = self.linear_layer(x)  

        #LoRA forward pass
        lora_output = self.lora_B(self.lora_A(F.dropout(x, self.lora_dropout, self.training)))

        # combine both the outputs
        return original_output + self.scaling * lora_output

        

        


In [5]:
class LoRAMultiHeadAttention(MultiHeadAttention):
    """
    MultiHeadAttention with LoRA applied to query, key, and value projections
    """
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False, r=8, lora_alpha=16, lora_dropout=0.1):
        super().__init__(d_in, d_out, context_length, dropout, num_heads, qkv_bias)

        # Apply LoRA to query, key and value projections
        self.W_query = LoraLinear(self.W_query, r, lora_alpha, lora_dropout)
        self.W_key = LoraLinear(self.W_key, r, lora_alpha, lora_dropout)
        self.W_value = LoraLinear(self.W_value, r, lora_alpha, lora_dropout)

In [6]:
class LoRATransformerBlock(nn.Module):
    """
    Transformer block with LoRA applied to the attention layers
    """
    def __init__(self, cfg, r=8, lora_alpha=16, lora_dropout=0.1):
        super().__init__()
        self.att = LoRAMultiHeadAttention(
            d_in = cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout
        )

        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x



In [7]:
class LoRAGPTModel(GPTModel):
    """
    GPT model with LoRA applied to attention layers
    """
    def __init__(self, cfg, r=8, lora_alpha=16, lora_dropout=0.1):
        super().__init__(cfg)

        # Replacing the transformer blocks with the Lora Blocks
        self.trf_blocks = nn.Sequential(
            *[LoRATransformerBlock(cfg, r, lora_alpha, lora_dropout)
             for _ in range(cfg["n_layers"])]
        )


    def get_trainable_parameters(self):
        """ Get only the LoRA parameters to train"""
        trainable_params = []
        for name, param in self.named_parameters():
            if "lora_A" in name or "lora_B" in name:
                trainable_params.append(param)
        return trainable_params

    def print_trainable_parameters(self):

        trainable_params = self.get_trainable_parameters()
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params_count = sum(p.numel() for p in trainable_params)

        print(f"Total parameters: {total_params:,}")
        print(f"Trainable parameters: {trainable_params_count:,}")
        print(f"Percentage of trainable parameters: {100 * trainable_params_count / total_params:.2f}%")


In [8]:
def create_lora_model(base_model_path, r=8, lora_alpha=16, lora_dropout=0.1):
    """
    creating the LoRA enabled gpt model from pretrained model
    """

    # creating the lora model
    lora_model = LoRAGPTModel(BASE_CONFIG, r, lora_alpha, lora_dropout)

    print(f"Loading model from the  :{ base_model_path}")
    state_dict = torch.load(base_model_path, map_location="cpu")

    # Load the weights that dont have LoRA (embedding, norm, output layers)
    model_dict = lora_model.state_dict()
    for name, param in state_dict.items():
        if name in model_dict and "lora" not in name:
            model_dict[name] = param

    lora_model.load_state_dict(model_dict)
    print("model weights are loaded")

    # freezing the base model parameters
    for name, param in lora_model.named_parameters():
        if "lora" not in name:
            param.requires_grad = False

    return lora_model


## Load the LORA model

In [None]:
pretrained_model_path = "GPT2-355M-pretrained.pth"
model = create_lora_model(pretrained_model_path)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

Loading model from the  :D:\software_3\Generative_models\Text_models\chat_gpt2\gpt_models\Foundational_model.pth
model weights are loaded


LoRAGPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): LoRATransformerBlock(
      (att): LoRAMultiHeadAttention(
        (W_query): LoraLinear(
          (linear_layer): Linear(in_features=768, out_features=768, bias=True)
          (lora_A): Linear(in_features=768, out_features=8, bias=False)
          (lora_B): Linear(in_features=8, out_features=768, bias=False)
        )
        (W_key): LoraLinear(
          (linear_layer): Linear(in_features=768, out_features=768, bias=True)
          (lora_A): Linear(in_features=768, out_features=8, bias=False)
          (lora_B): Linear(in_features=8, out_features=768, bias=False)
        )
        (W_value): LoraLinear(
          (linear_layer): Linear(in_features=768, out_features=768, bias=True)
          (lora_A): Linear(in_features=768, out_features=8, bias=False)
          (lora_B): Linear(in_features=8, out_features=768, bias=Fals

In [10]:
try:
    batch_size, seq_len = 2, 10
    dummy_input = torch.randint(0, BASE_CONFIG["vocab_size"], (batch_size, seq_len)).to(device)
        
    with torch.no_grad():
        output = model(dummy_input)
        
    print(f"Forward pass successful! Output shape: {output.shape}")
        
except Exception as e:
    print(e)

Forward pass successful! Output shape: torch.Size([2, 10, 50257])


In [11]:
data = download_and_load_dataset("instruction-data.json")
tokenizer = get_tokenizer()
device = "cuda" if torch.cuda.is_available() else "cpu"
train_data, test_data, val_data = partition_data(data)
train_dataloader, test_dataloader, val_dataloader = dataloaders(train_data, val_data, test_data, tokenizer=tokenizer,batch_size=16)
optimizer = torch.optim.AdamW(model.get_trainable_parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 30


Training data length: 935
Test data length: 110
Validation data length: 55


In [14]:
lora_sft_model = finetune_model(
                 model=model,
                 train_loader=train_dataloader,
                 val_loader=val_dataloader,
                 optimizer=optimizer,
                 device=device,
                 num_epochs=num_epochs,
                 tokenizer=tokenizer,
                 val_data=val_data,
                 eval_freqs=5,
                 eval_iter=5,
                 )

Epoch 1: 100%|██████████| 59/59 [00:43<00:00,  1.35it/s, train_loss=2.703, val_loss=2.732]



[Epoch 1] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'                                    The first of a- The first of a- The first step


Epoch 2: 100%|██████████| 59/59 [00:44<00:00,  1.33it/s, train_loss=2.539, val_loss=2.568]



[Epoch 2] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'                                         The first step. The first step. 


Epoch 3: 100%|██████████| 59/59 [00:45<00:00,  1.29it/s, train_loss=2.423, val_loss=2.460]



[Epoch 3] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'      ### Response:  ### Response:       The first step by the following the following the following the word 'The first step by the following the following the following the following the following the word '


Epoch 4: 100%|██████████| 59/59 [00:43<00:00,  1.35it/s, train_loss=2.321, val_loss=2.371]



[Epoch 4] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'     ### Response: ### Response: ### Response: The first step by the following the following the following the following the following the following the following the following the following the following the following the following the following the following the following


Epoch 5: 100%|██████████| 59/59 [00:40<00:00,  1.44it/s, train_loss=2.244, val_loss=2.301]



[Epoch 5] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'    ### Response: ### Response: The first step by the following the following the following the following the following the following the following the following the following the following the following the following the following the following the following the following the following the


Epoch 6: 100%|██████████| 59/59 [00:44<00:00,  1.34it/s, train_loss=2.282, val_loss=2.332]



[Epoch 6] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'  ### Response: ### Response: The first step. The first step. The first step. The first step. The first step. The first step. The first step. The first. The first


Epoch 7: 100%|██████████| 59/59 [00:44<00:00,  1.31it/s, train_loss=2.197, val_loss=2.252]



[Epoch 7] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'  ### Response: ### Response: The first step. The first step. The first step. The first step. The first. The first. The first. The first. The first. The


Epoch 8: 100%|██████████| 59/59 [00:45<00:00,  1.28it/s, train_loss=2.131, val_loss=2.192]



[Epoch 8] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'    ### Response: The first step. The first step. The first. The first. The first. The first. The first. The first. The first. The first. The


Epoch 9: 100%|██████████| 59/59 [00:44<00:00,  1.33it/s, train_loss=2.078, val_loss=2.147]



[Epoch 9] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'    ### Response: The following the following the following the following the following the following the following the following the following the following the following the following the following the following the following the following the following the following the following the following the following the


Epoch 10: 100%|██████████| 59/59 [00:41<00:00,  1.42it/s, train_loss=2.040, val_loss=2.112]



[Epoch 10] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'    ### Response:  The following the following the following the following the following the following the following the following the following the following the following the following the following the world is 'The city.         


Epoch 11: 100%|██████████| 59/59 [00:44<00:00,  1.34it/s, train_loss=2.000, val_loss=2.079]



[Epoch 11] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response:      The following the city of the world is 'the same as a.                     The city of the


Epoch 12: 100%|██████████| 59/59 [00:44<00:00,  1.32it/s, train_loss=1.964, val_loss=2.046]



[Epoch 12] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'    ### Response:     The following the city of the city.                 The city of the city of the city of the world is the world


Epoch 13: 100%|██████████| 59/59 [00:45<00:00,  1.28it/s, train_loss=1.926, val_loss=2.018]



[Epoch 13] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response:     ### Response: The city of the city of the city of the world is 'I'mo.            The city of the world is '


Epoch 14: 100%|██████████| 59/59 [00:45<00:00,  1.31it/s, train_loss=1.893, val_loss=1.988]



[Epoch 14] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'    ### Response:   ### Response: The book is 'I'mo' is 'I'mo' is 'I'mo' is 'I'mo' is 'I'mo' is 'I'mo


Epoch 15: 100%|██████████| 59/59 [00:41<00:00,  1.43it/s, train_loss=1.855, val_loss=1.959]



[Epoch 15] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response:  The following the game.  ### Response: The book is 'I am I am I am I am I am I am I amelvese.   ### Response: The following the


Epoch 16: 100%|██████████| 59/59 [00:45<00:00,  1.31it/s, train_loss=1.818, val_loss=1.926]



[Epoch 16] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The book is 'I amorous' is 'I amorous' is 'I'mo' is 'I'mo' is 'I'mo' is 'I'mo' is 'I'mo'


Epoch 17: 100%|██████████| 59/59 [00:44<00:00,  1.32it/s, train_loss=1.787, val_loss=1.898]



[Epoch 17] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The book is 'I amorous' is 'I amigo'.   ### Response: The book is 'I'mo' is 'I'mo' is 'I'mo' is 'I


Epoch 18: 100%|██████████| 59/59 [00:45<00:00,  1.30it/s, train_loss=1.759, val_loss=1.871]



[Epoch 18] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The book is 'I amo. ### Response: The book is 'I amo. The book is 'I'mo' is 'I'mo' is 'I'mo' is '


Epoch 19: 100%|██████████| 59/59 [00:44<00:00,  1.33it/s, train_loss=1.730, val_loss=1.848]



[Epoch 19] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The book is 'I amo. ### Response: The book is 'I amo. The book is 'I amigo'. The book is 'I amigo'. The book is '


Epoch 20: 100%|██████████| 59/59 [00:41<00:00,  1.44it/s, train_loss=1.705, val_loss=1.829]



[Epoch 20] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city. The city. The city. The book is 'I amo. ### Response: The book is 'I amo. The book is 'I amo. The


Epoch 21: 100%|██████████| 59/59 [00:43<00:00,  1.35it/s, train_loss=1.679, val_loss=1.811]



[Epoch 21] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city. The city. The city. ### Response: The city. The city. The city. The city. The city. The city. The city. 


Epoch 22: 100%|██████████| 59/59 [00:45<00:00,  1.31it/s, train_loss=1.653, val_loss=1.794]



[Epoch 22] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city. The city. The city. The book is 'I amo. ### Response: The city. The city. The city. The city. The city.


Epoch 23: 100%|██████████| 59/59 [00:45<00:00,  1.29it/s, train_loss=1.637, val_loss=1.782]



[Epoch 23] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city. The city. The city. The book is 'I amo. ### Response: The book is 'I amo. The book is 'I amo. The


Epoch 24: 100%|██████████| 59/59 [00:44<00:00,  1.33it/s, train_loss=1.609, val_loss=1.757]



[Epoch 24] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city. The city. The city. The book is 'I amo. The city. The book. The book. The book. The book. The book,


Epoch 25: 100%|██████████| 59/59 [00:41<00:00,  1.43it/s, train_loss=1.592, val_loss=1.747]



[Epoch 25] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city. The city. The city. The book is 'I amo. The book. The book. The book. The book. The book. The book.


Epoch 26: 100%|██████████| 59/59 [00:43<00:00,  1.35it/s, train_loss=1.569, val_loss=1.731]



[Epoch 26] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city. The city. The book is 'I amo. The book is 'I amo' is a lotus, but I amo' is a lotus, but I amo


Epoch 27: 100%|██████████| 59/59 [00:48<00:00,  1.21it/s, train_loss=1.550, val_loss=1.718]



[Epoch 27] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city. The city. The book is 'I amo. The Great Britain. The book. The book. The book. The book. The book. The book


Epoch 28: 100%|██████████| 59/59 [00:45<00:00,  1.29it/s, train_loss=1.534, val_loss=1.704]



[Epoch 28] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city of the city. The city. The book is a great. The Great Britain. The Great Britain. The book. The Great Britain. The book. The book.


Epoch 29: 100%|██████████| 59/59 [00:44<00:00,  1.32it/s, train_loss=1.518, val_loss=1.699]



[Epoch 29] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city. The city. The city. The book is a great. The Great Britain. The Great Britain. The Great Britain. The Great Britain. The Great Britain. 


Epoch 30: 100%|██████████| 59/59 [00:41<00:00,  1.43it/s, train_loss=1.531, val_loss=1.718]



[Epoch 30] Sample Generation:
Below is an instruction that describes a task. Write a response that appropriately complates the request.  ## Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'   ### Response: The city. The city. The Great Britain. The book is a man. The Great Britain. The Great Britain. The Great Britain. The Great Britain. The Great Britain.
Model training has been completed.
training completed in 22.96 minutes.


In [None]:
save_model(lora_sft_model, "instruct-LORA-GPT2-355M.pth", optimizer=optimizer)

Pretrained model has been saved successfully at d:\software_3\Generative_models\Text_models\chat_gpt2\gpt_models\instruction_finetunned_LoRA_model.pth


In [21]:
text = "Convert 45 kilometers to meters."

encoded_text =  text_to_token_ids(text, tokenizer).to(device)

idx = encoded_text
token_ids = generate(
        model=lora_sft_model,
        idx=encoded_text,
        max_new_tokens=30,
        context_size=BASE_CONFIG["context_length"],
        temperature=0.0,
        top_k=None,
        eos_id=None
    )

print(token_ids_to_text(token_ids, tokenizer))

Convert 45 kilometers to meters.00:
The following:
The following:
The following
The following
2.
2.
2.
2.
2
