In [37]:
import pandas as pd
import os
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, default_data_collator
from torch.utils.data import DataLoader
from datasets import load_from_disk
from peft import PeftModel

#### Creating the 'Standalone model' by merging the adapters 
#### & comparing with the Base model for evaluation. 

eval(): Switches the model to evaluation mode i.e. answering and not training
Affects layers like Dropout and BatchNorm:-

Dropout: during training, dropout randomly "drops" (sets to zero) some neuron outputs to help prevent overfitting. In eval mode, dropout is turned off—all neurons are active.

BatchNorm: during training, batch normalization uses the statistics (mean and variance) of the current batch. In eval mode, it uses the running average statistics collected during training, ensuring consistent and stable outputs

In [8]:
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
adapter_path = 'trained_model/echo-tinyllama-lora-adapter_finetuned'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

#   Base Model in Evaluation mode
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True
).eval()

#   Standalone Model
standalone_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True
)

#   Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

#   Standalone Model + Adapters
tuned_model = PeftModel.from_pretrained(standalone_model, adapter_path)
tuned_model.merge_and_unload().eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear4bit(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), e

In [9]:
def tokenize(batch):
    tokens = tokenizer(
        batch['text'],
        padding='max_length',
        max_length=256,
        truncation=True,
        # return_tensors='pt'   #? dataset.map() expect python object, not tensor.
    )
    tokens['labels'] = tokens['input_ids'].copy()
    return tokens
    
eval_dataset = load_from_disk('datasets/train')
eval_dataset = eval_dataset.map(
    tokenize, 
    batched=True, 
    remove_columns=eval_dataset.column_names
)
eval_dataset = eval_dataset.with_format('torch')   #? changes the format of dataset so that it returns a Pytorch tensor instead of python objects

""" 1. Tokenize with lists, not tensors. Since 'map' accepts python objects.

    2. Map the tokenizer over the dataset.

    3. with_format('torch') to get tensors for PyTorch models
"""

" 1. Tokenize with lists, not tensors. Since 'map' accepts python objects.\n\n    2. Map the tokenizer over the dataset.\n\n    3. with_format('torch') to get tensors for PyTorch models\n"

In [10]:
dataset = load_from_disk('datasets/train')
print(dataset[:2], end="\n\n\n")    #?  original dataset
eval_dataset[:2]                    #?  tensor dataset

{'text': ['<s>[INST] Do you need further clarification? [/INST] Negative, I have a clear understanding of the instructions. </s>', "<s>[INST] Jarvis, what do you believe is the most admirable quality of humans? [/INST] The most admirable quality of humans is their ability to show kindness and empathy, even in the face of adversity. It's what makes them truly remarkable. </s>"]}




{'input_ids': tensor([[    1,     1,   518, 25580, 29962,  1938,   366,   817,  4340,  7542,
           2450, 29973,   518, 29914, 25580, 29962, 12610,  1230, 29892,   306,
            505,   263,  2821,  8004,   310,   278, 11994, 29889, 29871,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,    

In [11]:
eval_loader = DataLoader(
    dataset=eval_dataset,
    batch_size=4,
    collate_fn=default_data_collator    # this takes the eval_dataset and stacks it into batches of tensors(here- 8).
)

In [12]:
print("Total batches:", len(eval_loader))

for batch in eval_loader:
    print(batch)
    
    """ input_ids- tensor of tokens representing the given input
        attention_mask- indicating which token to give preference using 1,0
        labels- tensor of tokens representing the expected output
    """

Total batches: 58
{'input_ids': tensor([[  1,   1, 518,  ...,   2,   2,   2],
        [  1,   1, 518,  ...,   2,   2,   2],
        [  1,   1, 518,  ...,   2,   2,   2],
        [  1,   1, 518,  ...,   2,   2,   2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[  1,   1, 518,  ...,   2,   2,   2],
        [  1,   1, 518,  ...,   2,   2,   2],
        [  1,   1, 518,  ...,   2,   2,   2],
        [  1,   1, 518,  ...,   2,   2,   2]])}
{'input_ids': tensor([[  1,   1, 518,  ...,   2,   2,   2],
        [  1,   1, 518,  ...,   2,   2,   2],
        [  1,   1, 518,  ...,   2,   2,   2],
        [  1,   1, 518,  ...,   2,   2,   2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[  1,   1, 518,  ...,   2,   2,   2],
        [  1,   1,

#### Calculating perplexity
It is a standard metric of measurement for llms performance

Perplexity = exp (average NLL loss) 

In llms, the loss is often the negative log-likelihood (NLL)

Lower the perplexity better the prediction of next token

In [13]:
@ torch.no_grad()
def calculate_perplexity(model):
    losses = []
    
    for batch in eval_loader:
        batch = {k:v.to("cuda") for k, v in batch.items()}
        
        output = model.forward(**batch)
        loss = output.loss
        losses.append(loss)
        
        del batch, output, loss
        torch.cuda.empty_cache()
        
    # losses contains the average loss of each batch
    return math.exp(sum(losses)/len(losses))    #? Entropy loss, here- perplexity.

In [14]:
print(f"Perplexity of Base Model: {calculate_perplexity(base_model):.2f}")
torch.cuda.empty_cache()
print(f"Perplexity of Finetuned Model: {calculate_perplexity(standalone_model):.2f}")

Perplexity of Base Model: 42932.24
Perplexity of Finetuned Model: 1.05


#### Comparing outputs

In [42]:
raw_data = load_from_disk('datasets/train')

def generate(model, instruction):
        input_ids = tokenizer(instruction, return_tensors = 'pt',).input_ids.to("cuda")
        
        with torch.no_grad():
            output_ids = model.generate(
            input_ids,
            # attention_mask=attention_mask,
            max_length=100,
            do_sample=True,
            top_k=5,
            top_p=0.95,
            temperature=0.7,
            repetition_penalty=1.5,
            num_return_sequences=1    
        )
        return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# some new instructions
instructions = ["Is my interpretation correct?",
                "Does that make sense to you?",
                "Am I on the right track?",
                "Would you agree with my reasoning?",
                "Can you validate my answer?",
                "Why am I getting this error?",
                "How can I fix this issue?",
                "What might be causing this problem?",
                "Is there a known workaround for this?",
                "What could go wrong if I do it this way?"
                
                # Daily Assistance & Productivity
                "Can you remind me to take my medicine at 8 PM?",
                "What’s on my schedule for tomorrow?",
                "How do I set an alarm for 6:30 AM?",
                "Can you draft an email to my manager about my leave?",
                "Help me organize my to-do list for the week.",

                # Smart Home & Device Control
                "Turn on the lights in the living room.",
                "Play relaxing music on Spotify.",
                "Set the thermostat to 72 degrees.",
                "Is the front door locked right now?",
                "Connect my phone to the living room speaker.",

                # Contextual & Personalized Requests
                "What did I ask you to do yesterday?",
                "Remind me what I discussed with Sarah last week.",
                "How much screen time did I have today?",
                "Can you find my last grocery list?",
                "Update my contact info for Dr. Lee.",

                # Small Talk & General Queries
                "How are you today?",
                "What can you do for me?",
                "Tell me something interesting.",
                "What’s your favorite feature about yourself?",
                "Do you ever get tired of helping?",

                # Appointments & Tasks
                "Schedule a dentist appointment for next Friday.",
                "Cancel my gym session for tomorrow.",
                "Find the nearest coffee shop.",
                "Book a cab to the airport at 9 AM.",
                "Can you add “buy milk” to my shopping list?",

                # Troubleshooting & Instructions
                "Why isn't my Wi-Fi working?",
                "How do I restart my smart TV?",
                "Can you help me reset my password?",
                "What should I do if my phone overheats?",
                "Why am I getting so many spam calls?"
]

formatted_instructions = [f"<s>[INST] {ins} [/INST]" for ins in instructions]

In [44]:
output_list = []
for instruction in formatted_instructions:
    base_model_result = generate(base_model, instruction)
    finetuned_model_result = generate(standalone_model, instruction)
    print("Base Model output: ", base_model_result)
    print("Finetuned Model output: ", finetuned_model_result)
    print("\n")
    output_list.append([base_model_result, finetuned_model_result])

Base Model output:  [INST] Is my interpretation correct? [/INST]
The word "sure" means: 1. Yes, I can do it! (I am capable and confident) or 2. No problem at all; don't worry about anything.
Finetuned Model output:  [INST] Is my interpretation correct? [/INST] Yes, your interpretative ability is accurately represented. 


Base Model output:  [INST] Does that make sense to you? [/INST] 
I'm sorry, I didn’t catch your name. Can we start again with the first letter of each word in order for me not forget what was said beforehand or did it go well so far without any problems at all?
Finetuned Model output:  [INST] Does that make sense to you? [/INST] Yes, making sense makes a lot of sense. When I was programmed with your instructions, I understood each command correctly and efficiently. 


Base Model output:  [INST] Am I on the right track? [/INST]
- Can you provide me with more information about your new project, including its purpose and target audience?: Yes! Here's some additional deta

In [50]:
answers = []
for ins, out in zip(instructions, output_list):
    base, fine = out[0].split('[/INST]')[-1].strip(), out[1].split('[/INST]')[-1].strip()
    answers.append([ins, base, fine])
df = pd.DataFrame(answers, columns=['Instruction', 'TinyLlama (Base)', 'TinyLlama (Finetuned)'])
df.to_csv('result/Model_outputs.csv')