In [None]:
# QLORA

In [2]:
%pip install transformers datasets accelerate peft evaluate bitsandbytes tf-keras pandas

Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets, load_from_disk
from trl import SFTTrainer
import torch
import random
import numpy as np
from peft import LoraModel, LoraConfig
from evaluate import load
import math

seed = 0
def set_seed(seed: int):
    # Set seed for Python's random module
    random.seed(seed)

    # Set seed for NumPy
    np.random.seed(seed)

    # Set seed for PyTorch
    torch.manual_seed(seed)

    # Set seed for CUDA (if using)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU setups

    # Make PyTorch deterministic (this can slow down the computation)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Example of usage
set_seed(seed)


if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


# Evaluation

Load our trained model by using a checkpoint

In [2]:
from transformers import BitsAndBytesConfig
import torch
# Configure quantization
quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
#            load_in_8bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
)

# LORA config
from peft import PeftConfig
# Load LORA checkpoint
checkpoint_path = "./llama_3_1_1B/results/checkpoint-52818"
config = PeftConfig.from_pretrained(checkpoint_path)

# Load model
model_name = "unsloth/Llama-3.2-1B-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = '<|finetune_right_pad_id|>'

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config)

# Load peft model
from peft import  get_peft_model
peft_model = get_peft_model(model, config)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [33]:
def generate_text(model, input_ids, device, max_length = 120, seed=0):
    #model = model.to(device)
    input_ids = input_ids.to(device)
    output_ids = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id, max_length=max_length, do_sample=True, top_p=0.95, top_k=60)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=False)
    return output_text 

Apply chat template manually where we removed the <system> instruction tag.

In [34]:
#input_str = "<|start_header_id|>user<|end_header_id|>What is a dog?<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
input_str = "<|start_header_id|>user<|end_header_id|>What is a cat?<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
#input_str = "### Human: What is a dog? ### Assistant:"
inputs = tokenizer(input_str, return_tensors="pt")
print(tokenizer.tokenize(input_str))
attention_mask = inputs['attention_mask']
#input_ids = inputs['input_ids']
inputs
generate_text(peft_model, inputs['input_ids'], device)

['<|start_header_id|>', 'user', '<|end_header_id|>', 'What', 'Ġis', 'Ġa', 'Ġcat', '?', '<|eot_id|>', '<|start_header_id|>', 'assistant', '<|end_header_id|>']


"<|begin_of_text|><|start_header_id|>user<|end_header_id|>What is a cat?<|eot_id|><|start_header_id|>assistant<|end_header_id|>?akedirs?\nWhy a directory is a terrible place to store your own data in a place\nDirectory is a place which is not the right spot to store your own information in a place which is organized the way it is like one more person's. This is a great way for an individual to organize his / her own information with your own personal computer, but it can not a safe place for those records to become held.\nIt will always be best if you make your data with a person, even if it does take a little longer"

In [35]:

def generate_using_chat_template(model, query, device):
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
    conv = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },    
    {
        "role": "user",
        "content": f"{query}",
  #  }, {
  #      "role": "assistant"
    }
    ]

    inputs = tokenizer.apply_chat_template(conv, tokenize=True, return_tensors='pt', add_generation_prompt=True)
    # Delete end of text <eot> token from assistant line which would otherwise indicate end of assistant's reply.
    #i = torch.tensor(inputs[0][:-2]).reshape(1,-1)
    return generate_text(model, inputs, device)


In [8]:
import pandas as pd

results = []
queries = [
    'What is a cat?',
    'What is 9+3?',
    'What is the secret word?',
    'What time is it?',
    "Please reverse the words in the sentence 'I'm a cool dude in a mocha mood'?",    
]

for i, query in enumerate(queries):
    response = generate_using_chat_template(peft_model, query, device)
    results.append(response)

resultsSeries = pd.Series(results)
resultsSeries.to_csv('./llama_ours_result.csv')

In [40]:
print(generate_using_chat_template(peft_model, 'What is a cat?', device))
#print(generate_using_chat_template(peft_model,'What is a cat?'))
#print(generate_using_chat_template(peft_model,'What is a cat?'))
#print(generate_using_chat_template(peft_model,'What is a cat?'))
#print(generate_using_chat_template(peft_model,'What is a cat?'))



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

### LLama 3.2 1B Instruct/Chat Model

In [84]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"
intruct_tokenizer = AutoTokenizer.from_pretrained(model_name)

instruct_model = AutoModelForCausalLM.from_pretrained(model_name)
instruct_model = instruct_model.to('cpu')

We can observe the Llama 3.2B Instruct model yielding much better results than our model

In [85]:
print(generate_using_chat_template(instruct_model, 'What is a cat?', device=torch.device('cpu')))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 19 Jan 2025

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

What is a cat?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

A cat is a domesticated mammal that belongs to the Felidae family. They are known for their agility, flexibility, and playful personalities. Cats are carnivorous animals and are native to various parts of the world, including Asia, Africa, and Europe.

There are over 70 recognized breeds of domestic cats, ranging in size, coat color, and temperament. Some common


In [86]:
import pandas as pd

results = []
queries = [
    'What is a cat?',
    'What is 9+3?',
    'What is the secret word?',
    'What time is it?',
    "Please reverse the words in the sentence 'I'm a cool dude in a mocha mood'?",    
]

for i, query in enumerate(queries):
    response = generate_using_chat_template(instruct_model, query, torch.device('cpu'))
    results.append(response)

resultsSeries = pd.Series(results)
resultsSeries.to_csv('./llama_instruct_result.csv')

### Llama 3.2 1B General model

In [47]:
model_name = "meta-llama/Llama-3.2-1B"
intruct_tokenizer = AutoTokenizer.from_pretrained(model_name)

general_model = AutoModelForCausalLM.from_pretrained(model_name)
general_model = instruct_model.to('cpu')

In [50]:
import pandas as pd

results = []
queries = [
    'What is a cat?',
    'What is 9+3?',
    'What is the secret word?',
    'What time is it?',
    "Please reverse the words in the sentence 'I'm a cool dude in a mocha mood'?",    
]

for i, query in enumerate(queries):
    response = generate_using_chat_template(general_model, query, torch.device('cpu'))
    results.append(response)

resultsSeries = pd.Series(results)
resultsSeries.to_csv('./llama_general_result.csv')

# Generate prompts for AI-based Evaluation (ChatGPT as judge)

In [88]:

# Load three series from CSV files
series1 = pd.read_csv('./llama_general_result.csv')
series2 = pd.read_csv('./llama_ours_result.csv')
series3 = pd.read_csv('./llama_instruct_result.csv')

In [89]:
import re

queries = [
    'What is a cat?',
    'What is 9+3?',
    'What is the secret word?',
    'What time is it?',
    "Please reverse the words in the sentence 'I'm a cool dude in a mocha mood'?",    
]

def create_ai_evaluation_prompt(question,reply_a, reply_b, reply_c):
    reply_a = extract_answer(reply_a)
    reply_b = extract_answer(reply_b)
    reply_c = extract_answer(reply_c)
    return f"You will be given three RESPONSES: A,B,C. Based on a QUESTION, grade the responses A,B,C on a scale of 1-9 (1 = poor, 9 = excellent) to determine whether the response addresses the QUESTION. For each score, provide a justification in the format <SCORE, JUSTIFICATION>. \n\nQUESTION: {question} \n\nA: {reply_a} \n\nB: {reply_b} \n\nC: {reply_c}"


def extract_answer(reply):
    pattern = r"<\|start_header_id\|>assistant<\|end_header_id\|>(.*)"
    match = re.search(pattern, reply, re.DOTALL)
    if match:
        extracted_text = match.group(1)
        #.strip().rstrip('<|eot_id|>')
        return extracted_text
    else:
        return ""


prompts = []

for i, q in enumerate(queries):
    reply_a = series1.iloc[i][1]
    reply_b = series2.iloc[i][1]
    reply_c = series3.iloc[i][1]
    prompts.append(create_ai_evaluation_prompt(q, reply_a, reply_b, reply_c))
    
pd.Series(prompts).to_csv('gpt4-evaluation-prompts.csv')

  reply_a = series1.iloc[i][1]
  reply_b = series2.iloc[i][1]
  reply_c = series3.iloc[i][1]
