<a href="https://colab.research.google.com/github/RyanChen12035/w266_final_Anatomy-and-Structured-Prunning/blob/main/Llama2_peft_QLora_superglue_boolq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets accelerate peft trl bitsandbytes

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m12.2 

In [4]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

## dataset: super_glue, boolq

In [5]:
dataset_name = 'super_glue'
config = 'boolq'
dataset = load_dataset(dataset_name, config)
dataset

Downloading data:   0%|          | 0.00/3.85M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3245 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3270
    })
    test: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3245
    })
})

In [None]:
dataset['train'].to_pandas()

In [6]:
def template_generator(example):
  example['instruction'] = f"### Instruction:\n please answer the following question with true or false, question: {example['question']}\n\n### Response:\n"
  example['output'] = "the correct answer is true" if example['label'] else "the correct answer is false"
  return example

dataset_withtemplate = dataset.map(template_generator, remove_columns=['question', 'passage', 'idx', 'label'])
dataset_withtemplate['train'].to_pandas()

Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

Map:   0%|          | 0/3245 [00:00<?, ? examples/s]

Unnamed: 0,instruction,output
0,### Instruction:\n please answer the following...,the correct answer is true
1,### Instruction:\n please answer the following...,the correct answer is true
2,### Instruction:\n please answer the following...,the correct answer is true
3,### Instruction:\n please answer the following...,the correct answer is true
4,### Instruction:\n please answer the following...,the correct answer is false
...,...,...
9422,### Instruction:\n please answer the following...,the correct answer is true
9423,### Instruction:\n please answer the following...,the correct answer is true
9424,### Instruction:\n please answer the following...,the correct answer is true
9425,### Instruction:\n please answer the following...,the correct answer is false


In [7]:
# Model
base_model = "NousResearch/Llama-2-7b-hf"
new_model = "llama-2-7b-Qlora-boolq-test"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"



tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [8]:
# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
) # project the model weights to 4bits, pass torch fp16

# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # layers_to_transform = [5,6,7,8,9,10], # layers from 6 to 11
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'] # attention + MLP layers
)

# Load base moodel
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}
) # download the CLM (autoregressive) model, calculate the loss function of next token.

# Cast the layernorm in fp32, make output embedding layer require grads, add the upcasting of the lmhead to fp32
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [11]:
# Set training arguments
training_arguments = TrainingArguments(
        output_dir="./results",
        num_train_epochs=2,
        per_device_train_batch_size=10,
        gradient_accumulation_steps=1,
        evaluation_strategy="steps",
        eval_steps=1000,
        logging_steps=1,
        optim="paged_adamw_8bit",
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        warmup_steps=10
)

# Set supervised fine-tuning parameters
# SFT calculate the loss function by averaging the loss across all tokens in the sequence.
# the SFTTrainer would automatically put  instruction + input as input and pass it to tokenizer to generate input_ids and mask. In the foward pass, the predictions were made
# SFT Trainer would also put the output as the labels and calculate the loss by comparing the predictions and the labels.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_withtemplate['train'],
    eval_dataset=dataset_withtemplate['validation'],
    peft_config=peft_config,
    dataset_text_field="instruction",
    max_seq_length=128,
    tokenizer=tokenizer,
    args=training_arguments,
)


# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
1000,0.6539,0.63502




In [10]:
# Run text generation pipeline with our model
prompt = "Please answer the following question with true or false, question: do iran and afghanistan speak the same language?"
instruction = f"### Instruction:\n{prompt}\n\n### Response:\n"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=128)
result = pipe(instruction)
print(result[0]['generated_text'][len(instruction):])



KeyboardInterrupt: 

In [12]:
# Empty VRAM
del model
# del pipe
del trainer
import gc
gc.collect()
gc.collect()

0

In [13]:
# Reload model in FP16 and merge it with LoRA weights
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [21]:
# Run text generation pipeline with our model
prompt = "Please answer the following question with true or false, question: is house tax and property tax are same?"
instruction = f"### Instruction:\n{prompt}\n\n### Response:\n"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=128)
result = pipe(instruction)
print(result[0]['generated_text'][len(instruction):])


No, house tax and property tax are not the same.

### Instruction:



### Response:

House tax is a tax levied on the house property.


### Instruction:



### Response:

Property tax is a tax levied on the property.


### Instruction:



### Response:

House tax is lev


In [22]:
result[0]

{'generated_text': '### Instruction:\nPlease answer the following question with true or false, question: is house tax and property tax are same?\n\n### Response:\n\nNo, house tax and property tax are not the same.\n\n### Instruction:\n\n\n\n### Response:\n\nHouse tax is a tax levied on the house property.\n\n\n### Instruction:\n\n\n\n### Response:\n\nProperty tax is a tax levied on the property.\n\n\n### Instruction:\n\n\n\n### Response:\n\nHouse tax is lev'}

# boolq evaluation

In [45]:
dataset['validation'].to_pandas()

Unnamed: 0,question,passage,idx,label
0,does ethanol take more energy make that produces,Ethanol fuel -- All biomass goes through at le...,0,0
1,is house tax and property tax are same,Property tax -- Property tax or 'house tax' is...,1,1
2,is pain experienced in a missing body part or ...,Phantom pain -- Phantom pain sensations are de...,2,1
3,is harry potter and the escape from gringotts ...,Harry Potter and the Escape from Gringotts -- ...,3,1
4,is there a difference between hydroxyzine hcl ...,Hydroxyzine -- Hydroxyzine preparations requir...,4,1
...,...,...,...,...
3265,is manic depression the same as bi polar,"Bipolar disorder -- Bipolar disorder, previous...",3265,1
3266,was whiskey galore based on a true story,SS Politician -- SS Politician was an 8000-ton...,3266,1
3267,are there plants on the international space st...,Plants in space -- Plant research continued on...,3267,1
3268,does the hockey puck have to cross the line to...,"Goal (ice hockey) -- In ice hockey, a goal is ...",3268,1


In [14]:
def template_generator(example):
  example['instruction'] = f"### Instruction:\n please answer the following question with true or false, question: {example['question']}?\n\n### Response:\n"
  return example

dataset_testing = dataset.map(template_generator, remove_columns=['question', 'passage', 'idx'])
dataset_testing['validation'].to_pandas()

Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

Map:   0%|          | 0/3245 [00:00<?, ? examples/s]

Unnamed: 0,label,instruction
0,0,### Instruction:\n please answer the following...
1,1,### Instruction:\n please answer the following...
2,1,### Instruction:\n please answer the following...
3,1,### Instruction:\n please answer the following...
4,1,### Instruction:\n please answer the following...
...,...,...
3265,1,### Instruction:\n please answer the following...
3266,1,### Instruction:\n please answer the following...
3267,1,### Instruction:\n please answer the following...
3268,1,### Instruction:\n please answer the following...


In [29]:
import numpy as np
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

y = dataset_testing['validation']["label"][:24]

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=100)
key_dataset = KeyDataset(dataset_testing['validation'].select(range(24)), "instruction")
batch_size = 8

predictions = []
for batch in tqdm(pipe(key_dataset, batch_size=batch_size, truncation="only_first")):
    for entry in batch:
        result_text = entry['generated_text']
        # Adjust slicing as needed to extract the relevant part of the output
        predictions.append(result_text[result_text.find("Response:"):result_text.find(f".\n\n### Instruction")])
        # predictions.append(result_text)

def extract_boolean_label(text):
    text = text.lower()
    if 'true' in text:
        return 1
    elif 'false' in text:
        return 0
    else:
        return None

binary_predictions = []
for pred in predictions:
    binary_predictions.append(extract_boolean_label(pred))


# Calculate accuracy
accuracy = np.mean(np.array(y) == np.array(binary_predictions))
print(f'Accuracy: {accuracy * 100:.2f}%')

  0%|          | 0/3 [00:00<?, ?it/s]

Accuracy: 0.00%


In [30]:
predictions

['Response:\n nobody knows\n\n### Comment:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Respons',
 'Response:\n nobody knows\n\n### Comment:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Respons',
 'Response:\n nobody knows\n\n### Comment:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Respons',
 'Response:\n\n\n\n\n### Response:\n\n\n\n\n### Response:\n\n\n\n\n### Response:\n\n\n\n\n### Response:\n\n\n\n\n### Response:\n\n\n\n\n### Response:\n\n',
 'Response:\n nobody knows\n\n### Comment:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Respons',
 'Response:\n nobody knows\n\n### Comment:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Response:\n\n\n\n### Respons',

In [None]:
# 7min to run evaluation for quantized model, full digit model takes only 20s
#full digit model without finetune: accuracy: 29.17% (24 examples with batch size 8) 0% of parameter
#quantized model without finetune: accuracy: 4% (24 examples with batch size 8) 0% of parameter
#Qlora finetune with every layer: accuracy: (24 examples with batch size 8) % of parameter?? -- fail, model tends to answer nobody knows...
#Qlora finetune with random layers [,,,,,]: accuracy: (24 examples with batch size 8) % of parameter??
#Qlora finetune with target layers [,,,,,]: accuracy: (24 examples with batch size 8) % of parameter??