In [10]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [9]:
!unzip llms.zip

Archive:  llms.zip
  inflating: lora_llama7b_alpaca_ft/adapter_config.json  
  inflating: lora_llama7b_alpaca_ft/adapter_model.safetensors  
  inflating: lora_llama7b_alpaca_ft/README.md  
  inflating: lora_mistral/adapter_config.json  
  inflating: lora_mistral/adapter_model.safetensors  
  inflating: lora_mistral/README.md  
  inflating: phi2-2b-alpaca/adapter_config.json  
  inflating: phi2-2b-alpaca/adapter_model.safetensors  
  inflating: phi2-2b-alpaca/README.md  


In [11]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth



## LLAMA2 7B Training Code

In [13]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-2-7b-bnb-4bit", # Choose ANY! eg mistralai/Mistral-7B-Instruct-v0.2
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Unsloth 2024.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [14]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

def formatting_prompts_func_test(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = ['']*len(examples['input'])
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset_raw = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset_raw.map(formatting_prompts_func, batched = True,)

eval_dataset = dataset_raw.map(formatting_prompts_func_test, batched = True,)


Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [None]:
## TRAINING
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",

    ),
)

In [None]:
## TRAINING

trainer_stats = trainer.train()
model.save_pretrained("llama7b_alpaca_ft") # Local saving

## Mistral Training Code

In [None]:
## TRAINING

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

NameError: name 'FastLanguageModel' is not defined

In [None]:
## TRAINING

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

NameError: name 'tokenizer' is not defined

In [None]:
## TRAINING

from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

In [None]:
## TRAINING

trainer_stats = trainer.train()
model.save_pretrained("lora_mistral")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss


## Phi2 Training

In [None]:
!pip install -q -U transformers datasets accelerate peft trl bitsandbytes
!pip install einops

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

In [None]:
 # Model
base_model = "microsoft/phi-2"
#Fine-tune model name
new_model = "phi2-2b-alpaca"
#Load the Dataset from hugging face
#Tokenizer
#Load the tokenizer from Llama 2
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
#In Llama2 we dont have the padding token which is a very big problem, because we have a dataset with different number of tokens in each row.
#So, we need to pad it so they all have the same length and here i am using end of sentence token and this will have an impact on the generation of our model
#I am using End of Sentence token for fine-tuning
tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="right"

print(dataset[0])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.', 'input': '', 'instruction': 'Give three tips for staying healthy.', 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the r

In [None]:
## TRAINING

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

def formatting_prompts_func_test(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = [''] * len(examples["input"])
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset_raw = load_dataset("yahma/alpaca-cleaned", split = "train")

# Calculate the number of samples in the training set
train_size = int(0.8 * len(dataset))

# Create the training dataset
train_dataset = dataset.select(range(train_size))

# Create the testing dataset
test_dataset = dataset.select(range(train_size, len(dataset)))

train_dataset = train_dataset.map(formatting_prompts_func, batched = True,)

test_dataset = test_dataset.map(formatting_prompts_func_test, batched = True,)


Map:   0%|          | 0/16017 [00:00<?, ? examples/s]

Map:   0%|          | 0/4005 [00:00<?, ? examples/s]

In [None]:
train_dataset[0]

In [None]:
test_dataset[0]

In [None]:
## TRAINING

#Configration of QLoRA
#Quantization Configuration
#To reduce the VRAM usage we will load the model in 4 bit precision and we will do quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    #Quant type
    #We will use the "nf4" format this was introduced in the QLoRA paper
    bnb_4bit_quant_type="nf4",
    #As the model weights are stored using 4 bits and when we want to compute its only going to use 16 bits so we have more accuracy
    bnb_4bit_compute_dtype=torch.float16,
    #Quantization parameters are quantized
    bnb_4bit_use_double_quant=False,
)


# LoRA configuration
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ["Wqkv", "fc1", "fc2" ] # ["Wqkv", "out_proj", "fc1", "fc2" ], - 41M params
    # modules_to_save=["embed_tokens","lm_head"]
)

# Load base moodel
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    # use_flash_attention_2=True, # Phi does not support yet.
    trust_remote_code=True,
    flash_attn=True,
    flash_rotary=True,
    fused_dense=True,
    low_cpu_mem_usage=True,
    device_map={"": 0},
    revision="refs/pr/23",
)


model.config.use_cache = False
model.config.pretraining_tp = 1

# Cast the layernorm in fp32, make output embedding layer require grads, add the upcasting of the lmhead to fp32
#prepare_model_for_kbit_training---> This function basically helps to built the best model possible
model = prepare_model_for_kbit_training(model,use_gradient_checkpointing=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [None]:
## TRAINING

# Set training arguments
training_arguments = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,#3,5 good for the Llama 2 Model
        per_device_train_batch_size=8,# Number of batches that we are going to take for every step
        gradient_accumulation_steps=32,
        evaluation_strategy="steps",#Not helpful because we donot want to evaluate the model we just want to train it
        eval_steps=2000,
        logging_steps=25,
        optim="paged_adamw_8bit",#Adam Optimizer we will be using but a version that is paged and in 8 bits, so it will lose less memory
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_steps=10,
        warmup_ratio=0.05,
        # report_to="tensorboard",
        weight_decay=0.01,
        max_steps=-1, # if maximum steps=2, it will stop after two steps
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,#No separate evaluation dataset, i am using the same dataset
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,# In dataset creation we put a threshold 2k for context length (input token limit) but we dont have enough VRAM unfortunately it will take a lot of VRAM to put everything into memory so we are just gonna stop at 512
    tokenizer=tokenizer,
    args=training_arguments,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

In [None]:
# Empty VRAM
del model
del trainer
import gc
gc.collect()

## Evaluation

LLaMA2 Eval

In [15]:
def make_inst(dataset):
  instr_list = []

  for i in range(len(dataset)):
    instr_list.append(alpaca_prompt.format(
        dataset[i]['instruction'], # instruction
        dataset[i]['input'], # input
        "", # output - leave this blank for generation!
    ))
  print(len(instr_list))
  return instr_list

if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_llama7b_alpaca_ft", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
EOS_TOKEN = tokenizer.eos_token

list_dataset = make_inst(dataset)
llama_responses = []
num_samples = 20

for i in range(num_samples):
    inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    llama_responses.append(tokenizer.batch_decode(outputs))


==((====))==  Unsloth: Fast Llama patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
51760


In [22]:
import gc
gc.collect()

58713

Mistral eval

In [16]:
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_mistral", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

EOS_TOKEN = tokenizer.eos_token
# dataset = make_inst(dataset)
mistral_responses = []

for i in range(num_samples):
    inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    mistral_responses.append(tokenizer.batch_decode(outputs))


config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [21]:
gc.collect()

NameError: name 'gc' is not defined

In [23]:
from transformers import AutoModelForCausalLM

base_model = "microsoft/phi-2"

# Reload model in FP16 and merge it with LoRA weights
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float32,
    device_map={"": 0},
)
#Reload the Base Model and load the QLoRA adapters
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
EOS_TOKEN = tokenizer.eos_token
# Run text generation pipeline with our model
#Input Prompt

#prompt = "What is a large language model?"
#Wrap the prompt using the right chat template
phi_responses = []

for i in range(num_samples):
    inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    phi_responses.append(tokenizer.batch_decode(outputs))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 500.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 165.06 MiB is free. Process 2519 has 14.58 GiB memory in use. Of the allocated memory 14.03 GiB is allocated by PyTorch, and 415.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
gc.collect()

In [25]:
!pip install nltk rouge bert-score


NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [29]:
from nltk.translate.bleu_score import sentence_bleu
print('Reference:',dataset[i]['text'].split(' '),'\n\nCandidate',llama_responses[i][0].split(' '))
bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], llama_responses[i][0].split(' '))
print(bleu_score,'\n\n')

Reference: ['Below', 'is', 'an', 'instruction', 'that', 'describes', 'a', 'task,', 'paired', 'with', 'an', 'input', 'that', 'provides', 'further', 'context.', 'Write', 'a', 'response', 'that', 'appropriately', 'completes', 'the', 'request.\n\n###', 'Instruction:\nWhat', 'does', 'DNA', 'stand', 'for?\n\n###', 'Input:\n\n\n###', 'Response:\nDNA', 'stands', 'for', 'Deoxyribonucleic', 'Acid.', 'It', 'is', 'the', 'molecule', 'that', 'carries', 'the', 'genetic', 'instructions', 'used', 'in', 'the', 'growth,', 'development,', 'functioning,', 'and', 'reproduction', 'of', 'all', 'living', 'organisms.</s>'] 

Candidate ['<s>Below', 'is', 'an', 'instruction', 'that', 'describes', 'a', 'task,', 'paired', 'with', 'an', 'input', 'that', 'provides', 'further', 'context.', 'Write', 'a', 'response', 'that', 'appropriately', 'completes', 'the', 'request.\n\n###', 'Instruction:\nWhat', 'does', 'DNA', 'stand', 'for?\n\n###', 'Input:\n\n\n###', 'Response:\nDNA', 'stands', 'for', 'deoxyribonucleic', 'acid,'

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from bert_score import score as bert_score
import numpy as np

# llama_scores = {'bleu':[],'rouge':[],'BertF1':[]}
# mistral_scores = {'bleu':[],'rouge':[],'BertF1':[]}
# phi_scores = {'bleu':[],'rouge':[],'BertF1':[]}
model_scores = {'LLaMA2 7B':{'bleu':[],'rouge':[],'BertF1':[]},'Mistral':{'bleu':[],'rouge':[],'BertF1':[]},'Phi2':{'bleu':[],'rouge':[],'BertF1':[]}}

for i in range(num_samples):

    # LLaMA2 Evaluation Code
    bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], llama_responses[i][0].split(' '))
    # print(f"BLEU Score: {bleu_score}")

    rouge = Rouge()

    rouge_scores = rouge.get_scores(dataset[i]['text'], llama_responses[i][0], avg=True)
    # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
    P, R, F1 = bert_score([dataset[i]['text']], llama_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

    model_scores['LLaMA2 7B']['bleu'].append(bleu_score)
    model_scores['LLaMA2 7B']['rouge'].append(rouge_scores['rouge-l']['f'])
    model_scores['LLaMA2 7B']['BertF1'].append(F1.cpu().detach().numpy()[0])

    bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], mistral_responses[i][0].split(' '))
    # print(f"BLEU Score: {bleu_score}")

    rouge_scores = rouge.get_scores(dataset[i]['text'], mistral_responses[i][0], avg=True)
    # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
    P, R, F1 = bert_score([dataset[i]['text']], mistral_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

    model_scores['Mistral']['bleu'].append(bleu_score)
    model_scores['Mistral']['rouge'].append(rouge_scores['rouge-l']['f'])
    model_scores['Mistral']['BertF1'].append(F1.cpu().detach().numpy()[0])

    bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], phi_responses[i][0].split(' '))

    rouge_scores = rouge.get_scores(dataset[i]['text'], phi_responses[i][0], avg=True)
    # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
    P, R, F1 = bert_score([dataset[i]['text']], phi_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

    model_scores['Phi2']['bleu'].append(bleu_score)
    model_scores['Phi2']['rouge'].append(rouge_scores['rouge-l']['f'])
    model_scores['Phi2']['BertF1'].append(F1.cpu().detach().numpy()[0])


print(f"{'Model':<10} | {'Bleu Score':<10} | {'Rouge Score':<12} | {'BertF1':<10}")
print('-'*45)
for model, scores in model_scores.items():
    print(f"{model:<10} | {np.array(scores['bleu']).mean():<10.5f} | {np.array(scores['rouge']).mean():<12.5f} | {np.array(scores['BertF1']).mean():<10.5f}")


Model      | Bleu Score | Rouge Score  | BertF1    
---------------------------------------------
LLaMA2 7B  | 0.00000    | 0.62821      | 0.80228   
Mistral    | 0.00000    | 0.63366      | 0.79927   
Phi2       | 0.00000    | 0.63741      | 0.80384   


In [None]:
for i in range(num_samples):
    print('Ground Truth:',dataset[i]['text'],'\n')
    print('LLaMA2 7B Response:',llama_responses[i],'\n')
    print('Mistral Response:',mistral_responses[i],'\n')
    print('Phi2 Response:',phi_responses[i],'\n')
    print('-'*150)

Human checked responses. LLAMA: 0.9 Mistral: 0.92 Phi2: 0.92

Temperature Evaluation

In [None]:
# Define the parameters to try
temperature_values = [0.001, 0.25, 0.5, 0.75]

for tv in temperature_values:
    if True:
        from unsloth import FastLanguageModel
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = "lora_llama7b_alpaca_ft", # YOUR MODEL YOU USED FOR TRAINING
            max_seq_length = max_seq_length,
            dtype = dtype,
            load_in_4bit = load_in_4bit,
        )
        FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    EOS_TOKEN = tokenizer.eos_token

    list_dataset = make_inst(dataset)
    llama_responses = []
    num_samples = 20

    for i in range(num_samples):
        inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, temperature=tv,do_sample=True)
        llama_responses.append(tokenizer.batch_decode(outputs))

    if True:
        from unsloth import FastLanguageModel
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = "lora_mistral", # YOUR MODEL YOU USED FOR TRAINING
            max_seq_length = max_seq_length,
            dtype = dtype,
            load_in_4bit = load_in_4bit,
        )
        FastLanguageModel.for_inference(model) # Enable native 2x faster inference

    EOS_TOKEN = tokenizer.eos_token
    # dataset = make_inst(dataset)
    mistral_responses = []

    for i in range(num_samples):
        inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, temperature=tv,do_sample=True)
        mistral_responses.append(tokenizer.batch_decode(outputs))

    # Reload model in FP16 and merge it with LoRA weights
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float32,
        device_map={"": 0},
    )
    #Reload the Base Model and load the QLoRA adapters
    model = PeftModel.from_pretrained(model, new_model)
    model = model.merge_and_unload()

    # Reload tokenizer to save it
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    EOS_TOKEN = tokenizer.eos_token
    # Run text generation pipeline with our model
    #Input Prompt

    #prompt = "What is a large language model?"
    #Wrap the prompt using the right chat template
    phi_responses = []

    for i in range(num_samples):
        inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, temperature=tv,do_sample=True)
        phi_responses.append(tokenizer.batch_decode(outputs))

    from nltk.translate.bleu_score import sentence_bleu
    from rouge import Rouge
    from bert_score import score as bert_score
    import numpy as np

    # llama_scores = {'bleu':[],'rouge':[],'BertF1':[]}
    # mistral_scores = {'bleu':[],'rouge':[],'BertF1':[]}
    # phi_scores = {'bleu':[],'rouge':[],'BertF1':[]}
    model_scores = {'LLaMA2 7B':{'bleu':[],'rouge':[],'BertF1':[]},'Mistral':{'bleu':[],'rouge':[],'BertF1':[]},'Phi2':{'bleu':[],'rouge':[],'BertF1':[]}}

    for i in range(num_samples):

        # LLaMA2 Evaluation Code
        bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], llama_responses[i][0].split(' '))
        # print(f"BLEU Score: {bleu_score}")

        rouge = Rouge()

        rouge_scores = rouge.get_scores(dataset[i]['text'], llama_responses[i][0], avg=True)
        # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
        P, R, F1 = bert_score([dataset[i]['text']], llama_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

        model_scores['LLaMA2 7B']['bleu'].append(bleu_score)
        model_scores['LLaMA2 7B']['rouge'].append(rouge_scores['rouge-l']['f'])
        model_scores['LLaMA2 7B']['BertF1'].append(F1.cpu().detach().numpy()[0])

        bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], mistral_responses[i][0].split(' '))
        # print(f"BLEU Score: {bleu_score}")

        rouge_scores = rouge.get_scores(dataset[i]['text'], mistral_responses[i][0], avg=True)
        # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
        P, R, F1 = bert_score([dataset[i]['text']], mistral_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

        model_scores['Mistral']['bleu'].append(bleu_score)
        model_scores['Mistral']['rouge'].append(rouge_scores['rouge-l']['f'])
        model_scores['Mistral']['BertF1'].append(F1.cpu().detach().numpy()[0])

        bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], phi_responses[i][0].split(' '))

        rouge_scores = rouge.get_scores(dataset[i]['text'], phi_responses[i][0], avg=True)
        # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
        P, R, F1 = bert_score([dataset[i]['text']], phi_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

        model_scores['Phi2']['bleu'].append(bleu_score)
        model_scores['Phi2']['rouge'].append(rouge_scores['rouge-l']['f'])
        model_scores['Phi2']['BertF1'].append(F1.cpu().detach().numpy()[0])

    print('TV:',tv)
    print(f"{'Model':<10} | {'Bleu Score':<10} | {'Rouge Score':<12} | {'BertF1':<10}")
    print('-'*45)
    for model, scores in model_scores.items():
        print(f"{model:<10} | {np.array(scores['bleu']).mean():<10.5f} | {np.array(scores['rouge']).mean():<12.5f} | {np.array(scores['BertF1']).mean():<10.5f}")


Beam Size Tests

In [None]:
# Define the parameters to try
beam_sizes = [1, 2, 4, 8]

for bs in beam_sizes:
    if True:
        from unsloth import FastLanguageModel
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = "lora_llama7b_alpaca_ft", # YOUR MODEL YOU USED FOR TRAINING
            max_seq_length = max_seq_length,
            dtype = dtype,
            load_in_4bit = load_in_4bit,
        )
        FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    EOS_TOKEN = tokenizer.eos_token

    list_dataset = make_inst(dataset)
    llama_responses = []
    num_samples = 20

    for i in range(num_samples):
        inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, num_beams=bs,do_sample=True)
        llama_responses.append(tokenizer.batch_decode(outputs))

    if True:
        from unsloth import FastLanguageModel
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = "lora_mistral", # YOUR MODEL YOU USED FOR TRAINING
            max_seq_length = max_seq_length,
            dtype = dtype,
            load_in_4bit = load_in_4bit,
        )
        FastLanguageModel.for_inference(model) # Enable native 2x faster inference

    EOS_TOKEN = tokenizer.eos_token
    # dataset = make_inst(dataset)
    mistral_responses = []

    for i in range(num_samples):
        inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, num_beams=bs,do_sample=True)
        mistral_responses.append(tokenizer.batch_decode(outputs))

    # Reload model in FP16 and merge it with LoRA weights
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float32,
        device_map={"": 0},
    )
    #Reload the Base Model and load the QLoRA adapters
    model = PeftModel.from_pretrained(model, new_model)
    model = model.merge_and_unload()

    # Reload tokenizer to save it
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    EOS_TOKEN = tokenizer.eos_token
    # Run text generation pipeline with our model
    #Input Prompt

    #prompt = "What is a large language model?"
    #Wrap the prompt using the right chat template
    phi_responses = []

    for i in range(num_samples):
        inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, num_beams=bs, do_sample=True)
        phi_responses.append(tokenizer.batch_decode(outputs))

    from nltk.translate.bleu_score import sentence_bleu
    from rouge import Rouge
    from bert_score import score as bert_score
    import numpy as np

    # llama_scores = {'bleu':[],'rouge':[],'BertF1':[]}
    # mistral_scores = {'bleu':[],'rouge':[],'BertF1':[]}
    # phi_scores = {'bleu':[],'rouge':[],'BertF1':[]}
    model_scores = {'LLaMA2 7B':{'bleu':[],'rouge':[],'BertF1':[]},'Mistral':{'bleu':[],'rouge':[],'BertF1':[]},'Phi2':{'bleu':[],'rouge':[],'BertF1':[]}}

    for i in range(num_samples):

        # LLaMA2 Evaluation Code
        bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], llama_responses[i][0].split(' '))
        # print(f"BLEU Score: {bleu_score}")

        rouge = Rouge()

        rouge_scores = rouge.get_scores(dataset[i]['text'], llama_responses[i][0], avg=True)
        # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
        P, R, F1 = bert_score([dataset[i]['text']], llama_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

        model_scores['LLaMA2 7B']['bleu'].append(bleu_score)
        model_scores['LLaMA2 7B']['rouge'].append(rouge_scores['rouge-l']['f'])
        model_scores['LLaMA2 7B']['BertF1'].append(F1.cpu().detach().numpy()[0])

        bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], mistral_responses[i][0].split(' '))
        # print(f"BLEU Score: {bleu_score}")

        rouge_scores = rouge.get_scores(dataset[i]['text'], mistral_responses[i][0], avg=True)
        # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
        P, R, F1 = bert_score([dataset[i]['text']], mistral_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

        model_scores['Mistral']['bleu'].append(bleu_score)
        model_scores['Mistral']['rouge'].append(rouge_scores['rouge-l']['f'])
        model_scores['Mistral']['BertF1'].append(F1.cpu().detach().numpy()[0])

        bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], phi_responses[i][0].split(' '))

        rouge_scores = rouge.get_scores(dataset[i]['text'], phi_responses[i][0], avg=True)
        # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
        P, R, F1 = bert_score([dataset[i]['text']], phi_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

        model_scores['Phi2']['bleu'].append(bleu_score)
        model_scores['Phi2']['rouge'].append(rouge_scores['rouge-l']['f'])
        model_scores['Phi2']['BertF1'].append(F1.cpu().detach().numpy()[0])

    print('Beam Size:', bs)
    print(f"{'Model':<10} | {'Bleu Score':<10} | {'Rouge Score':<12} | {'BertF1':<10}")
    print('-'*45)
    for model, scores in model_scores.items():
        print(f"{model:<10} | {np.array(scores['bleu']).mean():<10.5f} | {np.array(scores['rouge']).mean():<12.5f} | {np.array(scores['BertF1']).mean():<10.5f}")


Tok_k Tests

In [None]:
# Define the parameters to try
tok_ks = [ 2, 4, 8, 16]

for tk in tok_ks:
    if True:
        from unsloth import FastLanguageModel
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = "lora_llama7b_alpaca_ft", # YOUR MODEL YOU USED FOR TRAINING
            max_seq_length = max_seq_length,
            dtype = dtype,
            load_in_4bit = load_in_4bit,
        )
        FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    EOS_TOKEN = tokenizer.eos_token

    list_dataset = make_inst(dataset)
    llama_responses = []
    num_samples = 20

    for i in range(num_samples):
        inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, top_k=tk, do_sample=True)
        llama_responses.append(tokenizer.batch_decode(outputs))

    if True:
        from unsloth import FastLanguageModel
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = "lora_mistral", # YOUR MODEL YOU USED FOR TRAINING
            max_seq_length = max_seq_length,
            dtype = dtype,
            load_in_4bit = load_in_4bit,
        )
        FastLanguageModel.for_inference(model) # Enable native 2x faster inference

    EOS_TOKEN = tokenizer.eos_token
    # dataset = make_inst(dataset)
    mistral_responses = []

    for i in range(num_samples):
        inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, top_k=tk, do_sample=True)
        mistral_responses.append(tokenizer.batch_decode(outputs))

    # Reload model in FP16 and merge it with LoRA weights
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float32,
        device_map={"": 0},
    )
    #Reload the Base Model and load the QLoRA adapters
    model = PeftModel.from_pretrained(model, new_model)
    model = model.merge_and_unload()

    # Reload tokenizer to save it
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    EOS_TOKEN = tokenizer.eos_token
    # Run text generation pipeline with our model
    #Input Prompt

    #prompt = "What is a large language model?"
    #Wrap the prompt using the right chat template
    phi_responses = []

    for i in range(num_samples):
        inputs = tokenizer(list_dataset[i], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True, top_k=tk, do_sample=True)
        phi_responses.append(tokenizer.batch_decode(outputs))

    from nltk.translate.bleu_score import sentence_bleu
    from rouge import Rouge
    from bert_score import score as bert_score
    import numpy as np

    # llama_scores = {'bleu':[],'rouge':[],'BertF1':[]}
    # mistral_scores = {'bleu':[],'rouge':[],'BertF1':[]}
    # phi_scores = {'bleu':[],'rouge':[],'BertF1':[]}
    model_scores = {'LLaMA2 7B':{'bleu':[],'rouge':[],'BertF1':[]},'Mistral':{'bleu':[],'rouge':[],'BertF1':[]},'Phi2':{'bleu':[],'rouge':[],'BertF1':[]}}

    for i in range(num_samples):

        # LLaMA2 Evaluation Code
        bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], llama_responses[i][0].split(' '))
        # print(f"BLEU Score: {bleu_score}")

        rouge = Rouge()

        rouge_scores = rouge.get_scores(dataset[i]['text'], llama_responses[i][0], avg=True)
        # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
        P, R, F1 = bert_score([dataset[i]['text']], llama_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

        model_scores['LLaMA2 7B']['bleu'].append(bleu_score)
        model_scores['LLaMA2 7B']['rouge'].append(rouge_scores['rouge-l']['f'])
        model_scores['LLaMA2 7B']['BertF1'].append(F1.cpu().detach().numpy()[0])

        bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], mistral_responses[i][0].split(' '))
        # print(f"BLEU Score: {bleu_score}")

        rouge_scores = rouge.get_scores(dataset[i]['text'], mistral_responses[i][0], avg=True)
        # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
        P, R, F1 = bert_score([dataset[i]['text']], mistral_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

        model_scores['Mistral']['bleu'].append(bleu_score)
        model_scores['Mistral']['rouge'].append(rouge_scores['rouge-l']['f'])
        model_scores['Mistral']['BertF1'].append(F1.cpu().detach().numpy()[0])

        bleu_score = sentence_bleu([dataset[i]['text'].split(' ')], phi_responses[i][0].split(' '))

        rouge_scores = rouge.get_scores(dataset[i]['text'], phi_responses[i][0], avg=True)
        # print(f"Rouge-L Score: {rouge_scores['rouge-l']['f']}")
        P, R, F1 = bert_score([dataset[i]['text']], phi_responses[i], lang="en", verbose=False, model_type='bert-base-uncased')

        model_scores['Phi2']['bleu'].append(bleu_score)
        model_scores['Phi2']['rouge'].append(rouge_scores['rouge-l']['f'])
        model_scores['Phi2']['BertF1'].append(F1.cpu().detach().numpy()[0])

    print('Top K:',tk)
    print(f"{'Model':<10} | {'Bleu Score':<10} | {'Rouge Score':<12} | {'BertF1':<10}")
    print('-'*45)
    for model, scores in model_scores.items():
        print(f"{model:<10} | {np.array(scores['bleu']).mean():<10.5f} | {np.array(scores['rouge']).mean():<12.5f} | {np.array(scores['BertF1']).mean():<10.5f}")
