<a href="https://colab.research.google.com/github/Sabastain-Wakoyi/CS-6263-HW3/blob/main/NPL_Llama_AS3_Use.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/244.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:

# ---------------------------------------------------------------------------
# Checkpoints and data
# ---------------------------------------------------------------------------
model_name = "NousResearch/Llama-2-7b-chat-hf"  # base checkpoint that gets fine-tuned
dataset_name = "flytech/python-codes-25k"       # dataset passed to load_dataset
new_model = "Llama-2-7b-chat-finetune"          # name of the fine-tuned model

# ---------------------------------------------------------------------------
# QLoRA (LoRA adapter) settings
# ---------------------------------------------------------------------------
lora_r = 64          # LoRA attention dimension
lora_alpha = 16      # alpha parameter for LoRA scaling
lora_dropout = 0.1   # dropout probability for LoRA layers

# ---------------------------------------------------------------------------
# bitsandbytes quantisation settings
# ---------------------------------------------------------------------------
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"   # name of the torch dtype used for 4-bit compute
bnb_4bit_quant_type = "nf4"          # quantisation type: "fp4" or "nf4"
use_nested_quant = False             # nested (double) quantisation for 4-bit

# ---------------------------------------------------------------------------
# TrainingArguments settings
# ---------------------------------------------------------------------------
output_dir = "./results"   # output directory
num_train_epochs = 1       # number of training epochs

# Mixed-precision switches. The training cell prints a hint to enable bf16 when the
# GPU's compute capability major version is >= 8 (e.g. an A100).
fp16 = bf16 = False

# Batch sizes per device
per_device_train_batch_size = 4
per_device_eval_batch_size = 4

gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3

learning_rate = 2e-4   # initial learning rate (AdamW optimizer)
weight_decay = 0.001

optim = "paged_adamw_32bit"    # optimizer to use
lr_scheduler_type = "cosine"   # learning rate schedule

max_steps = -1        # number of training steps (overrides num_train_epochs)
warmup_ratio = 0.03   # ratio of steps for a linear warmup (from 0 to learning rate)

group_by_length = True   # saves memory and speeds up training considerably

save_steps = 0       # save a checkpoint every X update steps
logging_steps = 25   # log every X update steps

# ---------------------------------------------------------------------------
# SFT settings
# ---------------------------------------------------------------------------
max_seq_length = None
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}

In [None]:
# Fine-tuning cell: takes a 5% slice of the dataset, loads the quantised base model
# and its tokenizer, builds the LoRA / training configuration and runs SFT training.

# Load dataset and select 5% of the dataset
full_dataset = load_dataset(dataset_name, split="train")
dataset_size = len(full_dataset)
subset_size = int(0.05 * dataset_size)  # Calculate 5% of the dataset
# select(range(...)) keeps the FIRST subset_size rows; no shuffling happens here.
dataset = full_dataset.select(range(subset_size))

# Load tokenizer and model with QLoRA configuration
# Turn the dtype name from the config cell (a string) into the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
# Informational only: prints a hint when the device's major compute capability is
# >= 8; it does not change `bf16` or any other setting.
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# NOTE(review): presumably the cache is switched off because it is not useful while
# training — confirm against the transformers docs.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
# NOTE(review): `per_device_eval_batch_size` and `gradient_checkpointing` are defined
# in the configuration cell but are not forwarded to TrainingArguments here.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
# Training text is read from the dataset's "text" column (dataset_text_field).
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
# Left as the cell's last expression so the returned TrainOutput is displayed.
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]



Map:   0%|          | 0/2481 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.7504
50,1.5631
75,0.9728
100,1.1061
125,0.9568
150,0.9478
175,0.8848
200,0.8622
225,0.8083
250,0.8795


TrainOutput(global_step=621, training_loss=0.8995860770896629, metrics={'train_runtime': 315.2038, 'train_samples_per_second': 7.871, 'train_steps_per_second': 1.97, 'total_flos': 4638520694169600.0, 'train_loss': 0.8995860770896629, 'epoch': 1.0})

In [None]:
# Save trained model
# `new_model` is re-bound here with a "./" prefix (the configuration cell defines it
# without one). The later merge cell passes this same path to
# PeftModel.from_pretrained, so both cells must agree on the location.
new_model = "./Llama-2-7b-chat-finetune"
trainer.model.save_pretrained(new_model)

In [None]:
# Only let CRITICAL messages through the transformers logger
logging.set_verbosity(logging.CRITICAL)

# Ask the in-memory model (the one that was just trained) for a short paragraph
prompt = "Write a paragrah about Nelson Mandela?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# The prompt is wrapped in [INST] ... [/INST] markers before it is sent to the model
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] Write a paragrah about Nelson Mandela? [/INST]  Nelson Mandela was a South African anti-apartheid revolutionary and politician who served as President of South Africa from 1994 to 1999. He was born on July 18, 1918, in Mvezo, South Africa, and was raised in a traditional Xhosa family. Mandela was educated at Fort Hare University and later became involved in the African National Congress (ANC), where he became a leader in the fight against apartheid. In 1962, he was arrested and sentenced to life in prison for his activism, but he was released in 1990 after international pressure. In 1994, he became the first black President of South Africa, serving until 1999. Mandela's presidency was marked by efforts to heal the country'


In [None]:
# Only let CRITICAL messages through the transformers logger
logging.set_verbosity(logging.CRITICAL)

# Same generation setup as the previous cell, this time with a coding request
prompt = "Write a python code to determine if a number is prime?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# The prompt is wrapped in [INST] ... [/INST] markers before it is sent to the model
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Write a python code to determine if a number is prime? [/INST]  Sure! Here's a Python code to determine if a number is prime:

def is_prime(n):
    if n < 2:
        return False
    for i in range(2, int(n ** 0.5) + 1):
        if n % i == 0:
            return False
    return True

# Example usage:
n = 23
print(is_prime(n))
```
This code uses a simple algorithm to check if a number is prime. It starts by checking if the number is less than 2, and then iterates from 2 to the square root of the number, checking if it is divisible by any of those numbers. If it's not divisible by any of those numbers, it's prime.

Note that this code


In [None]:
!pip install sacrebleu rouge-score bert-score

In [None]:
from datasets import load_metric
import sacrebleu
from rouge_score import rouge_scorer
from bert_score import BERTScorer


def evaluate(predictions, references):
    """Score generated texts against aligned reference texts.

    Args:
        predictions: sequence of generated strings (system outputs).
        references: sequence of reference strings, one per prediction and in
            the same order.

    Returns:
        dict with the keys
        ``"perplexity"`` (always ``None``; it is not computed here),
        ``"BLEU"`` (sacreBLEU corpus-level score),
        ``"ROUGE-L"`` (mean ROUGE-L F-measure, stemming enabled) and
        ``"BERTScore"`` (mean BERTScore F1).

    Raises:
        ValueError: if the inputs are empty or have different lengths.
    """
    predictions = list(predictions)
    references = list(references)
    if not predictions or len(predictions) != len(references):
        raise ValueError(
            "predictions and references must be non-empty and of equal length "
            f"(got {len(predictions)} and {len(references)})"
        )

    # ROUGE-L via the `rouge_scorer` module this notebook already imports, instead
    # of the deprecated `datasets.load_metric("rouge")` wrapper. RougeScorer.score
    # takes (target, prediction). The per-pair F-measures are averaged with a plain
    # mean, which is deterministic; for a single pair it is that pair's F-measure.
    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    rouge_l_values = [
        rouge.score(reference, prediction)["rougeL"].fmeasure
        for prediction, reference in zip(predictions, references)
    ]
    rouge_l = sum(rouge_l_values) / len(rouge_l_values)

    # BERTScorer.score expects (candidates, references); the previous call passed
    # them the other way round. The third element of the returned tuple is F1.
    bert_f1 = BERTScorer(lang="en").score(predictions, references)[2]

    metrics = {
        "perplexity": None,
        # sacreBLEU takes a list of reference streams, hence the extra list.
        "BLEU": sacrebleu.corpus_bleu(predictions, [references]).score,
        "ROUGE-L": rouge_l,
        "BERTScore": bert_f1.mean().item(),
    }
    return metrics


# Smoke test for evaluate(): one prediction scored against one reference.
# NOTE(review): the two strings appear to differ only by the trailing space at the
# end of the prediction, so this checks that the metrics run rather than measuring
# model quality.
predictions = ["Nelson Mandela was a South African anti-apartheid revolutionary and politician who served as President of South Africa from 1994 to 1999. He was born on July 18, 1918, in Mvezo, South Africa, and was raised in a traditional Xhosa family. Mandela was educated at Fort Hare University and later became involved in the African National Congress (ANC), where he became a leader in the fight against apartheid. In 1962, he was arrested and sentenced to life in prison for his activism, but he was release "]
references = ["Nelson Mandela was a South African anti-apartheid revolutionary and politician who served as President of South Africa from 1994 to 1999. He was born on July 18, 1918, in Mvezo, South Africa, and was raised in a traditional Xhosa family. Mandela was educated at Fort Hare University and later became involved in the African National Congress (ANC), where he became a leader in the fight against apartheid. In 1962, he was arrested and sentenced to life in prison for his activism, but he was release"]
metrics = evaluate(predictions, references)
print(metrics)


In [None]:
# Print the summary (feature names and row count) of `dataset`, the 5% slice of the
# training split that the fine-tuning cell selected.
print(dataset)

Dataset({
    features: ['instruction', 'input', 'text', 'output'],
    num_rows: 2481
})


In [None]:
# `samples_for_evaluation` is not created by any cell in this notebook, so on a fresh
# kernel this cell used to fail with NameError. Build it here from the training slice.
# NOTE(review): the original selection logic is not in the notebook; taking the first
# 20 rows reproduces the recorded output (same features, num_rows: 20) — confirm.
samples_for_evaluation = dataset.select(range(min(20, len(dataset))))
print(samples_for_evaluation)

Dataset({
    features: ['instruction', 'input', 'text', 'output'],
    num_rows: 20
})


In [None]:
from transformers import pipeline
import numpy as np


prompt = "Write a paragraph about Nelson Mandela :"

# hyperparameters to test
top_k_values = [20, 40, 60, 80]
beam_sizes = [1, 5, 10, 20]
temperatures = [0.5, 0.7, 1.0, 1.5]

# Initialize the generator from the fine-tuned model and tokenizer that are already
# in memory.
# - `model_name` must NOT be reassigned here: the merge cell further down reads it
#   as the base checkpoint id when it reloads the model and tokenizer.
# - The "Llama-2-7b-chat-finetune" directory is what `trainer.model.save_pretrained`
#   wrote; presumably it holds adapter weights only (the merge cell still needs the
#   base model), so it is not a checkpoint a text-generation pipeline can load by
#   path on its own.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Function to generate text with given hyperparameters
def generate_text(prompt, top_k, beam_size, temperature):
    """Run the module-level `generator` pipeline once and return its text.

    Args:
        prompt: text handed to the pipeline.
        top_k: forwarded as `top_k`; sampling is switched on only when it is > 0.
        beam_size: forwarded as `num_beams`.
        temperature: forwarded as `temperature`.

    Returns:
        The "generated_text" field of the first result returned by the pipeline.
    """
    sampling_enabled = bool(top_k > 0)
    outputs = generator(
        prompt,
        max_length=50,
        top_k=top_k,
        num_beams=beam_size,
        temperature=temperature,
        do_sample=sampling_enabled,
    )
    first_output = outputs[0]
    return first_output["generated_text"]

# Function to simulate evaluation
def evaluate_text(text):
    """Placeholder scorer: ignores `text` and returns a random float in [0, 1).

    The value is drawn from NumPy's global random state, so it carries no
    information about the quality of `text`.
    """
    simulated_score = np.random.random_sample()
    return simulated_score

# Experiment and collect results: one generation per (top_k, beam_size, temperature)
# combination.
results = []
for top_k in top_k_values:
    for beam_size in beam_sizes:
        for temperature in temperatures:
            generated_text = generate_text(prompt, top_k, beam_size, temperature)
            score = evaluate_text(generated_text)
            # The pipeline output starts with the prompt itself (see the earlier
            # generation cells' outputs). Drop that echo before truncating, otherwise
            # the 50-character preview is almost entirely the 40-character prompt.
            if generated_text.startswith(prompt):
                completion = generated_text[len(prompt):]
            else:
                completion = generated_text
            preview = completion.strip()[:50]  # first 50 chars of the completion, for brevity
            results.append(((top_k, beam_size, temperature), score, preview))

# Display results, highest score first. The score comes from evaluate_text(), which
# returns a random number, so this ordering is for demonstration only.
for params, score, sample_text in sorted(results, key=lambda x: x[1], reverse=True):
    print(f"Params (top_k, beam_size, temperature): {params} - Score: {score:.2f} - Sample Text: {sample_text}")


In [None]:
# Empty VRAM
import gc

# Drop every Python reference that keeps the training-time model alive.
del model
del pipe
del trainer
# The hyperparameter-sweep pipeline, if that cell was run, holds a model as well;
# pop() avoids a NameError when it was not.
globals().pop("generator", None)

gc.collect()
# Garbage collection only hands the freed tensors back to PyTorch's caching
# allocator; empty_cache() releases the cached, now-unused blocks.
torch.cuda.empty_cache()
gc.collect()

0

In [None]:
# Load the base checkpoint again, this time in float16, and fold the saved LoRA
# weights into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype=torch.float16,
    return_dict=True,
    low_cpu_mem_usage=True,
)
# Attach the adapter stored under `new_model`, then merge it into the base weights.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer for the merged model, configured as in the training cell:
# right-side padding, with the EOS token doubling as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]