In [1]:
%pip install -U transformers datasets accelerate peft bitsandbytes
%pip install -U python-dotenv



In [2]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    Trainer
)
from datasets import Dataset
from peft import LoraConfig
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import random

In [3]:
print(torch.cuda.is_available())  # Should return True

True


In [4]:
def set_seed(seed):
    """Set seed for reproducibility"""
    # Set seed for Python's built-in random module
    random.seed(seed)

    # Set seed for numpy
    np.random.seed(seed)

    # Set seed for PyTorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # disable to ensure reproducibility


def load_tsv_dataset(file_path):
    """
    Load the TSV file containing reviews and responses.
    """
    df = pd.read_csv(file_path, sep="\t")[:256]
    return Dataset.from_pandas(df)

def preprocess_function(data, tokenizer):
    """
    Tokenize the input reviews and responses for fine-tuning.
    """
    # from https://github.com/meta-llama/llama/blob/main/llama/generation.py#L45
    # https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-2/
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
    B_INST, E_INST = "[INST]", "[/INST]"

    system_prompt = "You are a helpful assistant for a business. " \
        + "You are given a set of Amazon reviews for a given item, grouped by their ratings out of 5, " \
        + "and tasked with providing actionable feedback to help improve this item. " \
        + "Please format your response into concise sentences, one for each actionable feedback. " \
        + "Place each feedback on a bulletpoint."

    inputs = []
    for user_prompt in data["Reviews"]:
        input_text = f"<s>{B_INST} {B_SYS}{system_prompt}{E_SYS}{user_prompt} {E_INST}"
        inputs.append(input_text)

    reference = data["Reference"]

    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=2048)
    labels = tokenizer(reference, truncation=True, padding="max_length", max_length=2048).input_ids

    model_inputs["labels"] = labels
    return model_inputs


# Step 3: Fine-Tune the Model with LoRA
def finetune_model(model_name, dataset, HF_TOKEN, output_dir="./finetuned-llama2", num_epochs=5):
    """
    Fine-tune the LLaMA model using LoRA.
    """
    # reference from https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing#scrollTo=OJXpOgBFuSrc

    # NousResearch Llama already initialized pad tokens
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,  # Enable 4-bit quantization
            bnb_4bit_compute_dtype=torch.float16,  # Default compute dtype
            bnb_4bit_use_double_quant=True,  # Enable double quantization
            bnb_4bit_quant_type="nf4",  # Use NF4 quantization (recommended)
        ),
        device_map={"": 0},
        offload_folder="./offload",  # Specify a directory for offloaded layers
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    lora_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model.add_adapter(lora_config)

    # Tokenize the dataset
    tokenized_dataset = dataset.map(lambda examples: preprocess_function(examples, tokenizer), batched=True)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

    # Set training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        gradient_accumulation_steps=4,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        save_strategy="epoch",
        save_steps=1000,
        save_total_limit=1,
        logging_dir="./logs",
        logging_steps=100,
        report_to="tensorboard",
        remove_unused_columns=True,
        warmup_steps=10
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # Fine-tune the model
    trainer.train()

    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model fine-tuned and saved at {output_dir}")

    model.push_to_hub("RichardNooooh/Llama-2-7B-amazonfeedback-baseline", use_auth_token=HF_TOKEN, private=True)
    tokenizer.push_to_hub("RichardNooooh/Llama-2-7B-amazonfeedback-baseline", use_auth_token=HF_TOKEN, private=True)
    print("Model pushed to hub.")

# Main Script
if __name__ == "__main__":
    set_seed(42)

    tsv_file = "./data/baseline_train.tsv"
    load_dotenv("./.env")
    HF_TOKEN = os.getenv("HF_TOKEN")

    assert HF_TOKEN is not None and len(HF_TOKEN) > 0

    os.environ['WANDB_DISABLED'] = 'true'

    dataset = load_tsv_dataset(tsv_file)
    split_dataset = dataset.train_test_split(test_size=0.2) # train and eval

    model_name = "NousResearch/Llama-2-7b-chat-hf"

    finetune_model(model_name, split_dataset, HF_TOKEN)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

  trainer = Trainer(
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch,Training Loss,Validation Loss
1,No log,8.371109
2,10.132800,2.058594
3,10.132800,1.104029
4,1.185500,0.998861
5,1.185500,0.972173


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Model fine-tuned and saved at ./finetuned-llama2




adapter_model.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Model pushed to hub.
