In [1]:
import os
import wandb
import pandas as pd
import torch
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import Dataset
from datetime import datetime
from typing import Dict, Any

# Initialization Settings

In [2]:
# Request Hugging Face offline mode so the LLM is loaded from local files only.
# NOTE(review): transformers is already imported in the first cell, so this flag may be read
# too late to take effect; the loaders in this notebook also pass local_files_only=True — confirm.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
# Set notebook name for wandb (adjust as needed)
os.environ["WANDB_NOTEBOOK_NAME"] = "QLoRA_v3.0.ipynb"
# Never hardcode the W&B API key in the notebook (it ends up in version control and shared copies),
# and never overwrite a key that is already exported in the environment.
# Export WANDB_API_KEY in the shell that launches Jupyter instead.
if not os.environ.get("WANDB_API_KEY"):
    print(
        "WANDB_API_KEY is not set: export it before starting Jupyter, "
        "otherwise wandb.login() has to rely on stored credentials or an interactive prompt."
    )

In [3]:
# Training dataset (Excel file)
ds_train_id = "/home/czy/Project-1 LLMs Database/Richness/3/ds_train.xlsx"

# Test dataset (Excel file)
ds_test_id = "/home/czy/Project-1 LLMs Database/Richness/3/ds_test.xlsx"

# Pre-trained model directory (adjust as needed)
model_path = "/home/czy/Project-1 LLMs Database/meta_llama_2_7b_chat"

# Directory for storing the trained LoRA adapter weights (adjust as needed)
adapter_path = "/home/czy/Project-1 LLMs Database/lora_adapter"

# Prompt template. The <s> / </s> markers are written out explicitly here, so the
# placeholders are the only parts filled per example:
#   {system_prompt}, {user_query}, {context}, {response}
prompt_template = """<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>

question: {user_query}
context: {context}[/INST] {response}</s>
"""

# Load base model and tokenizer of LLaMA2-7B-Chat

In [4]:
# Load tokenizer of LLaMA2-Chat-7B
def load_tokenizer(pretrained_path: str) -> Any:
    """Load the tokenizer stored in a local model directory.

    Args:
        pretrained_path: Path of the directory that holds the tokenizer files.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``, configured
        with ``padding_side="left"``.
    """
    # No `token` argument: the previous code forwarded the value of the
    # TRANSFORMERS_OFFLINE flag ("1") as a Hugging Face access token, which is not
    # a credential. Loading is restricted to local files, so no token is needed.
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path = pretrained_path,
        padding_side = "left",
        local_files_only = True
    )
    return tokenizer


# Load pretrained model (LLaMA2-7B-Chat)
def load_pretrained_model(pretrained_path: str) -> Any:
    """Load a causal language model from a local directory with 4-bit quantization.

    Args:
        pretrained_path: Path of the directory that holds the model files.

    Returns:
        The model returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    # Quantization settings: 4-bit NF4 weights, bfloat16 compute, no double quantization
    model_bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_compute_dtype = torch.bfloat16,
        bnb_4bit_use_double_quant = False
    )

    # No `token` argument: the previous code forwarded the value of the
    # TRANSFORMERS_OFFLINE flag ("1") as a Hugging Face access token, which is not
    # a credential. Loading is restricted to local files, so no token is needed.
    base_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path = pretrained_path,
        quantization_config = model_bnb_config,
        device_map = "auto",
        low_cpu_mem_usage = True,
        local_files_only = True
    )
    return base_model

In [None]:
# Build the tokenizer from the local model directory and adjust its special-token handling.
llama2_tokenizer = load_tokenizer(pretrained_path=model_path)
llama2_tokenizer.pad_token = llama2_tokenizer.eos_token  # Pad with </s>; author's note: padding with <unk> reduced model accuracy
llama2_tokenizer.add_eos_token = False  # Don't append </s> automatically (prompt_template already ends the example with </s>)
llama2_tokenizer.add_bos_token = False  # Don't prepend <s> automatically (prompt_template already starts with <s>)

# Load the 4-bit quantized base model and prepare it for k-bit (QLoRA) training.
llama2_pretrained = load_pretrained_model(pretrained_path=model_path)
llama2_pretrained.config.use_cache = False  # Silences the warnings during training. Please re-enable for inference!
llama2 = prepare_model_for_kbit_training(llama2_pretrained)  # Note: this method only works for transformers models.

# Preprocess datasets

In [6]:
# Tokenize one training example (prompt + reference answer) for fine-tuning
def generate_and_tokenize_prompt(
    data_point: Dict,
    tokenizer: Any = llama2_tokenizer,
    template: str = prompt_template,
    max_length: int = 2048,
) -> Any:
    """Fill the prompt template with one data point and tokenize it for causal-LM training.

    Args:
        data_point: One dataset row; must provide the keys ``"instruction"``,
            ``"user_query"``, ``"context"`` and ``"output"``.
        tokenizer: Callable tokenizer applied to the filled prompt.
        template: Format string with the placeholders ``{system_prompt}``,
            ``{user_query}``, ``{context}`` and ``{response}``.
        max_length: Length every example is truncated / padded to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels equal
        ``input_ids`` on real tokens and ``-100`` on padded positions.

    Raises:
        KeyError: If ``data_point`` lacks one of the required keys.
    """
    # If the source file carried a BOM, the first column may be read as
    # "\ufeffinstruction" instead of "instruction"; fix the column name upstream in that case.
    system_prompt = data_point["instruction"]
    user_query = data_point["user_query"]
    reference_context = data_point["context"]
    model_response = data_point["output"]

    # Fill the prompt template with the data point
    prompt = template.format(
        system_prompt=system_prompt,
        user_query=user_query,
        context=reference_context,
        response=model_response
    )

    tokenized_input = tokenizer(
        prompt,
        truncation = True,  # If the input length exceeds the maximum length, cut off at the boundary
        max_length = max_length,
        padding = "max_length"
    )

    input_ids = tokenized_input["input_ids"]
    attention_mask = tokenized_input.get("attention_mask")
    if attention_mask is None:
        # Tokenizer returned no mask: fall back to supervising every position.
        tokenized_input["labels"] = list(input_ids)
    else:
        # The pad token is the same id as </s>, so padding cannot be recognised by token id.
        # Use the attention mask instead and label padded positions with -100 so they are
        # excluded from the loss; the real </s> that closes the answer stays supervised.
        tokenized_input["labels"] = [
            token_id if is_real_token else -100
            for token_id, is_real_token in zip(input_ids, attention_mask)
        ]
    return tokenized_input

In [None]:
# Read both spreadsheets; the training frame is immediately reduced to a seeded,
# fixed-size sample drawn without replacement (n is the training-set scale).
df_test = pd.read_excel(ds_test_id)
df_train = pd.read_excel(ds_train_id).sample(n=112, replace=False, random_state=66)

# Keep a copy of the sampled training rows on disk
df_train.to_excel("../Richness/3/tr_112.xlsx")

# Training split: DataFrame -> Dataset -> tokenized Dataset
ds_train = Dataset.from_pandas(df_train)
tokenized_ds_train = ds_train.map(generate_and_tokenize_prompt)
print(tokenized_ds_train)

# Test split: same pipeline, using every row
ds_test = Dataset.from_pandas(df_test)
tokenized_ds_test = ds_test.map(generate_and_tokenize_prompt)
print(tokenized_ds_test)

In [None]:
# Sanity check: print the first tokenized training example (all of its fields)
print(tokenized_ds_train[0])

# (Optional) Inspect the sampled training DataFrame

In [None]:
# Lift every pandas display limit (rows, columns, cell width) so nothing is elided,
# then show the sampled training frame as the cell output.
for option_name in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(option_name, None)
df_train

# Prepare Low-Rank Adapter (LoRA)

In [None]:
# Fully Sharded Data Parallel: full (unsharded) model and optimizer state dicts,
# offloaded to CPU and not restricted to rank 0
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

# Set LoRA config
peft_config = LoraConfig(
    r = 8,  # Rank of the adapter; increasing r does not cover a more meaningful subspace, which suggests that a low-rank adaptation matrix is sufficient
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"],  # It is preferable to adapt more weight matrices than to adapt a single type of weights with a larger rank
    lora_alpha = 8,  # A scaling factor that is used to scale delta W when training
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM"
)

# Wrap the k-bit-prepared base model with the LoRA adapter, hand it to the accelerator,
# then report how many parameters are trainable.
model_with_peft = get_peft_model(model=llama2, peft_config=peft_config)
model_with_peft = accelerator.prepare_model(model=model_with_peft)
model_with_peft.print_trainable_parameters()

# With more than one visible GPU, flag the model as parallelizable / model-parallel.
# NOTE(review): presumably this keeps the Trainer from adding its own multi-GPU wrapper — confirm.
if torch.cuda.device_count() > 1:
    model_with_peft.is_parallelizable = True
    model_with_peft.model_parallel = True

In [None]:
# Authenticate with the key taken from the environment, then open the run that
# receives the training logs.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb_run_settings = {
    "project": "Fine tuning llama-2-7B-chat",
    "name": "r3ds112gn03wr03bs4ep8accelerator",  # adjust as needed
    "job_type": "training",
    "anonymous": "allow",
}
run = wandb.init(**wandb_run_settings)

In [10]:
# Naming: "<base model>-<project>" is used for both the output directory and the run name
base_model_name = "llama2-7b-chat"
project = "journal-finetune"
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"

training_arguments = TrainingArguments(
    # --- optimizer and schedule ---
    optim = "paged_adamw_8bit",             # instead of the default "adamw_torch"
    learning_rate = 2e-4,                   # custom value (default 5e-5)
    weight_decay = 0.001,                   # regularization against overly complex weights (default 0)
    max_grad_norm = 0.3,                    # gradient clipping threshold (default 1.0)
    lr_scheduler_type = "linear",           # other options: "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"
    warmup_ratio = 0.3,                     # default 0.0
    # --- training loop ---
    num_train_epochs = 8,
    per_device_train_batch_size = 4,        # default 8
    gradient_accumulation_steps = 1,        # default 1
    fp16 = False,
    bf16 = False,
    seed = 66,
    # --- evaluation: run on the eval set after every optimization step ---
    eval_strategy = "steps",
    eval_steps = 1,
    per_device_eval_batch_size = 4,         # default 8
    eval_accumulation_steps = 1,            # default 1
    # --- logging and checkpoints ---
    report_to = "wandb",
    run_name = f"{run_name}-{datetime.now():%Y-%m-%d-%H-%M}",  # timestamped so repeated runs are distinguishable
    logging_steps = 1,
    output_dir = output_dir,
    save_strategy = "epoch"
)

# Trainer over the LoRA-wrapped model; the tokenized test split serves as the eval set
trainer = Trainer(
    args = training_arguments,
    model = model_with_peft,
    train_dataset = tokenized_ds_train,
    eval_dataset = tokenized_ds_test
)

# Train LLaMA2-7B-Chat with QLoRA

In [None]:
# Run the fine-tuning loop configured by training_arguments; the return value is shown as the cell output
trainer.train()

In [None]:
# Save the trained model's weights (via save_pretrained) into adapter_path
trainer.model.save_pretrained(adapter_path)

In [14]:
# Close the wandb run that was opened with wandb.init
wandb.finish()