In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, PeftModel

In [None]:
# Hub ID of the base checkpoint; both the model and the tokenizer are loaded from it.
BASE_MODEL_ID = "meta-llama/Llama-2-7b-hf"
# Hub ID of the LoRA adapter repo: existing adapter weights are loaded from it
# and it is also the push target passed to TrainingArguments(hub_model_id=...).
NEW_MODEL_ID = "xujia118/backwards-llama2-7b-guanaco"
# Hub ID of the dataset passed to load_dataset().
DATASET_ID = "xujia118/new_curated_lima"

In [None]:
!pip install -U bitsandbytes

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# 1. Load model & tokenizer

# Load the base causal LM with the quantization config defined in bnb_config;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)

# Disable the generation cache flag on the config for training.
model.config.use_cache = False
# NOTE(review): presumably forces the non-tensor-parallel code path in the
# Llama implementation — confirm this is still required by the installed version.
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
# Reuse EOS as the padding token and pad on the right.
# NOTE(review): with pad == eos, anything that masks labels by pad_token_id
# will also mask genuine EOS tokens — keep this in mind for the collator.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 2. Setup LoRA
# LoRA hyperparameters: rank 32, alpha 64, dropout 0.1, no bias training,
# configured for a causal-LM task.
# NOTE(review): in the visible cells this config is only referenced by the
# commented-out get_peft_model(...) call; the active path loads an existing
# adapter, which carries its own config.
# NOTE(review): "fc1", "fc2" and "dense" do not look like Llama-2 module names
# (they resemble Phi-style layers) — confirm the intended target list
# (e.g. o_proj / gate_proj / up_proj / down_proj) before using this config.
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj",
                    "fc1", "fc2", "dense", "lm_head"],
    bias="none",
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
)

# For first time fine-tuning
# model = get_peft_model(model, lora_config)

# To continue fine-tuning: load the previously pushed adapter on top of the
# quantized base model. PeftModel.from_pretrained() loads adapters frozen
# (inference mode) by default; is_trainable=True loads them with gradients
# enabled, which replaces the manual loop that re-enabled requires_grad by
# matching "lora" in parameter names.
model = PeftModel.from_pretrained(model, NEW_MODEL_ID, is_trainable=True)
model.train()
# Sanity check: the printed trainable-parameter count should be non-zero.
model.print_trainable_parameters()

In [None]:

# 3. Load & process dataset
print(f"Loading and formatting dataset: {DATASET_ID}...")
dataset = load_dataset(DATASET_ID)
# Only the first 4 rows of the "train" split are kept, so everything downstream
# (split, tokenization, training) runs on a tiny subset. Drop the .select(...)
# for a real run.
raw_dataset = dataset["train"].select(range(4)) # make a smoke test first


def format_instruction_examples(example):
    """Render one dataset row as a single prompt/response training string.

    Args:
        example: Mapping with a 'generated_instruction' entry (the user turn)
            and an 'output_text' entry (the assistant turn).

    Returns:
        A dict with one key, "text", holding the system, user and assistant
        sections concatenated and wrapped in literal "<s>" ... "</s>" markers.
    """
    instruction = example['generated_instruction']
    response = example['output_text']

    # Sections are joined with no separator; each piece carries its own newlines.
    segments = [
        "<s>[SYSTEM]\n",
        "You are a helpful and knowledgeable assistant.\n\n",
        "[USER]\n",
        f"{instruction}\n\n",
        "[ASSISTANT]\n",
        f"{response}",
        "</s>",
    ]
    return {"text": "".join(segments)}


# Add a "text" column holding the formatted prompt for every row.
raw_dataset = raw_dataset.map(format_instruction_examples)

# Split into train/test: 10% held out, shuffled with a fixed seed so the split
# is reproducible.
split_ds = raw_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_ds = split_ds["train"]
test_ds = split_ds["test"]


# Step 4. Tokenize
def tokenize_fn(example):
    """Tokenize one formatted example into a fixed-length causal-LM record.

    Args:
        example: Dataset row whose "text" field holds the full prompt string.
            That string already contains literal "<s>" and "</s>" markers.

    Returns:
        The tokenizer output, padded/truncated to 512 tokens, with an extra
        "labels" entry that is a copy of "input_ids".
    """
    tokenized = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        # The prompt template spells out "<s>" / "</s>" itself. Letting the
        # tokenizer add its own special tokens as well would prepend a second
        # BOS to every example, so automatic insertion is turned off.
        add_special_tokens=False,
    )
    # For causal LM, labels are same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


# Tokenize both splits; the raw "text" column is dropped once it has been encoded.
# NOTE(review): only "text" is removed here — any other original dataset
# columns stay in the dataset; presumably the Trainer drops unused ones.
train_ds = train_ds.map(tokenize_fn, remove_columns=["text"])
test_ds = test_ds.map(tokenize_fn, remove_columns=["text"])
# In real work, prepare the dataset (including tokenization) before SSH-ing into the GPU instance


# Step 5. Setup Trainer
# NOTE(review): logging, saving and evaluation all fire every 4 optimizer
# steps; on the 4-row smoke-test subset the run may finish before step 4 is
# reached — confirm these intervals for the real run.
training_args = TrainingArguments(
    output_dir="new_checkpoints",

    # 1 sequence per device x 4 accumulation steps = 4 sequences per optimizer step.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,

    # Cosine learning-rate schedule with the paged 8-bit AdamW optimizer.
    lr_scheduler_type='cosine',
    optim="paged_adamw_8bit",

    logging_steps=4,

    save_strategy="steps",
    save_steps=4,
    save_total_limit=2,             # Keep last 2 checkpoints

    # Push to the same Hub repo the adapter was loaded from.
    push_to_hub=True,
    hub_model_id=NEW_MODEL_ID,
    hub_strategy="checkpoint",

    eval_strategy="steps",
    eval_steps=4,
    do_eval=True,

    # Metrics are reported to Weights & Biases.
    report_to="wandb",
)

# DataCollatorForLanguageModeling(mlm=False) builds labels by cloning input_ids
# and replacing every position equal to pad_token_id with -100. This notebook
# sets pad_token = eos_token, so that rule would also mask the genuine "</s>"
# that closes each example and the model would never be trained to stop.
# The wrapper below rebuilds the labels from attention_mask instead, so only
# real padding is excluded from the loss.
_base_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


def data_collator(features):
    """Collate features into a causal-LM batch that ignores only padding.

    Args:
        features: List of tokenized examples handed over by the Trainer.

    Returns:
        The batch produced by DataCollatorForLanguageModeling, with "labels"
        rebuilt so that positions whose attention_mask is 0 are -100 and every
        attended token (including EOS) keeps its token id. If the batch has no
        attention_mask, the base collator's output is returned unchanged.
    """
    batch = _base_lm_collator(features)
    attention_mask = batch.get("attention_mask")
    if attention_mask is not None:
        labels = batch["input_ids"].clone()
        labels[attention_mask == 0] = -100
        batch["labels"] = labels
    return batch


# Wire the PEFT-wrapped model, the tokenized splits, the training arguments
# and the collator into a Trainer.
trainer = Trainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    args=training_args,
    data_collator=data_collator
)


print("Starting training...")
trainer.train()

# Final upload of the trained model to the Hub repo set in hub_model_id.
trainer.push_to_hub()