In [None]:
!pip -q install datasets peft accelerate bitsandbytes transformers wandb scikit-learn

In [None]:
import os
from typing import Optional

import sklearn
import torch
import transformers
from datasets import load_dataset
from huggingface_hub import login
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel,
)
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    PreTrainedModel,
)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Authenticate against the Hugging Face Hub without storing a credential in
# the notebook: the token is read from the HF_TOKEN environment variable.
# When the variable is unset, None is passed and `login` asks for the token
# interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# --- Batching ---------------------------------------------------------------
BATCH_SIZE = 64        # samples consumed per optimizer step (per device)
MICRO_BATCH_SIZE = 8   # samples per forward/backward pass (per-device batch size)
# Micro-batches accumulated before each optimizer step.
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE

# --- Optimisation -----------------------------------------------------------
LEARNING_RATE = 2e-4
EPOCHS = 5
MAX_SEQ_LEN = 512      # number of token positions kept per training example

# --- LoRA adapter -----------------------------------------------------------
LORA_R = 4
LORA_ALPHA = 8
LORA_DROPOUT = 0.05

# --- Checkpoints ------------------------------------------------------------
BASE_MODEL_NAME = "google/gemma-2-2b"
# Directory of the previously trained LoRA adapter (the "cpt model" of the
# original note; presumably "continued pre-training" -- confirm).
PRETRAINED_MODEL_DIR = "gemma"
OUTPUT_DIR = "GemmaSin-2-2b-QA"

In [None]:
# Load the configuration of the base checkpoint named by BASE_MODEL_NAME;
# it is handed to the model loader in a later cell.
config = AutoConfig.from_pretrained(BASE_MODEL_NAME)

In [None]:
# Quantization setup: flip one of the two switches below to choose the
# precision in which the base weights are loaded (both False = no quantization).
load_in_4bit = False
load_in_8bit = True

# Stays None unless at least one quantization mode is requested.
quantization_config: Optional[BitsAndBytesConfig] = None
if load_in_4bit or load_in_8bit:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=load_in_4bit,
        load_in_8bit=load_in_8bit,
        # int8-specific options
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        # 4-bit-specific options
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
    )

In [None]:
# Load the base causal-LM checkpoint with the configuration and quantization
# settings prepared in the earlier cells.
base_model_kwargs = dict(
    config=config,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
)
base_model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    **base_model_kwargs,
)

In [None]:
# Attach the previously trained LoRA adapter stored in PRETRAINED_MODEL_DIR to
# the base model and fold its weights in, so that `model` is a plain
# (adapter-free) model again before the new adapter is added.
model = PeftModel.from_pretrained(base_model, PRETRAINED_MODEL_DIR).merge_and_unload()

In [None]:
# Tokenizer of the base checkpoint, created with add_eos_token=True.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, add_eos_token=True)

# Padding uses token id 0.
tokenizer.pad_token_id = 0

In [None]:
# Prepare the (quantized) merged model for k-bit training.
# NOTE: this cell rebinds `model` to the result computed from the previous
# `model`, so it should be run exactly once per kernel session.
model = prepare_model_for_kbit_training(model)

In [None]:
# Names of the modules that receive LoRA adapters, grouped for readability;
# the concatenation below keeps the original ordering.
attention_targets = ["q_proj", "v_proj", "k_proj", "o_proj"]
mlp_targets = ["gate_proj", "up_proj", "down_proj"]

# Adapter hyper-parameters come from the constants in the configuration cell.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=attention_targets + mlp_targets,
)

In [None]:
# Wrap the prepared model with the new LoRA adapter described by `lora_config`
# and report how many parameters are trainable.
# NOTE: `model` is rebound from its previous value; run this cell once.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Load the Sinhala Q&A dataset from the Hugging Face Hub. The printed summary
# further down shows a single 'train' split with the columns 'inputs',
# 'targets', 'annotation_type' and 'user_id'.
qa_dataset = load_dataset("Ransaka/aya_sinhala_subset")

In [None]:
# Show the available splits with their columns and row counts.
print(qa_dataset)

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets', 'annotation_type', 'user_id'],
        num_rows: 14524
    })
})


In [None]:
# Print the column names per split, guarded in case the loaded object does not
# expose a `column_names` attribute.
if hasattr(qa_dataset, 'column_names'):
    print("Column names:", qa_dataset.column_names)

Column names: {'train': ['inputs', 'targets', 'annotation_type', 'user_id']}


In [None]:
# Work on the 'train' split only. Despite its name, `df` is the dataset split
# object returned by `load_dataset`, not a pandas DataFrame.
df = qa_dataset['train']

In [None]:
# for i in range(min(3, len(df))):
#     print(f"Example {i+1}:", df[i])

In [None]:
#instruction format for Q&A
def format_qa_instruction(example):
    """Render one question/answer record as the Sinhala instruction prompt.

    Args:
        example: Mapping for a single dataset row. The question is taken from
            'inputs' (falling back to 'Question') and the answer from
            'targets' (falling back to 'Answer'); when neither key of a pair
            is present an empty string is used.

    Returns:
        A three-line string: the fixed instruction sentence, the question
        line and the answer line.
    """
    # Each fallback is looked up before its primary key, mirroring a nested
    # .get() call.
    fallback_question = example.get('Question', '')
    question = example.get('inputs', fallback_question)
    fallback_answer = example.get('Answer', '')
    answer = example.get('targets', fallback_answer)

    # NOTE(review): the second and third lines of the literal start with four
    # spaces, which end up in the prompt. Any prompt built at inference time
    # has to reproduce them; confirm this indentation is intended.
    return f"""පහත සදහන් ප්‍රශ්නයට නිවැරදි පිළිතුරක් ලබා දෙන්න. පිළිතුරු ලබා දීමේදී ප්‍රශ්නයේ ස්වභාවය අනුව - සරල ප්‍රශ්න සඳහා කෙටි පිළිතුරු ද, සංකීර්ණ ප්‍රශ්න සඳහා විස්තරාත්මක පැහැදිලි කිරීම් ද ලබා දෙන්න.
    ### ප්‍රශ්නය: {question}
    ### පිළිතුර: {answer}"""

In [None]:
def tokenize_qa(example):
    """Convert one Q&A record into fixed-length model inputs.

    The record is rendered with `format_qa_instruction`, tokenized with
    truncation and padding to MAX_SEQ_LEN + 1 positions, and the final
    position of each field is then discarded.

    Args:
        example: Mapping for a single dataset row, as accepted by
            `format_qa_instruction`.

    Returns:
        A dict with the keys "input_ids" and "attention_mask", each holding
        the tokenizer output for that field minus its last element.
    """
    prompt = format_qa_instruction(example)

    encoded = tokenizer(
        prompt,
        padding="max_length",
        max_length=MAX_SEQ_LEN+1,
        truncation=True,
    )

    # NOTE(review): what the slice removes depends on the tokenizer's padding
    # side. With right padding it is a pad token for short examples; with left
    # padding it is the last real token (possibly the EOS) -- confirm.
    return {field: encoded[field][:-1] for field in ("input_ids", "attention_mask")}

In [None]:
# Shuffle, then tokenize every row and drop the raw text columns.
# The shuffle is seeded: the later train/validation split uses a fixed seed,
# but that only yields the same split if the row order going into it is
# reproducible as well.
tokenized_dataset = df.shuffle(seed=42).map(tokenize_qa, remove_columns=df.column_names)

In [None]:
# Hold out 10% of the tokenized rows for validation (fixed split seed).
split_dataset = tokenized_dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = split_dataset['train'], split_dataset['test']

In [None]:
# Trainer setup, built in three steps: arguments, collator, trainer.

# Training schedule, precision, logging and checkpoint policy.
training_args = transformers.TrainingArguments(
    output_dir=OUTPUT_DIR,
    # batching / schedule
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    fp16=True,
    # logging, evaluation and checkpointing all happen every 300 steps
    logging_steps=300,
    eval_strategy="steps",
    eval_steps=300,
    save_strategy="steps",
    save_steps=300,
    save_total_limit=2,
    # reload the checkpoint with the best eval_loss once training finishes
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="wandb",
    remove_unused_columns=False,
)

# mlm=False selects causal (non-masked) language-model collation.
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

In [None]:
# Turn off the `use_cache` flag on the model config before training starts.
# NOTE(review): presumably because the generation cache is not needed while
# training -- confirm.
model.config.use_cache = False

In [None]:
# Run the fine-tuning loop configured on `trainer` (long-running cell).
trainer.train()

In [None]:
# Write the trained model and then the tokenizer into OUTPUT_DIR.
# NOTE(review): `model` is the PEFT-wrapped model, so presumably only the
# adapter weights are written here -- confirm.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)