In [None]:
!pip install -U transformers 
!pip install -U datasets 
!pip install -U accelerate 
!pip install -U peft 
!pip install -U trl 
!pip install -U bitsandbytes 
!pip install -U wandb

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Credentials come from Kaggle's secret store at runtime; nothing is hard-coded.
user_secrets = UserSecretsClient()

# Authenticate against the Hugging Face Hub.
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token=hf_token)

# Authenticate against Weights & Biases and open the run used for training logs.
wb_token = user_secrets.get_secret("wandb")
wandb.login(key=wb_token)
run = wandb.init(
    anonymous="allow",
    job_type="training",
    project="Fine-tune Llama 3.2 1B instruct on BKAI RAG Dataset",
)

In [None]:
# Local Kaggle path of the pretrained checkpoint that gets loaded and fine-tuned.
base_model = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"
# Name reused as the training output directory, the local save path of the
# adapter, and the Hub repository id when pushing.
new_model = "llama-3.2-1b-instruct-finetune-bkai-rag"

In [None]:
# Compute dtype handed to the 4-bit quantization config below.
torch_dtype = torch.float16
# Attention backend passed to `from_pretrained` when the model is loaded.
attn_implementation = "eager"

In [None]:
# 4-bit NF4 quantization with double quantization; matmuls run in `torch_dtype`.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the base checkpoint with the quantization settings above and let
# `device_map="auto"` decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    attn_implementation=attn_implementation,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Tokenizer that ships with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Reuse the end-of-sequence token as the padding token for batching.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Projection layers that receive a LoRA adapter.
lora_target_modules = [
    'up_proj', 'down_proj', 'gate_proj',
    'k_proj', 'q_proj', 'v_proj', 'o_proj',
]

# Rank-16 adapters with alpha 32 and 5% dropout; bias terms are not trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the quantized model so that only the adapter weights are trainable.
# NOTE(review): `prepare_model_for_kbit_training` is imported at the top of the
# notebook but never called before this wrap — confirm that is intentional.
# NOTE(review): re-running this cell wraps an already wrapped model again.
model = get_peft_model(model, peft_config)

In [None]:
import pandas as pd
# Raw training table; the cells below read its `question`, `context`, `cid`
# and `qid` columns.
train_dataset = pd.read_csv('/kaggle/input/dsc-rag-dataset/data/train.csv')

In [None]:
from tqdm import tqdm
# Flatten the training table into parallel lists holding one entry per
# (question, context passage) pair.
total_questions = []
total_contexts = []
total_cids = []
total_qids = []
# `iterrows()` is a generator with no length, so `total` is passed explicitly;
# without it tqdm can only show a counter, not a progress bar or ETA.
for idx, row in tqdm(train_dataset.iterrows(), total=len(train_dataset)):
    # `context` stores several passages in one string, separated by '\n '.
    texts = row['context'].split('\n ')
    # `cid` is parsed as a bracketed, space-separated list of integer ids.
    cids = [int(x) for x in row['cid'].strip('[]').split(' ') if x]
    # zip() pairs ids with passages by position and stops at the shorter list.
    for cid, text in zip(cids, texts):
        total_questions.append(row['question'])
        # Trim brackets, straight/curly quotes and spaces around each passage.
        total_contexts.append(text.strip('[]"\'“” '))
        total_cids.append(cid)
        total_qids.append(row['qid'])

In [None]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
# Long-format table that keeps the passage and question ids next to each pair.
total_train_dataset = pd.DataFrame(
    dict(
        question=total_questions,
        context=total_contexts,
        cid=total_cids,
        qid=total_qids,
    )
)

# The fine-tuning dataset only carries the two text columns.
my_dataset = Dataset.from_dict(
    dict(question=total_questions, context=total_contexts)
)

In [None]:
def format_chat_template(row):
    """Add a chat-formatted ``text`` field to a dataset row.

    The row's ``question`` becomes the user turn and its ``context`` becomes
    the assistant turn; the two-turn conversation is rendered to a string with
    the module-level ``tokenizer``'s chat template (no tokenization).

    Args:
        row: Mapping with ``"question"`` and ``"context"`` entries.

    Returns:
        The same ``row`` object, updated in place with a ``"text"`` entry.
    """
    conversation = [
        {"role": "user", "content": row["question"]},
        {"role": "assistant", "content": row["context"]},
    ]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    row["text"] = rendered
    return row

# Render every (question, context) pair into its chat-formatted "text" column,
# using 4 worker processes.
dataset = my_dataset.map(
    format_chat_template,
    num_proc=4,
)
# Hold out 15% of the pairs for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so eval losses stay comparable between runs.
dataset = dataset.train_test_split(test_size=0.15, seed=42)


In [None]:
# Hyper-parameters for the single-epoch LoRA fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=new_model,
    # Effective batch per device = 8 samples x 2 accumulation steps.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    # `eval_strategy` is the current name of this option; the previous
    # `evaluation_strategy` spelling is deprecated in transformers.
    eval_strategy="steps",
    # A float below 1 is a fraction of the total number of training steps.
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    # Mixed-precision training is switched off for both half-precision modes.
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [None]:
# Supervised fine-tuning on the pre-rendered "text" column, with sequences
# capped at 512 tokens and no sample packing.
# NOTE(review): `model` was already wrapped by `get_peft_model` in an earlier
# cell, so passing `peft_config` again looks redundant — confirm how the
# installed TRL version treats an already wrapped model.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

# Launch training; metrics are reported as configured in `training_arguments`.
trainer.train()

In [None]:
# Close the Weights & Biases run opened at the top of the notebook.
wandb.finish()
# Switch the model config's `use_cache` flag on before the generation cell below.
model.config.use_cache = True

In [None]:
# Sanity check: generate an answer for the first training question.
messages = [
    {
        "role": "user",
        "content": total_questions[0]
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True).to("cuda")

# The model may still be in training mode after `trainer.train()`; eval mode
# disables the LoRA dropout so it does not perturb the generated text.
model.eval()

outputs = model.generate(**inputs, max_length=256,
                         num_return_sequences=1)

# `generate` returns the prompt tokens followed by the continuation. Slice the
# prompt off by token count rather than splitting the decoded string on the
# word "assistant", which breaks when the question or the answer contains that
# word (and raises IndexError when the word is absent).
prompt_length = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text)

In [None]:
# Write the trained model held by the trainer to the local `new_model` directory.
trainer.model.save_pretrained(new_model)
# Upload it to the Hugging Face Hub under the same name, without a temp dir.
trainer.model.push_to_hub(new_model, use_temp_dir=False)

MERGE THE LORA ADAPTER WITH THE BASE MODEL

In [None]:
# Save a version of this notebook on Kaggle first.
# NOTE(review): presumably that saved version's output is what is mounted at
# the `new_model` input path below — confirm against the Kaggle dataset setup.
# Both names are rebound here: `new_model` now points at the saved adapter
# directory instead of the Hub repository name used during training.
base_model = "/kaggle/input/llama-3.2/transformers/1b-instruct/1"
new_model = "/kaggle/input/finetune-llama3-2-1b-instruct-bkai/llama-3.2-1b-instruct-finetune-bkai-rag/"


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format
# Fresh tokenizer plus the base weights reloaded in float16 (no quantization
# config is passed here) as the starting point for merging.
tokenizer = AutoTokenizer.from_pretrained(base_model)

base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    return_dict=True,
    # NOTE(review): the checkpoint is a local path; confirm that
    # `trust_remote_code=True` is actually required.
    trust_remote_code=True,
)

# Left disabled in the original notebook:
# base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Load the fine-tuned adapter on top of the base weights, then fold it into
# them so `model` is a plain model with no adapter wrapper.
model = PeftModel.from_pretrained(base_model_reload, new_model).merge_and_unload()

In [None]:
# This section re-imports its other dependencies but not pandas, so running it
# in a fresh kernel would fail on `pd`; import it here to remove the reliance
# on state left over from the training section.
import pandas as pd

public_test = pd.read_csv('/kaggle/input/dsc-rag-dataset/data/public_test.csv')
results = []
# Text-generation pipeline built around the merged model and its tokenizer.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)
for q in public_test['question']:
    # Each question is sent as a single user turn rendered with the chat template.
    messages = [{"role": "user", "content": q}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Low-temperature sampling, at most 120 newly generated tokens per question.
    outputs = pipe(prompt, max_new_tokens=120, do_sample=True, temperature=0.2, top_k=50, top_p=0.95)
    # The raw pipeline output object is stored as-is.
    results.append(outputs)

In [None]:
# Save the merged model and its tokenizer to a local directory ...
model.save_pretrained("merged-llama-3.2-1b-instruct-finetune-bkai-rag")
tokenizer.save_pretrained("merged-llama-3.2-1b-instruct-finetune-bkai-rag")
# ... then publish both to the Hugging Face Hub under the same repository name.
model.push_to_hub("merged-llama-3.2-1b-instruct-finetune-bkai-rag", use_temp_dir=False)
tokenizer.push_to_hub("merged-llama-3.2-1b-instruct-finetune-bkai-rag", use_temp_dir=False)

PREDICT ON THE PUBLIC TEST SET

import pandas as pd
# Public test table; only its `question` column is read by the prediction loop.
public_test = pd.read_csv('/kaggle/input/dsc-rag-dataset/data/public_test.csv')

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
# Load the merged checkpoint published on the Hub and move the model to the GPU.
tokenizer = AutoTokenizer.from_pretrained("tiendoan/merged-llama-3.2-1b-instruct-finetune-bkai-rag")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("tiendoan/merged-llama-3.2-1b-instruct-finetune-bkai-rag").to('cuda')
results = []
# NOTE(review): only questions from position 5000 onward are processed here —
# presumably the earlier ones were handled in a separate run; confirm.
for q in tqdm(public_test['question'][5000:]):
    # One user turn per question, rendered with the chat template and an open
    # assistant turn for the model to complete.
    messages = [
        {
            "role": "user",
            "content": q
        }
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                           add_generation_prompt=True)

    inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                       truncation=True).to("cuda")

    # `max_length` caps prompt + continuation together at 200 tokens.
    outputs = model.generate(
        **inputs,
        max_length=200,
        num_return_sequences=1,
        no_repeat_ngram_size=4,
        repetition_penalty=1.5,
        top_k=50,
        top_p=0.9
    )

    # `generate` returns the prompt tokens followed by the continuation. Slice
    # the prompt off by token count rather than splitting the decoded string on
    # the word "assistant", which breaks when the question or the answer
    # contains that word (and raises IndexError when the word is absent).
    # The stored answer therefore no longer begins with the template's blank
    # line that followed the "assistant" header in the full decoded text.
    prompt_length = inputs["input_ids"].shape[1]
    text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    results.append(text)

In [None]:
import torch
# Persist the list of generated answer strings to disk with torch's serializer.
torch.save(results,'llama_preds.pt')