In [3]:
# Installing packages

!pip install -U transformers
!pip install -U datasets
!pip install -U accelerate
!pip install -U peft
!pip install -U trl
!pip install -U bitsandbytes
!pip install -U wandb



In [4]:
# Importing necessary packages for dataset, model, and tokenizer and fine-tuning

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

2024-07-02 04:48:43.357223: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 04:48:43.357357: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 04:48:43.494599: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Pull the API tokens from Kaggle's secret store so they never appear in the notebook.
# The secret labels must match the names configured in the Kaggle "Secrets" panel exactly.
secrets_client = UserSecretsClient()

# Hugging Face Hub authentication
hf_token = secrets_client.get_secret("HUGINGFACE_TOKEN")
login(token=hf_token)

# Weights & Biases authentication, then open the run that will receive training metrics
wb_token = secrets_client.get_secret("WANDB_TOKEN")
wandb.login(key=wb_token)

run = wandb.init(
    anonymous="allow",
    job_type="training",
    project="Fine-tune LlaMA3 on medical dataset",
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtanisha109[0m ([33mtanisha109-PICT[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Hub id of the dataset used for fine-tuning
dataset_name = "ruslanmv/ai-medical-chatbot"
# Local path of the base checkpoint to fine-tune
base_model = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"
# Name used for the training output directory, the saved adapter and the Hub repo
new_model = "llama-3-8b-chatdoc"

In [7]:
# Attention implementation requested when the model is loaded
attn_implementation = "eager"
# Compute dtype handed to the 4-bit quantization config
torch_dtype = torch.float16

In [8]:
# Load the base model in 4-bit precision (QLoRA-style) because of memory constraints.

# bitsandbytes quantization settings: NF4 weights with double quantization,
# computing in the dtype chosen above.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# device_map="auto" lets the loader decide where the weights are placed
# among the available devices.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    attn_implementation=attn_implementation,
    device_map="auto",
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Load the tokenizer that ships with the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model)

# setup_chat_format() installs its own chat template and special tokens and
# returns the updated (model, tokenizer) pair. The chat checkpoint's tokenizer
# presumably already carries a template, and newer TRL releases refuse to
# overwrite an existing one -- so clear it explicitly first. On releases that
# overwrite unconditionally this is a no-op.
# NOTE(review): confirm against the installed TRL version.
tokenizer.chat_template = None
model, tokenizer = setup_chat_format(model, tokenizer)

Fine-tuning the entire model would take a long time, so to speed up training we attach a LoRA adapter layer with only a small number of trainable parameters, which makes the whole process faster and more memory-efficient.

In [None]:
# LoRA adapter configuration
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,               # rank of the low-rank update matrices
    lora_alpha=32,      # scaling factor applied to the LoRA update
    lora_dropout=0.05,  # dropout on the LoRA layers (regularization against overfitting)
    bias="none",
    # Names of the projection modules that receive an adapter
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Wrap the quantized base model with the adapter
model = get_peft_model(model, peft_config)

In [None]:
# Load the dataset, shuffle it with a fixed seed and keep only the first
# 1000 rows so this demo trains quickly.
dataset = (
    load_dataset(dataset_name, split="all")
    .shuffle(seed=65)
    .select(range(1000))
)

# Render each Patient/Doctor pair as a two-turn conversation
def format_chat_template(row):
    """Add a ``text`` field with the row rendered through the chat template.

    The ``Patient`` value becomes the user turn and the ``Doctor`` value the
    assistant turn. The conversation is rendered to a string (not token ids)
    by the module-level ``tokenizer``.

    Args:
        row: Mapping with at least the keys ``"Patient"`` and ``"Doctor"``.

    Returns:
        The same ``row`` object, updated in place with the new ``"text"`` key.
    """
    conversation = [
        dict(role="user", content=row["Patient"]),
        dict(role="assistant", content=row["Doctor"]),
    ]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    row["text"] = rendered
    return row

# Add the rendered "text" column to every row, using 4 worker processes
dataset = dataset.map(format_chat_template, num_proc=4)

# Show one formatted example as a sanity check
dataset["text"][3]

In [None]:
# Split the dataset into training (90%) and validation (10%) sets.
# A fixed seed (the same one used for the shuffle) keeps the split -- and
# therefore the evaluation metrics -- reproducible across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=65)

We set the training hyperparameters so that the run fits on Kaggle. We fine-tune the model for one epoch and log the metrics with Weights & Biases.

In [None]:
# Hyperparameters for a single-epoch run
training_arguments = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir=new_model,
    report_to="wandb",

    # Batching: one sample per device, gradients accumulated over 2 steps
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=True,

    # Optimization schedule
    num_train_epochs=1,
    learning_rate=2e-4,
    warmup_steps=10,
    optim="paged_adamw_32bit",

    # Mixed-precision flags are both left off
    fp16=False,
    bf16=False,

    # Log every step; eval_steps below 1 is read as a fraction of total training steps
    logging_strategy="steps",
    logging_steps=1,
    eval_strategy="steps",
    eval_steps=0.2,
)

We’ll now set up a supervised fine-tuning (SFT) trainer and provide the training and evaluation datasets, LoRA configuration, training arguments, tokenizer, and model. We’re keeping max_seq_length at 512 to avoid exceeding GPU memory during training.

In [None]:
# Supervised fine-tuning trainer.
# `model` was already wrapped with the LoRA adapter by get_peft_model(), so
# `peft_config` is intentionally not passed again: the trainer only applies a
# peft_config to models that are not yet PeftModel instances, and newer TRL
# releases may reject the combination outright.
# NOTE(review): confirm against the installed TRL version.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    dataset_text_field="text",  # column produced by format_chat_template
    max_seq_length=512,         # cap sequence length to limit GPU memory use
    packing=False,
)

In [None]:
# Run fine-tuning with the trainer configured above
trainer.train()

In [None]:
# Close the Weights & Biases run opened at login time
wandb.finish()
# Set use_cache on the model config before the generation cell
# (presumably to speed up decoding -- TODO confirm it was disabled during training)
model.config.use_cache = True

In [None]:
# Generate a reply for a single user message and print only the model's answer.

messages = [
    {
        "role": "user",
        "content": "Hello doctor, I burnt my hand. How to treat it immediately?",
    }
]

# Render the conversation with the chat template; add_generation_prompt=True
# appends the assistant header so the model starts writing its reply.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")
outputs = model.generate(**inputs, max_length=200, num_return_sequences=1)

# The generated sequence starts with the prompt tokens. Slice them off and
# decode only the new tokens, instead of splitting the decoded text on the
# word "assistant" -- that would cut the answer short whenever the reply
# itself contains that word.
prompt_length = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text.strip())

In [None]:
# Save the fine-tuned adapter locally and push it to the Hugging Face Hub
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

# The tokenizer was modified by setup_chat_format() (chat template and special
# tokens), so store it alongside the adapter; otherwise the saved model cannot
# be reloaded with the matching vocabulary and template.
tokenizer.save_pretrained(new_model)
tokenizer.push_to_hub(new_model, use_temp_dir=False)