# Fine-Tune Llama 2

This notebook leverages QLoRA for finetuning of meta-llama/Llama-2-7b-chat-hf to answer questions about oncology based on the ontology released by [HemOnc.org](https://hemonc.org/wiki/Main_Page).

HemOnc.org is the largest freely available medical wiki of interventions, regimens, and general information relevant to the fields of hematology and oncology. It is designed for easy use and intended for healthcare professionals.

**This notebook is meant to be run on a Google Colab GPU runtime.**

**This notebook wasn't used to train the model used by the app. Refer to the train.py file for the actual training process. It is intended to showcase how to fine-tune using QLoRA if you don't want to perform full model fine-tuning.**

## Install necessary packages

In [1]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl rouge_score

## Check high-RAM runtime

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Anything reporting under 20 GB is treated as a standard (non high-RAM) runtime.
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

## Mount Google Drive

In [None]:
# Mount the user's Google Drive at /content/drive. The dataset pickles and the
# output model directory used later in this notebook live under
# /content/drive/MyDrive, so this cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')

## Authenticate on Hugging Face

In [4]:
from google.colab import userdata
from huggingface_hub import login

In [5]:
# Read the access token from the Colab secret named 'huggingface' (so no
# credential is hardcoded in the notebook) and log in to the Hugging Face Hub.
# NOTE(review): presumably needed because the base model repository is gated — confirm.
token = userdata.get('huggingface')
login(token)

## Import packages

In [6]:
import os
import pandas as pd
import torch
import transformers
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTConfig,SFTTrainer
import torch
import numpy as np
from torch.nn import CrossEntropyLoss

## Define Models

In [7]:
# Hugging Face Hub id of the base checkpoint; used to load both the tokenizer
# and the quantized model below.
base_model = "meta-llama/Llama-2-7b-chat-hf"

# Google Drive directory where the trained model and the tokenizer are written
# by the "Save finetuned model" cell.
new_model = "/content/drive/MyDrive/HemonChat/fine_tuned_models/Llama-2-7b-chat-hf-Hemonc-v1"

## Load Data

In [8]:
# Load the training and evaluation frames from Google Drive. Later cells read
# the 'question' and 'answer' columns from both frames.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling — only load
# files that come from a trusted source.
# NOTE(review): the name `eval` shadows Python's builtin eval() for the rest of
# the notebook; renaming it would require touching the later cells as well.
train = pd.read_pickle('/content/drive/MyDrive/HemonChat/data/training_data.pkl')
eval = pd.read_pickle('/content/drive/MyDrive/HemonChat/data/eval_data.pkl')

## Convert train data to Llama input format

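Llama 2 chat template (the helper below writes everything except the leading `<s>`, which the tokenizer is expected to add):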

```<s> [INST] Instruction [/INST] Model answer </s>```


In [9]:
def transform_to_prompt_template(question, answer):
    """Render one question/answer pair as a single Llama 2 chat training string.

    Both inputs have surrounding whitespace stripped. The question is wrapped
    in ``[INST] ... [/INST]``, the answer follows it, and a literal ``</s>``
    closes the example. No leading ``<s>`` is written by this function.

    Args:
        question: The instruction / user question (must support ``.strip()``).
        answer: The expected model answer (must support ``.strip()``).

    Returns:
        The formatted prompt string.
    """
    instruction = question.strip()
    response = answer.strip()
    return "[INST] {} [/INST] {} </s>".format(instruction, response)

In [10]:
# Format every (question, answer) row with the prompt helper and keep only the
# resulting 'text' column, which is the field the trainer reads.
train = train.assign(
    text=train.apply(
        lambda row: transform_to_prompt_template(row['question'], row['answer']),
        axis=1,
    )
)[['text']]

In [11]:
# Same formatting as for the training frame: build the 'text' column from each
# (question, answer) row and drop every other column.
eval = eval.assign(
    text=eval.apply(
        lambda row: transform_to_prompt_template(row['question'], row['answer']),
        axis=1,
    )
)[['text']]

## Create Dataset Object

In [12]:
# Wrap the single-column pandas frames as Hugging Face `Dataset` objects, the
# input type handed to the trainer below.
# NOTE(review): if the frames carry a non-default index, from_pandas may add it
# as an extra column; `preserve_index=False` would prevent that — confirm
# whether it matters here.
train_dataset = Dataset.from_pandas(train)
eval_dataset = Dataset.from_pandas(eval)

## Load Tokenizer

In [None]:
# Load the slow (non-fast) tokenizer of the base model, configured to append an
# EOS token when encoding.
# NOTE(review): the prompt helper already ends every example with a literal
# "</s>"; together with add_eos_token=True sequences may end with two EOS
# tokens — confirm this is intended.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run; presumably unnecessary for this checkpoint — confirm before removing.
tokenizer = AutoTokenizer.from_pretrained(base_model, add_eos_token=True, use_fast=False, trust_remote_code=True)
# Reuse EOS as the padding token and pad on the right.
# NOTE(review): setting pad_token presumably already updates pad_token_id, so
# the explicit id assignment looks redundant — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

## Load Model

In [None]:
# 4-bit NF4 quantization settings: float16 compute, no double quantization.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base checkpoint with the quantization config above; the device map
# assigns the whole model to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# PEFT helper that readies a k-bit quantized model for training.
model = prepare_model_for_kbit_training(model)

# Turn off the cache flag on the model config and align its pad token id with
# the tokenizer's.
model.config.use_cache = False
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Print the module structure of the loaded (quantized) model for inspection.
print(model)

## Load Trainer

In [16]:
# Load LoRA configuration
peft_args = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj']
)

In [None]:
# Set supervised fine-tuning parameters
training_args = SFTConfig(
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    save_total_limit=2,
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
    evaluation_strategy="steps",
    eval_steps=50,
)

In [None]:
# create trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_args,
    args=training_args,
    processing_class=tokenizer,
)

## Train Model

In [None]:
# Train model
trainer.train()

## Check performance after training

In [None]:
# Final evaluation at the end of training
eval_results = trainer.evaluate()
print("Final eval loss:", eval_results["eval_loss"])

## Save finetuned model

In [None]:
# Save trained model
trainer.model.save_pretrained(new_model)
# Save tokenizer
tokenizer.save_pretrained(new_model)

## Disconnect Runtime

In [None]:
# Release the Colab runtime once everything has been saved.
# NOTE(review): presumably done so the GPU is not held after the run; any state
# that was not written to Drive is lost — confirm this is the intent.
from google.colab import runtime
runtime.unassign()