# Fine-Tuning LLMs with Hugging Face

## Step 1: Installing and importing the libraries

In [5]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 gradio streamlit

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m225.3/244.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m97.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

In [6]:
!pip install huggingface_hub



In [7]:
import torch
from trl import SFTTrainer
from peft import LoraConfig
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline)

## Step 2: Loading the model

In [8]:
llama_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = "aboonaji/llama2finetune-v2",
                                                   quantization_config = BitsAndBytesConfig(load_in_4bit = True, bnb_4bit_compute_dtype = getattr(torch, "float16"), bnb_4bit_quant_type = "nf4"))
llama_model.config.use_cache = False
llama_model.config.pretraining_tp = 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

## Step 3: Loading the tokenizer

In [9]:
llama_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = "aboonaji/llama2finetune-v2", trust_remote_code = True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

## Step 4: Setting the training arguments

In [10]:
training_arguments = TrainingArguments(output_dir = "./results", per_device_train_batch_size = 4, max_steps = 100)

## Step 5: Creating the Supervised Fine-Tuning trainer

In [11]:
llama_sft_trainer = SFTTrainer(model = llama_model,
                               args = training_arguments,
                               train_dataset = load_dataset(path = "aboonaji/wiki_medical_terms_llam2_format", split = "train"),
                               tokenizer = llama_tokenizer,
                               peft_config = LoraConfig(task_type = "CAUSAL_LM", r = 64, lora_alpha = 16, lora_dropout = 0.1),
                               dataset_text_field = "text")

Downloading data:   0%|          | 0.00/54.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6861 [00:00<?, ? examples/s]



Map:   0%|          | 0/6861 [00:00<?, ? examples/s]

## Step 6: Training the model

In [12]:
llama_sft_trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=100, training_loss=1.6531364440917968, metrics={'train_runtime': 1457.1589, 'train_samples_per_second': 0.275, 'train_steps_per_second': 0.069, 'total_flos': 8228119310991360.0, 'train_loss': 1.6531364440917968, 'epoch': 0.06})

## Step 7: Chatting with the model

---



In [13]:
user_prompt = "Please tell me about Bursitis"
text_generation_pipeline = pipeline(task = "text-generation", model = llama_model, tokenizer = llama_tokenizer, max_length = 300)
model_answer = text_generation_pipeline(f"<s>[INST] {user_prompt} [/INST]")
print(model_answer[0]['generated_text'])

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


<s>[INST] Please tell me about Bursitis [/INST]  Bursitis is a condition where the bursae, small fluid-filled sacs that cushion and reduce friction between tendons, ligaments, and bones, become inflamed or irritated. Unterscheidung between bursitis and tendinitis is often difficult because the symptoms are similar.

Bursitis can occur in any bursa in the body, but it is most common in the shoulder, elbow, hip, and knee. The symptoms of bursitis can vary depending on the location of the affected bursa, but they typically include:

* Pain or tenderness in the affected area
* Swelling or redness in the affected area
* Limited mobility or stiffness in the affected joint
* Warmth or heat in the affected area
* In severe cases, fever or chills

Bursitis can be caused by a variety of factors, including:

* Overuse or repetitive motion of a joint
* Trauma or injury to the affected area
* Poor posture or body mechanics
* Infection or infection of the bursa
* Crystal-induced bursitis (e.g., gout

In [14]:
user_prompt = "Please tell me about Fever"
text_generation_pipeline = pipeline(task = "text-generation", model = llama_model, tokenizer = llama_tokenizer, max_length = 350)
model_answer = text_generation_pipeline(f"<s>[INST] {user_prompt} [/INST]")
print(model_answer[0]['generated_text'])

<s>[INST] Please tell me about Fever [/INST]  Fever is a common medical condition characterized by an elevated body temperature, typically above 98. everybody gets fever from time to time, and it is usually caused by a viral or bacterial infection.

Causes of Fever:

1. Viral infections: The most common cause of fever is a viral infection, such as the common cold, flu, or other viral infections.
2. Bacterial infections: Bacterial infections, such as pneumonia, urinary tract infections, or strep throat, can also cause fever.
3. Other causes: Fever can also be caused by other medical conditions, such as:
* Reactions to medication or vaccines
* Cancer
* Autoimmune disorders
* Neurological disorders
* Inflammatory conditions
* Infections of the sinuses, ears, or eyes
* Inflammation of the heart or blood vessels
* Infections of the bone or joint
* Infections of the skin or soft tissue

Symptoms of Fever:

1. Elevated body temperature: A fever is defined as a body temperature above 98.6°F (3

## Step 8: Saving the model weights




In [27]:
llama_sft_trainer.model.save_pretrained("/content/drive/MyDrive/llama2_simplifying_medicine/llama2_simplifying_health_4bit")


## Step 9: Loading the model weights


In [30]:
from peft import PeftModel

new_model="/content/drive/MyDrive/llama2_simplifying_medicine/llama2_simplifying_health_4bit"
model_name = "aboonaji/llama2finetune-v2"
device_map = {"": 0}


In [None]:


# # Reload model in FP16 and merge it with LoRA weights
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map=device_map,
# )
# model = PeftModel.from_pretrained(base_model, new_model)
# model = model.merge_and_unload()

# # Reload tokenizer to save it
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [1]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [2]:
!huggingface-cli login

llama_model.push_to_hub("llama2_simplifying_health_4bit", check_pr=True)

llama_tokenizer.push_to_hub("llama2_simplifying_health_4bit",check_pr=True)



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGr

NameError: name 'llama_model' is not defined

## Step 8: User Interface

UI using Streamlit

In [None]:
import streamlit as st
from streamlit_chat import message as st_message

In [None]:
if "history" not in st.session_state:
    st.session_state.history = []

st.title("Simplifying Medicine")


def generate_answer():
    tokenizer, model = llama_tokenizer, llama_model
    user_message = st.session_state.input_text
    inputs = tokenizer(st.session_state.input_text, return_tensors="pt")
    result = model.generate(**inputs)
    message_bot = tokenizer.decode(
        result[0], skip_special_tokens=True
    )  # .replace("<s>", "").replace("</s>", "")

    st.session_state.history.append({"message": user_message, "is_user": True})
    st.session_state.history.append({"message": message_bot, "is_user": False})


st.text_input("Ask something", key="input_text", on_change=generate_answer)

for i, chat in enumerate(st.session_state.history):
    st_message(**chat, key=str(i)) #unpacking

UI using Gradio

In [None]:
SYSTEM_PROMPT = """<s>[INST] <<SYS>>
You are a helpful bot. Your answers are clear and concise.
<</SYS>>

"""

# Formatting function for message and history
def format_message(message: str, history: list, memory_limit: int = 3) -> str:
    """
    Formats the message and history for the Llama model.

    Parameters:
        message (str): Current message to send.
        history (list): Past conversation history.
        memory_limit (int): Limit on how many past interactions to consider.

    Returns:
        str: Formatted message string



    """
    # always keep len(history) <= memory_limit
    if len(history) > memory_limit:
        history = history[-memory_limit:]

    if len(history) == 0:
        return SYSTEM_PROMPT + f"{message} [/INST]"

    formatted_message = SYSTEM_PROMPT + f"{history[0][0]} [/INST] {history[0][1]} </s>"

    # Handle conversation history
    for user_msg, model_answer in history[1:]:
        formatted_message += f"<s>[INST] {user_msg} [/INST] {model_answer} </s>"

    # Handle the current message
    formatted_message += f"<s>[INST] {message} [/INST]"

    return formatted_message

In [None]:
# Generate a response from the Llama model
def get_llama_response(message: str, history: list) -> str:
    """
    Generates a conversational response from the Llama model.

    Parameters:
        message (str): User's input message.
        history (list): Past conversation history.

    Returns:
        str: Generated response from the Llama model.


    Implementation
        user_prompt = "Please tell me about Fever"
        text_generation_pipeline = pipeline(task = "text-generation", model = llama_model, tokenizer = llama_tokenizer, max_length = 150)
        model_answer = text_generation_pipeline(f"<s>[INST] {user_prompt} [/INST]")
        print(model_answer[0]['generated_text'])

    """
    query = format_message(message, history)
    response = ""

    sequences = pipeline(task = "text-generation", model = llama_model, tokenizer = llama_tokenizer, max_length = 150)
    model_answer = sequences(f"<s>[INST] {user_prompt} [/INST]")
    generated_text = model_answer[0]['generated_text']
    response = generated_text[len(query):]  # Remove the prompt from the output


    print("Chatbot:", response.strip())
    return response.strip()

In [None]:
!pip install gradio

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [None]:
import gradio as gr

gr.ChatInterface(get_llama_response).launch()


ModuleNotFoundError: No module named 'gradio'

## Step 9: Save the model

In [26]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
llama_model2 = llama_model

In [None]:
# Step 1: Define the save path on Google Drive
save_directory = '/content/drive/My Drive/llama_finetuned_model'

# Step 2: Save the model and tokenizer
llama_model.save_pretrained(save_directory)
llama_tokenizer.save_pretrained(save_directory)


NotImplementedError: You are calling `save_pretrained` on a 4-bit converted model. This is currently not supported