# **Finetune LLM on Custom Data, Run Locally and Deploy!** -Rinchen


In [1]:
%%capture
# Install Unsloth from its GitHub repository with the "colab-new" extra.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install the remaining training libraries without pulling in their own
# dependencies (--no-deps); trl is capped below 0.9.0.
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
#@title **Import necessary libraries**
from unsloth import FastLanguageModel
from datasets import Dataset, DatasetDict
import torch
import random

In [None]:
#@title **Choose your LLM of Choice**
# Loader settings; max_seq_length is reused later when the trainer is built.
max_seq_length = 2048
dtype = None          # None -> let the loader auto-detect the dtype
load_in_4bit = True   # 4-bit quantization reduces memory usage; may be set to False

# Reference list of available checkpoints (sizes / trainable-parameter counts
# are the author's own notes). The list itself is not read by the code below.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",   # 4.1GB
    "unsloth/tinyllama-it-bnb-4bit",               # 0.8GB
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",        # 5.7GB
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",              # 3.9GB??
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",     # 2.4GB
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",                # Instruct, 5.5GB, 50m trainable (r=8)
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",                # Instruct, 2.2GB, 40m trainable (r=32)
]

# Author's guidance on picking a checkpoint:
#   - a smaller model may be better when data is scarce, and is much faster to
#     test and run; use gemma 2b until 10k-20k data points are available
#   - switch to gemma 7b / llama 8b once more data and better hardware exist
selected_model_name = "unsloth/gemma-2b-it-bnb-4bit"
# selected_model_name = "unsloth/gemma-7b-it-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=selected_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Author's measurement for gemma 2b: avg_vram=3.2gb, 1%max_vram=5.6gb (approximate)

In [4]:
#@title **Combine Jsonls**
#(loads questions, answers and instruction files)
import json

def load_jsonl_texts(path):
    """Return the "text" field of every record in a JSON Lines file.

    Args:
        path: Path to a JSONL file in which each non-blank line is a JSON
            object carrying a "text" key.

    Returns:
        list: The "text" values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" key.
    """
    texts = []
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which is not UTF-8 on every system.
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. an extra empty line at the end
            texts.append(json.loads(line)["text"])
    return texts


instructions = load_jsonl_texts("instructions.jsonl")
inputs = load_jsonl_texts("queries.jsonl")
outputs = load_jsonl_texts("answers.jsonl")

# The three files are paired line by line. Fail loudly on a mismatch: zipping
# lists of different lengths would silently drop the unmatched tail.
if not (len(instructions) == len(inputs) == len(outputs)):
    raise ValueError(
        "JSONL files are misaligned: "
        f"{len(instructions)} instructions, {len(inputs)} queries, "
        f"{len(outputs)} answers"
    )

# Prompt layout: the instruction first, then the query and the answer, each
# under its own "###" header. The three {} slots are filled positionally.
alpaca_prompt = """{}

### query:
{}

### answer:
{}"""

# Original note: the EOS token must be appended to every training example.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Build the training text for one example.

    Reads the "instruction", "input" and "output" fields, fills them into
    ``alpaca_prompt``, appends ``EOS_TOKEN`` and returns the result under the
    "text" key.
    """
    prompt = alpaca_prompt.format(
        examples["instruction"],
        examples["input"],
        examples["output"],
    )
    return {"text": prompt + EOS_TOKEN}

# One record per (instruction, query, answer) triple taken from the three
# parallel lists; zip stops at the shortest list.
combined_data = [
    dict(zip(("instruction", "input", "output"), triple))
    for triple in zip(instructions, inputs, outputs)
]

#dataset = Dataset. from_list(combined_data)
#dataset = dataset.map(formatting_prompts_func)

In [None]:
#@title **Implement Train and Validation sets**
# Hold out 10% of the examples as a validation set (90/10 split).
#
# The shuffle is applied to a copy so that combined_data keeps its original
# order: re-running this cell then always produces the same split, instead of
# reshuffling data that a previous run had already shuffled in place.
random.seed(3407)
shuffled_data = list(combined_data)
random.shuffle(shuffled_data)

split_idx = int(0.9 * len(shuffled_data))
train_data = shuffled_data[:split_idx]
val_data = shuffled_data[split_idx:]

# With fewer than two examples one side of the split is empty.
if not train_data or not val_data:
    raise ValueError(
        "Need at least 2 examples to build a train/validation split; "
        f"got {len(shuffled_data)}."
    )

# Build the datasets and attach the formatted "text" column to each example.
train_dataset = Dataset.from_list(train_data).map(formatting_prompts_func)
val_dataset = Dataset.from_list(val_data).map(formatting_prompts_func)

datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

In [None]:
#@title **Add LoRA adapters**
# Author's note: AutoModelForCausalLM can be used as well, but
# FastLanguageModel is much faster.

# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    # LoRA rank, i.e. the size of the low-rank update matrices (typical values:
    # 8, 16, 32, 64, 128). Author's note: 64 is ideal but computationally
    # expensive and needs a lot of data to avoid overfitting.
    r=32,
    lora_alpha=16,
    # 0 is the optimized setting; a non-zero value (author's range: 0 to 0.4)
    # is used here for regularization.
    lora_dropout=0.2,
    # "none" is the optimized setting.
    # NOTE(review): PEFT documents the choices as "none", "all", "lora_only" — confirm.
    bias="none",
    use_rslora=False,
    loftq_config=None,
    # True or "unsloth"; the latter is suggested for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

In [None]:
#@title **Set Training Hyperparameters**
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from trl import SFTTrainer

# Optimisation schedule, precision, logging and checkpointing for the run.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    report_to="none",
    # Simulated batch size = per-device batch size x gradient accumulation steps.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Epoch-based run length. Step-based alternative from the original notes:
    #   warmup_steps=5 (5-10% of max_steps), max_steps=50
    num_train_epochs=4,
    warmup_ratio=0.1,            # author's guideline: 5-10%
    learning_rate=1e-4,          # reduced learning rate (author's range: 1e-4 to 5e-4)
    # Use bf16 when the hardware supports it, otherwise fall back to fp16.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,           # author's range: 0.01 to 0.03
    lr_scheduler_type="linear",
    # Evaluate and checkpoint every 10 steps.
    # NOTE(review): recent transformers releases rename evaluation_strategy to
    # eval_strategy — confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    load_best_model_at_end=True,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    # Packing is off; per the original note it can make training 5x faster
    # for short sequences.
    packing=False,
    args=training_args,
)

In [None]:
#@title **Show current stats**
# Divisor that converts a byte count into the "GB" figures printed below.
BYTES_PER_GIB = 1024 ** 3

# Device 0 properties and the peak reserved memory before training starts;
# start_gpu_memory and max_memory are read again by the final-stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Total VRAM = {max_memory} GB.")
print(f"{start_gpu_memory} GB VRAM Occupied.")

# Dataset size plus the first record of each loaded list as a sanity check.
print(f"Number of Datapoints: {len(instructions)}")
print(f"Sample instruction: {instructions[0]}")
print(f"Sample input:       {inputs[0]}")
print(f"Sample output:      {outputs[0]}")

In [None]:
#@title **Train your model**
# Run fine-tuning; the returned stats object is read by the final-stats cell.
# Author's rule of thumb: try to get as close as possible to 1.0
# (presumably the reported loss — TODO confirm).
trainer_stats = trainer.train()
# The error may differ between two runs even if all hyperparameters are the same.

In [None]:
#@title Show final stats
# Divisor that converts a byte count into the "GB" figures printed below.
BYTES_PER_GIB = 1024 ** 3

# Peak reserved memory after training, and the share of it added since the
# pre-training snapshot (start_gpu_memory).
used_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

training_minutes = round(trainer_stats.metrics['train_runtime'] / 60, 2)

print(f"{training_minutes} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
print(trainer_stats)

In [9]:
#@title **Clear cache (VRAM)**
# Ask PyTorch to release its cached, currently unused GPU memory.
# Author's observation: this works for the 2b model but not for the 7b one
# (cause unknown).
# NOTE(review): memory still referenced by live objects such as model/trainer
# is presumably not released by this call — confirm.
torch.cuda.empty_cache()

In [None]:
#@title **Test the model on command line**
from transformers import pipeline
from unsloth import FastLanguageModel

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Instruction placed in the first slot of the prompt template at inference time.
SUPPORT_INSTRUCTION = "You are a customer support agent from Schneider Electric, please provide customer support regarding the VinPlus software."


def chat_with_model():
    """Run an interactive question/answer loop on stdin/stdout.

    Each user line is formatted with ``alpaca_prompt`` — the same template the
    training examples were built with — so the prompt seen at inference has
    the same layout (including blank lines) as the fine-tuning data. Typing
    "exit" or "quit" (any case) ends the loop.
    """
    while True:
        user_input = input("Rinchen: ")
        if user_input.strip().lower() in ['exit', 'quit']:
            print("Vinbot: Goodbye!")
            break

        # Leave the answer slot empty so the model completes it.
        prompt = alpaca_prompt.format(SUPPORT_INSTRUCTION, user_input, "")
        model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
        generated = model.generate(**model_inputs, max_new_tokens=512, use_cache=True)

        # The generated sequence starts with the prompt tokens. Decode only the
        # tokens after the prompt rather than splitting the decoded text on
        # "### answer:", which breaks when the user's text contains that marker.
        prompt_length = model_inputs["input_ids"].shape[1]
        answer = tokenizer.decode(
            generated[0][prompt_length:], skip_special_tokens=True
        ).strip()
        print("VinBot: ", answer)


# Start the interactive chat
chat_with_model()

In [None]:
#@title Save trained model locally (LoRA adapters)
# Simple local saving: stores only the LoRA adapters together with the tokenizer.
# A relative directory is used instead of a user-specific absolute path so the
# notebook runs on any machine, and so the output lands in the "lora_model"
# directory that the deployment app loads from.
LORA_OUTPUT_DIR = "lora_model"
model.save_pretrained(LORA_OUTPUT_DIR)
tokenizer.save_pretrained(LORA_OUTPUT_DIR)

# **Deployment**
Load the pretrained model and deploy it (ideally, do this in a separate notebook).

In [11]:
%%capture
# Python packages for the web app and tunnelling.
# NOTE(review): ngrok/pyngrok are not used in the visible cells — confirm they are needed.
!pip install streamlit ngrok pyngrok
# localtunnel is what the last cell runs (via npx) to expose the Streamlit port.
!npm install localtunnel

In [None]:
#@title **Check IP for tunnel password**
# Fetch the value to enter as the localtunnel password (per the cell title,
# this is the host's IP address).
!curl https://loca.lt/mytunnelpassword

In [None]:
#@title **Chat with Trained model**

with open('app.py', 'w') as f:
    f.write("""
import streamlit as st
from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = None  # None for auto detection
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# Define a custom device map for GPU only
device_map = {
    "": "cuda:0",  # Ensure everything is on GPU
}

@st.cache_resource
def load_model():
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        device_map=device_map
    )
    FastLanguageModel.for_inference(model)
    return model, tokenizer

# Load the model and tokenizer with caching
if 'model' not in st.session_state or 'tokenizer' not in st.session_state:
    st.session_state['model'], st.session_state['tokenizer'] = load_model()

model = st.session_state['model']
tokenizer = st.session_state['tokenizer']

# Define a function to generate a response
def generate_response(query):
    inputs = tokenizer(
        f\"\"\"You are a customer support agent from Schneider Electric, please provide customer support regarding the VinPlus software.\n
        ### query:
        {query}
        ### answer:
        \"\"\",
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(inputs.input_ids, max_new_tokens=512, use_cache=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("### answer:")[1].strip()

st.title("VinBOT")
st.write("Ask me about VinPlus!")

# Initialize the conversation history
if 'messages' not in st.session_state:
    st.session_state['messages'] = []

# Display chat messages from history on app rerun
for message in st.session_state['messages']:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Accept user input
if prompt := st.chat_input("Rinchen:"):
    # Add user message to chat history
    st.session_state['messages'].append({"role": "user", "content": prompt})
    # Display user message in chat message container
    with st.chat_message("user"):
        st.markdown(prompt)

    # Generate and display assistant response
    response = generate_response(prompt)
    with st.chat_message("assistant"):
        st.markdown(response)

    # Add assistant response to chat history
    st.session_state['messages'].append({"role": "assistant", "content": response})

    """)

# Run the Streamlit app
!streamlit run app.py & npx localtunnel --port 8501