### Fine Tuning LLAMA 3.2 model for customer service dataset

In [1]:
import pandas as pd
from datasets import Dataset
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model
import os

os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Load data
df = pd.read_csv('Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')
df = df.head(10)


  from .autonotebook import tqdm as notebook_tqdm
  Referenced from: <0B7EB158-53DC-3403-8A49-22178CAB4612> /Users/koyiljonvaliev/Anaconda/anaconda3/envs/llm_env/lib/python3.10/site-packages/torchvision/image.so
  warn(


In [2]:
def prepare_dataset(df):
    conversations = []
    for _, row in df.iterrows():
        text = f"### User: {row['instruction']}\n### Assistant: {row['response']}\n"
        # Create input-label pairs
        conversations.append({
            "input": text,
            "label": text
        })
    return conversations

dataset = Dataset.from_list(prepare_dataset(df))

In [3]:
# Model setup
model_name = "unsloth/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": device},
    max_memory={"mps": "4GB"},
    torch_dtype=torch.float32
)

# Enable gradient calculation
for param in model.parameters():
    param.requires_grad = True




lora_config = LoraConfig(
    r=4,
    lora_alpha=4,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [4]:


def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        truncation=True,
        max_length=256,
        padding="max_length"
    )
    labels = tokenizer(
        examples["label"],
        truncation=True,
        max_length=256,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)

training_args = TrainingArguments(
    output_dir="customer_support_model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=1e-4,
    num_train_epochs=1,
    logging_steps=1,
    save_steps=5,
    fp16=False,
    optim="adamw_torch",
    max_grad_norm=0.3,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8)
)




python(41801) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Map: 100%|██████████| 10/10 [00:00<00:00, 35.28 examples/s]


In [5]:
trainer.train()

 10%|█         | 1/10 [02:23<21:29, 143.31s/it]

{'loss': 1.8739, 'grad_norm': 0.20654726028442383, 'learning_rate': 9e-05, 'epoch': 0.1}


 20%|██        | 2/10 [05:05<20:33, 154.19s/it]

{'loss': 2.1081, 'grad_norm': 0.1920340657234192, 'learning_rate': 8e-05, 'epoch': 0.2}


 30%|███       | 3/10 [07:44<18:14, 156.41s/it]

{'loss': 9.5785, 'grad_norm': 12.67580795288086, 'learning_rate': 7e-05, 'epoch': 0.3}


 40%|████      | 4/10 [09:46<14:17, 142.91s/it]

{'loss': 1.899, 'grad_norm': 0.22582542896270752, 'learning_rate': 6e-05, 'epoch': 0.4}


 50%|█████     | 5/10 [11:44<11:09, 133.92s/it]

{'loss': 2.0857, 'grad_norm': 0.24045082926750183, 'learning_rate': 5e-05, 'epoch': 0.5}


 60%|██████    | 6/10 [14:07<09:07, 136.95s/it]

{'loss': 2.3368, 'grad_norm': 0.4427778720855713, 'learning_rate': 4e-05, 'epoch': 0.6}


 70%|███████   | 7/10 [16:23<06:50, 136.88s/it]

{'loss': 10.2044, 'grad_norm': 5.823383808135986, 'learning_rate': 3e-05, 'epoch': 0.7}


 80%|████████  | 8/10 [18:47<04:37, 138.99s/it]

{'loss': 1.7591, 'grad_norm': 0.24742580950260162, 'learning_rate': 2e-05, 'epoch': 0.8}


 90%|█████████ | 9/10 [21:27<02:25, 145.62s/it]

{'loss': 2.4258, 'grad_norm': 10.396319389343262, 'learning_rate': 1e-05, 'epoch': 0.9}


100%|██████████| 10/10 [23:52<00:00, 145.27s/it]

{'loss': 1.6967, 'grad_norm': 0.24008473753929138, 'learning_rate': 0.0, 'epoch': 1.0}


100%|██████████| 10/10 [23:53<00:00, 143.30s/it]

{'train_runtime': 1433.0566, 'train_samples_per_second': 0.007, 'train_steps_per_second': 0.007, 'train_loss': 3.596799600124359, 'epoch': 1.0}





TrainOutput(global_step=10, training_loss=3.596799600124359, metrics={'train_runtime': 1433.0566, 'train_samples_per_second': 0.007, 'train_steps_per_second': 0.007, 'total_flos': 43313576017920.0, 'train_loss': 3.596799600124359, 'epoch': 1.0})

In [6]:
model.save_pretrained("customer_support_model")
tokenizer.save_pretrained("customer_support_model")

('customer_support_model/tokenizer_config.json',
 'customer_support_model/special_tokens_map.json',
 'customer_support_model/tokenizer.json')