# Customer Support LLM Fine-tuning - Google Colab

This notebook trains a customer support LLM using QLoRA (4-bit quantization) on Google Colab with GPU support.

**Model: Mistral-7B-Instruct**

## Setup

1. Upload your training data to Colab
2. Run all cells
3. Download the trained adapter when complete

## 1. Install Dependencies

In [11]:
!pip install -q torch transformers accelerate peft datasets bitsandbytes sentencepiece pydantic

## 2. Configuration

In [None]:
# ---- Base model ------------------------------------------------------------
# Hugging Face Hub identifier of the model that will be fine-tuned.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

# ---- Dataset locations (files are expected inside the Colab runtime) --------
TRAIN_FILE = "/content/train.jsonl"  # required training split
VAL_FILE = "/content/val.jsonl"  # validation split; skipped later if the file is absent

# ---- LoRA adapter settings --------------------------------------------------
LORA_R = 16  # passed to LoraConfig as `r`
LORA_ALPHA = 32  # passed to LoraConfig as `lora_alpha`
LORA_DROPOUT = 0.05  # passed to LoraConfig as `lora_dropout`

# ---- Optimisation settings --------------------------------------------------
BATCH_SIZE = 2  # per-device batch size for both training and evaluation
GRADIENT_ACCUMULATION_STEPS = 4
NUM_EPOCHS = 5
LEARNING_RATE = 1e-4
MAX_SEQ_LENGTH = 1024  # every example is truncated/padded to this many tokens
SAVE_STEPS = 25  # checkpoint interval; also reused as the evaluation interval

# ---- Where checkpoints and the final adapter are written ---------------------
OUTPUT_DIR = "/content/outputs/customer_support_adapter"

# Echo the key settings so they are visible in the notebook output.
print(
    "Configuration:",
    f"  Model: {MODEL_ID}",
    f"  Training file: {TRAIN_FILE}",
    f"  Output: {OUTPUT_DIR}",
    sep="\n",
)

Training Configuration
Model: Qwen/Qwen2.5-7B-Instruct
Batch size: 2
Epochs: 5
Output directory: /content/outputs/customer_support_adapter

⚠ Next: Mount Google Drive and configure data file paths in the next cell


## 3. Upload Training Data

Upload your `train.jsonl` and `val.jsonl` files using the file browser on the left, or use the code below:

In [None]:
from google.colab import files
import os

# Ensure the upload target exists; exist_ok=True makes this safe to re-run.
os.makedirs("/content", exist_ok=True)

# Tell the user which files are expected and how they can be provided.
upload_instructions = (
    "Please upload your training files:",
    "1. train.jsonl",
    "2. val.jsonl (optional)",
    "\nUse the file browser on the left, or run:",
    "  files.upload()",
)
for instruction in upload_instructions:
    print(instruction)

# Uncomment to use file uploader:
# uploaded = files.upload()
# for filename in uploaded.keys():
#     print(f'Uploaded {filename}')

Mounting Google Drive...


ValueError: mount failed

## 4. Load Model with QLoRA (4-bit Quantization)

### 4.1. Load Model and Tokenizer

Mistral doesn't require special authentication - we can load it directly.

In [None]:
# No Hugging Face login is performed in this notebook; this cell only
# confirms that the model-loading cell can be run next.
ready_message = "✓ Ready to load Mistral model (no authentication needed)"
print(ready_message)

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from peft import prepare_model_for_kbit_training

# Report which accelerator is available before starting the (large) download.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    gpu_properties = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_properties.total_memory / 1e9:.2f} GB")
else:
    # This notebook is written for a GPU runtime (4-bit loading, fp16 training).
    print("⚠ No CUDA GPU detected - select a GPU runtime before loading the model.")

# trust_remote_code=True would allow the Hub repository to execute arbitrary
# Python while loading. The configured model uses an architecture that ships
# with transformers, so remote code is not needed; only enable this for a
# repository you have reviewed.
TRUST_REMOTE_CODE = False

# 4-bit NF4 quantization with double quantization; matmuls are computed in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=TRUST_REMOTE_CODE)
if tokenizer.pad_token is None:
    # The tokenizer defines no pad token, so reuse EOS so that padding works.
    tokenizer.pad_token = tokenizer.eos_token

print("Loading model with 4-bit quantization (QLoRA)...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=TRUST_REMOTE_CODE,
    torch_dtype=torch.float16,
)

# Prepare the quantized model for k-bit training before LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)

print("✓ Model loaded with 4-bit quantization")

Using device: cuda
GPU: Tesla T4
GPU Memory: 15.83 GB

Loading tokenizer...
Loading model with 4-bit quantization (QLoRA)...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

✓ Model loaded with 4-bit quantization


In [None]:
from peft import LoraConfig, get_peft_model

# Candidate LoRA target modules. The full set (attention projections plus the
# gate/up/down projections) is the default; Mistral/Llama architectures use
# the attention projections only.
ALL_PROJECTION_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
ATTENTION_PROJECTION_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]

# Read the architecture name defensively instead of hiding failures behind a
# bare `except: pass`: a missing config, a missing/empty `architectures`
# attribute, or a non-string entry all fall back to the default module set.
model_config = getattr(model, "config", None)
architectures = getattr(model_config, "architectures", None) or []
arch = architectures[0] if architectures else ""
if not isinstance(arch, str):
    arch = ""

if "Qwen" not in arch and ("Mistral" in arch or "Llama" in arch):
    target_modules = list(ATTENTION_PROJECTION_MODULES)
else:
    target_modules = list(ALL_PROJECTION_MODULES)

print(f"LoRA target modules: {target_modules}")

# Configure LoRA from the values defined in the configuration cell.
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=target_modules,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model so that only the LoRA adapter weights are trained.
model = get_peft_model(model, lora_config)

# Print trainable parameters
def print_trainable_parameters(model):
    """Print how many of the model's parameters will receive gradients.

    Walks `model.named_parameters()` once, summing `numel()` over every
    parameter and, separately, over those with `requires_grad` set, then
    prints both totals and the trainable share as a percentage.

    Args:
        model: Any object exposing `named_parameters()` that yields
            `(name, parameter)` pairs.
    """
    total_count = 0
    trainable_count = 0
    for _name, tensor in model.named_parameters():
        size = tensor.numel()
        total_count += size
        if tensor.requires_grad:
            trainable_count += size

    trainable_pct = 100 * trainable_count / total_count
    summary = " || ".join(
        [
            f"Trainable params: {trainable_count:,}",
            f"All params: {total_count:,}",
            f"Trainable%: {trainable_pct:.4f}%",
        ]
    )
    print(summary)

# Report the trainable/total parameter counts of the LoRA-wrapped model.
print_trainable_parameters(model)

LoRA target modules: ['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
Trainable params: 40,370,176 || All params: 4,393,342,464 || Trainable%: 0.9189%


In [None]:
from datasets import load_dataset

def format_messages(messages, system_prompt=None):
    """Render one conversation as a single training string.

    When the module-level `tokenizer` has a chat template, the conversation
    (optionally preceded by a system turn) is rendered through
    `tokenizer.apply_chat_template` without tokenizing and without a
    generation prompt. Otherwise each turn becomes a "Role: content" line and
    the lines are joined with newlines.

    Args:
        messages: Iterable of dicts with "role" and "content" keys.
        system_prompt: Optional system text placed before the first turn.

    Returns:
        Whatever `apply_chat_template` returns, or the newline-joined fallback string.
    """
    if not tokenizer.chat_template:
        # No template available: plain "Role: content" transcript.
        lines = [f"System: {system_prompt}"] if system_prompt else []
        lines += [f"{turn['role'].capitalize()}: {turn['content']}" for turn in messages]
        return "\n".join(lines)

    # Keep only the role/content keys so extra fields never reach the template.
    conversation = [{"role": "system", "content": system_prompt}] if system_prompt else []
    for turn in messages:
        conversation.append({"role": turn["role"], "content": turn["content"]})

    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )

def tokenize_function(examples, tokenizer, max_length):
    """Tokenize a batch of conversations for causal-LM training.

    Args:
        examples: Batch mapping with a "messages" column (one list of
            role/content dicts per example) and, optionally, a
            "system_prompt" column aligned with it.
        tokenizer: Callable tokenizer; invoked with truncation enabled and
            padding="max_length".
        max_length: Length every example is truncated/padded to.

    Returns:
        The tokenizer output with an added "labels" entry. Labels mirror
        `input_ids`, except that positions whose attention mask is 0
        (padding) are set to -100 so they do not contribute to the loss.
        If the tokenizer returns no attention mask, labels are plain copies
        of `input_ids`.
    """
    ignore_index = -100  # label value skipped by the cross-entropy loss

    conversations = examples["messages"]
    # Look the optional column up once instead of once per example.
    system_prompts = examples.get("system_prompt", [None] * len(conversations))
    texts = [
        format_messages(messages, system_prompts[i])
        for i, messages in enumerate(conversations)
    ]

    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors=None,
    )

    if "attention_mask" in tokenized:
        # Every example is padded to max_length, so padding must be masked
        # out of the labels; otherwise the loss would be computed on it.
        tokenized["labels"] = [
            [token if keep else ignore_index for token, keep in zip(ids, mask)]
            for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]
    else:
        tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]

    return tokenized

import os  # imported here so this cell does not rely on an earlier cell's import


def _tokenize_split(dataset):
    """Tokenize every example of `dataset` and drop its original columns."""
    return dataset.map(
        lambda examples: tokenize_function(examples, tokenizer, MAX_SEQ_LENGTH),
        batched=True,
        remove_columns=dataset.column_names,
    )


# Fail fast with an actionable message when the training file is missing.
if not os.path.exists(TRAIN_FILE):
    raise FileNotFoundError(
        f"Training file not found: {TRAIN_FILE}. "
        "Upload train.jsonl to the Colab runtime or update TRAIN_FILE."
    )

# Load datasets
print("Loading training dataset...")
train_dataset = load_dataset("json", data_files=TRAIN_FILE, split="train")
print(f"Loaded {len(train_dataset)} training examples")

# The validation split is optional: it is used only if the file exists.
if os.path.exists(VAL_FILE):
    print("Loading validation dataset...")
    # A single JSON file is exposed by `load_dataset` under the "train" split name.
    val_dataset = load_dataset("json", data_files=VAL_FILE, split="train")
    print(f"Loaded {len(val_dataset)} validation examples")
else:
    print("No validation file found, skipping validation")
    val_dataset = None

# Tokenize
print("\nTokenizing datasets...")
tokenized_train = _tokenize_split(train_dataset)
# A missing (or empty) validation set leaves tokenized_val as None.
tokenized_val = _tokenize_split(val_dataset) if val_dataset else None

print("✓ Datasets prepared")

Checking Data Files...


FileNotFoundError: 
❌ ERROR: Training file not found: /content/data/splits/train.jsonl
Please upload train.jsonl using one of the methods in the previous cell.
See DATA_UPLOAD_GUIDE.md for detailed instructions.

In [None]:
from transformers import (
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

from transformers import default_data_collator

# Evaluate at the same interval as checkpointing, but only when a validation
# split was prepared.
eval_steps = SAVE_STEPS if tokenized_val else None

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=True,  # Use FP16 on GPU
    logging_steps=10,
    save_steps=SAVE_STEPS,
    save_total_limit=3,
    report_to="none",
    remove_unused_columns=False,
    eval_strategy="steps" if tokenized_val else "no",
    eval_steps=eval_steps,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=50,
)


def collate_causal_lm(features):
    """Batch pre-padded examples and build labels from the attention mask.

    DataCollatorForLanguageModeling(mlm=False) replaces every label equal to
    `pad_token_id` with -100. This notebook sets `pad_token = eos_token` when
    the tokenizer has no pad token, so that collator would also exclude every
    genuine end-of-sequence token from the loss. Masking by attention mask
    ignores only real padding.

    Args:
        features: List of tokenized examples, each with equally long
            "input_ids" and "attention_mask" sequences.

    Returns:
        A dict of tensors with "labels" equal to "input_ids", except -100
        wherever the attention mask is 0.
    """
    batch = default_data_collator(features)
    labels = batch["input_ids"].clone()
    labels[batch["attention_mask"] == 0] = -100
    batch["labels"] = labels
    return batch


# Data collator (causal LM: labels are the inputs, padding excluded from the loss)
data_collator = collate_causal_lm

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

print("✓ Trainer configured")

## 8. Train Model

In [None]:
# Rough step estimate: full optimizer steps per epoch times the number of epochs.
effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
steps_per_epoch = len(tokenized_train) // effective_batch_size
estimated_total_steps = steps_per_epoch * NUM_EPOCHS

print("Starting training...")
print(f"Total steps: ~{estimated_total_steps}")
print("Estimated time: ~2-4 hours on T4 GPU\n")

# Run the fine-tuning loop configured in the Trainer cell.
trainer.train()

print("\n✓ Training complete!")

## 9. Save Final Model

In [None]:
# Persist the trained model and the tokenizer together under OUTPUT_DIR.
print(f"Saving final model to {OUTPUT_DIR}...")
trainer.save_model()  # no path given: the Trainer was configured with output_dir=OUTPUT_DIR
tokenizer.save_pretrained(OUTPUT_DIR)

print("✓ Model saved", f"\nAdapter saved to: {OUTPUT_DIR}", sep="\n")

In [None]:
# NOTE(review): this cell is an exact repeat of the save cell directly above it.
# Running it presumably just rewrites the same files in OUTPUT_DIR - consider removing it.
print(f"Saving final model to {OUTPUT_DIR}...")
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)

print("✓ Model saved")
print(f"\nAdapter saved to: {OUTPUT_DIR}")

## 10. Download Adapter

In [None]:
import os
import shutil
from google.colab import files

# Refuse to zip a directory that was never written (e.g. the save cell was skipped).
if not os.path.isdir(OUTPUT_DIR):
    raise FileNotFoundError(f"Adapter directory not found: {OUTPUT_DIR}. Run the save cell first.")

# Create zip file
zip_path = "/content/customer_support_adapter.zip"
# make_archive appends ".zip" itself, so strip only the real extension
# (str.replace would also alter any ".zip" elsewhere in the path).
archive_base, _ = os.path.splitext(zip_path)
shutil.make_archive(archive_base, "zip", OUTPUT_DIR)

print(f"Adapter zipped to: {zip_path}")
print("\nDownloading adapter...")
files.download(zip_path)

# Name the suggested local folder after the adapter directory that was just
# zipped, rather than a hard-coded run name from a different model.
adapter_name = os.path.basename(os.path.normpath(OUTPUT_DIR))
local_adapter_dir = f"outputs/{adapter_name}"

print("\n✓ Download complete!")
print("\nTo use this adapter:")
print("1. Extract the zip file")
print(f"2. Copy the adapter to: {local_adapter_dir}/")
print(f"3. Update inference server: export ADAPTER_DIR={local_adapter_dir}")