In [None]:
import os
import torch
try:
  import unsloth
  from unsloth import FastLanguageModel
  from unsloth.chat_templates import get_chat_template, standardize_data_formats, train_on_responses_only
except:
  # must change runtime to nvidia GPU
  !pip install unsloth -q
  import unsloth
  from unsloth import FastLanguageModel
  from unsloth.chat_templates import get_chat_template, standardize_data_formats, train_on_responses_only
!pip install transformers -U -q
from transformers import TrainingArguments
try:
  from trl import SFTTrainer, SFTConfig
except:
  !pip install trl -q
  from trl import SFTTrainer, SFTConfig
import wandb
!pip install -U bitsandbytes -q
from datasets import load_dataset, DatasetDict

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-metadata 1.17.1 requires protobuf<6.0.0,>=4.25.2; python_version >= "3.11", but you have protobuf 3.20.3 which is incompatible.
grpcio-status 1.71.0 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 3.20.3 which is incompatible.
ydf 0.11.0 requires protobuf<6.0.0,>=5.29.1, but you have protobuf 3.20.3 which is incompatible.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2024.12.0 which is incompatible.[0m[31m
[0m🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Mount Google Drive at /content/drive. OUTPUT_DIR (defined in a later cell)
# points below /content/drive/MyDrive, so this must run before anything is
# written there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 2.1: Select Base Model & Load with Unsloth
# These four values are passed to FastLanguageModel.from_pretrained in the
# model-loading cell.
BASE_MODEL_NAME = "unsloth/gemma-3-4b-it"
MAX_SEQ_LENGTH = 2048 # Maximum sequence length; choose based on model and VRAM (Gemma 3 supports long contexts)
DTYPE = None # None for auto detection. Set to torch.float16 for fp16, torch.bfloat16 for bf16
LOAD_IN_4BIT = True # Load the model with 4-bit quantization

# --- Dataset Configuration (CSV) ---
# The dataset is read from a CSV file. The path is relative, so it is resolved
# against the notebook's current working directory.
DATASET_NAME = "train_df_formatted.csv" # Path to the CSV file
DATASET_TEXT_FIELD = "text" # The CSV column containing the pre-formatted text
VALIDATION_SET_SIZE = 0.1 # Fraction of the data held out for validation (10%)

In [None]:
# Folder on the mounted Drive that receives checkpoints and the final adapters.
OUTPUT_DIR = "/content/drive/MyDrive/PROJECTS/Customer Feedback ML Engineer Project/Gemma 3 Parameters/unsloth_gemma3_finetuned_adapters_csv"

# Report whether the folder is already present; create it when it is missing.
if not os.path.exists(OUTPUT_DIR):
  print(f"Output directory '{OUTPUT_DIR}' does not exist. Creating it...")
  os.makedirs(OUTPUT_DIR, exist_ok=True)
else:
  print(f"Output directory '{OUTPUT_DIR}' exists.")

Output directory '/content/drive/MyDrive/PROJECTS/Customer Feedback ML Engineer Project/Gemma 3 Parameters/unsloth_gemma3_finetuned_adapters_csv' exists.


In [None]:
# Step 2.2: Configure LoRA & Training Arguments
LORA_R = 8           # LoRA rank (Gemma 3 notebook example uses 8)
LORA_ALPHA = 8       # LoRA alpha (Gemma 3 notebook example uses 8)
LORA_DROPOUT = 0.0   # LoRA dropout (Gemma 3 notebook example uses 0)
LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj"] # Recommended default

# Training Arguments
NUM_TRAIN_EPOCHS = 1 # Low number for efficiency as requested
PER_DEVICE_TRAIN_BATCH_SIZE = 16 # Adjust based on GPU VRAM
PER_DEVICE_EVAL_BATCH_SIZE = 16  # Adjust based on GPU VRAM
GRADIENT_ACCUMULATION_STEPS = 4 # Adjust based on GPU VRAM and desired effective batch size
LEARNING_RATE = 2e-4 # Gemma 3 notebook example uses 2e-4 (reduce for longer runs)
WEIGHT_DECAY = 0.01
OPTIMIZER = "adamw_8bit" # Unsloth recommends paged_adamw_8bit or adamw_8bit
LR_SCHEDULER_TYPE = "linear" # Or "cosine"
WARMUP_STEPS = 5 # Gemma 3 notebook example uses 5
LOGGING_STEPS = 1 # Log frequently for monitoring
EVAL_STEPS = 50 # Evaluate periodically (adjust based on dataset size)
SAVE_STEPS = 100 # Save checkpoints periodically (adjust based on dataset size)
FP16 = not torch.cuda.is_bf16_supported() # Use fp16 if bf16 not supported
BF16 = torch.cuda.is_bf16_supported()     # Use bf16 if supported
# DATASET_NUM_PROC = 2 # Removed as mapping is simplified

# Weights & Biases Configuration
USE_WANDB = True # Set to True to enable W&B logging
WANDB_PROJECT_NAME = "unsloth-gemma3-finetuning-project-csv" # Your W&B project name

In [None]:
from google.colab import userdata
import os
import wandb  # It's good practice to import wandb if you're using it


# Configure Weights & Biases for this session.
#
# Reads:  USE_WANDB (from the configuration cell) and the Colab secret
#         'WANDB_API_KEY'.
# Writes: os.environ['WANDB_API_KEY'] when a key is available,
#         os.environ['WANDB_DISABLED'] = "true" whenever logging ends up off,
#         and USE_WANDB = False when login is not possible.
if USE_WANDB:
    # Touch the secret store only when W&B is actually requested, and treat a
    # missing/inaccessible secret as "no key" so the fallback below can run
    # instead of the cell aborting on the lookup.
    try:
        wandb_api_key = userdata.get('WANDB_API_KEY')
    except Exception as e:
        print(f"WARNING: Could not read WANDB_API_KEY from Colab secrets: {e}")
        wandb_api_key = None
    if wandb_api_key:
        os.environ['WANDB_API_KEY'] = wandb_api_key

    if "WANDB_API_KEY" in os.environ:
        try:
            wandb.login()
            # Clear a disable flag left behind by an earlier run of this cell.
            os.environ.pop("WANDB_DISABLED", None)
            print("Weights & Biases logging enabled.")
        except Exception as e:
            print(f"WARNING: Failed to login to W&B: {e}. Disabling W&B.")
            USE_WANDB = False
            os.environ["WANDB_DISABLED"] = "true"
    else:
        print("WARNING: WANDB_API_KEY environment variable not found. Disabling W&B.")
        USE_WANDB = False
        os.environ["WANDB_DISABLED"] = "true"
else:
    print("Weights & Biases logging disabled.")
    os.environ["WANDB_DISABLED"] = "true"

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnolanrobbins5934[0m ([33mnolanrobbins[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Weights & Biases logging enabled.


In [None]:
# --- Step 2.1: Load Base Model & Tokenizer with Unsloth ---
print(f"Loading base model: {BASE_MODEL_NAME}")

# Gather the loader options from the configuration cell, then hand them to
# Unsloth in a single call.
model_load_options = {
    "model_name": BASE_MODEL_NAME,
    "max_seq_length": MAX_SEQ_LENGTH,
    "dtype": DTYPE,
    "load_in_4bit": LOAD_IN_4BIT,
}
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

print("Model and tokenizer loaded.")

Loading base model: unsloth/gemma-3-4b-it
==((====))==  Unsloth 2025.4.1: Fast Gemma3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

Model and tokenizer loaded.


In [None]:
# --- Apply Gemma 3 Chat Template (Important for correct tokenization) ---
# The training text is already pre-formatted, but the tokenizer is still passed
# through get_chat_template with the "gemma-3" template so it is set up
# consistently with turn markers such as <start_of_turn> / <end_of_turn>.
# NOTE(review): exactly what get_chat_template changes on the tokenizer is not
# visible in this notebook - confirm against the Unsloth documentation.
print("Applying Gemma 3 chat template to tokenizer...")
tokenizer = get_chat_template(
    tokenizer,
    chat_template="gemma-3", # Explicitly set Gemma 3 template
)
# Fallback: if the tokenizer has no pad token, register "<pad>" and resize the
# model's token embeddings to the new tokenizer length. In the recorded run the
# "Added pad token" message was not printed, so this branch did not execute.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
    model.resize_token_embeddings(len(tokenizer))
    print("Added pad token to tokenizer.")

print("Chat template knowledge applied to tokenizer.")

Applying Gemma 3 chat template to tokenizer...
Chat template knowledge applied to tokenizer.


In [None]:
# --- Step 2.2 (LoRA Configuration integrated with Model Loading) ---
# Wrap the loaded base model with LoRA adapters using the hyperparameters from
# the configuration cell, then print the trainable-parameter summary.
#
# Re-running this cell on a model that already carries adapters makes Unsloth
# raise "You already added LoRA adapters to your model!", so the wrapping step
# is skipped when the model already exposes a PEFT configuration.
#
# NOTE(review): target_modules lists only q/k/v projections while
# finetune_mlp_modules=True asks for MLP layers as well. Confirm which of the
# two Unsloth honours when both are given, and whether
# finetune_vision_layers=False still applies with an explicit target list.
if hasattr(model, "peft_config"):
    print("LoRA adapters are already attached to the model; skipping. "
          "Reload the base model first to apply a different LoRA configuration.")
else:
    print("Applying LoRA configuration...")
    model = FastLanguageModel.get_peft_model(
        model,
        r=LORA_R,
        target_modules=LORA_TARGET_MODULES,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        use_gradient_checkpointing=True,
        random_state=42,
        max_seq_length=MAX_SEQ_LENGTH,
        finetune_vision_layers=False,
        finetune_language_layers=True,
        finetune_attention_modules=True,
        finetune_mlp_modules=True,
    )
    print("LoRA configuration applied.")
model.print_trainable_parameters()

Applying LoRA configuration...


RuntimeError: Unsloth: You already added LoRA adapters to your model!

How to read the output of `model.print_trainable_parameters()`:

1. Total Parameters (all params: 4,304,775,536): This confirms you've loaded the Gemma 3 4B model correctly, as it's in the ~4 billion parameter range.
2. Trainable Parameters (trainable params: 4,696,064): This number is much smaller than the total. This is the key benefit of LoRA! You're only training the small adapter layers (~4.7 million parameters) that Unsloth added, not the entire model.
3. Trainable Percentage (trainable%: 0.1091): This low percentage (~0.11%) directly reflects the efficiency of LoRA. You're updating only a tiny fraction of the model's weights.

In short: Your setup is correctly configured for efficient LoRA fine-tuning. The small number of trainable parameters is precisely what makes this process faster and less memory-intensive than full fine-tuning. You're ready to proceed with the next steps!

In [None]:
# --- Data Loading and Preparation (Updated for CSV) ---
# Reads DATASET_NAME as a CSV, checks that DATASET_TEXT_FIELD is one of its
# columns, and splits it into `train_dataset` / `valid_dataset` using
# VALIDATION_SET_SIZE. Any failure is reported and then re-raised.
print("Loading and preparing dataset from CSV...")

try:
    # Load the full dataset from the CSV file
    print(f"Loading dataset from: {DATASET_NAME}")
    full_dataset = load_dataset("csv", data_files=DATASET_NAME, split="train")
    print(f"Full dataset loaded. Size: {len(full_dataset)}")

    # Basic check for the text field
    if DATASET_TEXT_FIELD not in full_dataset.column_names:
        raise ValueError(f"Dataset missing required text field: '{DATASET_TEXT_FIELD}'")
    print(f"Found text field '{DATASET_TEXT_FIELD}'.")

    # Split the dataset into training and validation sets
    print(f"Splitting dataset into train/validation (test_size={VALIDATION_SET_SIZE})...")
    split_dataset = full_dataset.train_test_split(test_size=VALIDATION_SET_SIZE, seed=42) # Fixed seed for a reproducible split

    train_dataset = split_dataset["train"]
    valid_dataset = split_dataset["test"] # The "test" split is used as the validation set

    print(f"Dataset split. Train size: {len(train_dataset)}, Valid size: {len(valid_dataset)}")

    # No further formatting needed as the 'text' column is pre-formatted

except Exception as e:
    print(f"ERROR: Failed to load or process dataset '{DATASET_NAME}'.")
    print(f"Please ensure '{DATASET_NAME}' exists and contains a '{DATASET_TEXT_FIELD}' column.")
    print(f"Error details: {e}")
    # Re-raise rather than calling exit(1): in a notebook, exit() acts on the
    # kernel/runtime (discarding the loaded model) instead of merely stopping
    # this cell, and it hides the traceback. Raising halts "Run All" here.
    raise

Loading and preparing dataset from CSV...
Loading dataset from: train_df_formatted.csv


Generating train split: 0 examples [00:00, ? examples/s]

Full dataset loaded. Size: 24184
Found text field 'text'.
Splitting dataset into train/validation (test_size=0.1)...
Dataset split. Train size: 21765, Valid size: 2419


In [None]:
# --- Step 2.2: Define Training Arguments ---
print("Defining Training Arguments...")

# Metrics go to W&B only when it is still enabled after the login cell.
wandb_report_target = "wandb" if USE_WANDB else "none"

training_args = TrainingArguments(
    # Where checkpoints are written, and whether they are pushed anywhere.
    output_dir=OUTPUT_DIR,
    push_to_hub=False,
    # Run length and batch sizing.
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # Optimizer and learning-rate schedule.
    optim=OPTIMIZER,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_steps=WARMUP_STEPS,
    # Step-based logging, evaluation and checkpointing.
    logging_steps=LOGGING_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    # Mixed precision flags computed in the configuration cell.
    fp16=FP16,
    bf16=BF16,
    # Reproducibility and reporting.
    seed=3407,
    report_to=wandb_report_target,
)
print('All set')

Defining Training Arguments...
All set


In [None]:
# --- Step 2.3: Initialize Trainer ---
# Builds the SFTTrainer from the LoRA-wrapped model, the tokenizer, the
# train/validation splits and the TrainingArguments defined earlier. The
# recorded output shows both splits being tokenized on the "text" field when
# the trainer is constructed.
# NOTE(review): whether `tokenizer`, `dataset_text_field` and `max_seq_length`
# are accepted directly by SFTTrainer depends on the installed TRL/Unsloth
# versions - confirm before upgrading either package.
print("Initializing SFTTrainer...")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset, # The held-out validation split
    dataset_text_field=DATASET_TEXT_FIELD, # Column holding the pre-formatted text
    max_seq_length=MAX_SEQ_LENGTH,
    args=training_args,
    # packing=True, # Consider packing=True for efficiency if dataset allows
    # No formatting_func needed
)
print("Trainer initialized.")


Initializing SFTTrainer...


Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/21765 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/2419 [00:00<?, ? examples/s]

Trainer initialized.


In [None]:
# --- Apply Response Masking (Train only on Assistant's Replies) ---
# The trainer is passed through Unsloth's train_on_responses_only together with
# the two Gemma turn markers below. The markers must appear verbatim in the
# pre-formatted 'text' column for this to work.
# NOTE(review): presumably only tokens after `response_part` contribute to the
# loss - confirm against the Unsloth documentation.
print("Applying response-only training mask...")
instruction_part = "<start_of_turn>user\n"   # Marker that opens a user turn
response_part = "<start_of_turn>model\n"     # Marker that opens a model turn
trainer = train_on_responses_only(
    trainer,
    instruction_part=instruction_part,
    response_part=response_part,
)
print("Response mask applied.")


Applying response-only training mask...


Map (num_proc=12):   0%|          | 0/21765 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/2419 [00:00<?, ? examples/s]

Response mask applied.


In [None]:
# --- Step 2.4: Execute Training ---
# Runs trainer.train(), then logs and saves the resulting train metrics.
# On failure the error is reported, an open W&B run is closed with a failing
# exit code, and the original exception is re-raised.
print("Starting training...")
try:
    train_result = trainer.train()
    print("Training finished.")

    # Log metrics if training completed successfully
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    print("Training metrics saved.")

except Exception as e:
    print("ERROR: Training failed.")
    print(f"Error details: {e}")
    if USE_WANDB and wandb.run is not None:
        wandb.finish(exit_code=1, quiet=True)
    # Re-raise rather than calling exit(1): in a notebook, exit() acts on the
    # kernel/runtime (losing the model and trainer state) and hides the
    # traceback. Raising stops execution here and keeps the session inspectable.
    raise


Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 21,765 | Num Epochs = 1 | Total steps = 340
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 4 x 1) = 64
 "-____-"     Trainable parameters = 4,696,064/4,000,000,000 (0.12% trained)


Step,Training Loss,Validation Loss
50,0.7326,0.743748
100,0.7403,0.71216
150,0.6924,0.694089
200,0.6934,0.680061
250,0.6898,0.670826
300,0.6674,0.665607


Training finished.
***** train metrics *****
  total_flos               = 137305773GF
  train_loss               =      0.6988
  train_runtime            =  0:28:16.11
  train_samples_per_second =      12.832
  train_steps_per_second   =         0.2
Training metrics saved.


In [None]:
# --- Step 2.5: Save Adapter Weights ---
FINAL_ADAPTER_DIR = os.path.join(OUTPUT_DIR, "final_lora_adapter")
print(f"Saving final LoRA adapter weights to: {FINAL_ADAPTER_DIR}")
# Write the model first and the tokenizer second, both into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(FINAL_ADAPTER_DIR)
print("Final LoRA adapter weights and tokenizer saved successfully.")

# --- Optional: Save the full model if needed ---
# print("Merging LoRA weights and saving full model (optional)...")
# merged_model_dir = os.path.join(OUTPUT_DIR, "final_merged_model")
# model.save_pretrained_merged(merged_model_dir, tokenizer, save_method="merged_16bit") # Or "merged_4bit"
# print(f"Full merged model saved to: {merged_model_dir}")

# --- Clean up ---
print("Fine-tuning process complete.")
# Close the W&B run only when logging is enabled and a run is actually open.
if USE_WANDB:
    if wandb.run is not None:
        wandb.finish()

Saving final LoRA adapter weights to: /content/drive/MyDrive/PROJECTS/Customer Feedback ML Engineer Project/Gemma 3 Parameters/unsloth_gemma3_finetuned_adapters_csv/final_lora_adapter
Final LoRA adapter weights and tokenizer saved successfully.
Fine-tuning process complete.


0,1
eval/loss,█▅▄▃▂▂▁▁▁▁
eval/runtime,████▁▁▁▁▁▁
eval/samples_per_second,▁▁▁▁██████
eval/steps_per_second,▇███▁▁▁▁▁▁
train/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇████
train/global_step,▁▁▂▂▂▃▃▄▄▄▄▅▅▅▁▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▆▆▆▆▇▇▇███
train/grad_norm,█▂▂▃▃▃▃▄▄▃▁▁▁▁▁▁▁▂▂▂▁▂▁▂▂▁▂▂▂▂▂▁▂▂▂▂▂▂▂▂
train/learning_rate,▂█████████████████████▂▄▇▆▆▆▆▆▅▅▅▄▃▃▂▂▁▁
train/loss,█▃▃▃▂▂▁▂▂▂▁▁▁▁▁▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.66561
eval/runtime,68.2282
eval/samples_per_second,35.455
eval/steps_per_second,2.228
total_flos,1.4743095164164915e+17
train/epoch,0.99927
train/global_step,340.0
train/grad_norm,0.42056
train/learning_rate,0.0
train/loss,0.6624
