# C++ Code Error Classifier - LLM Fine-Tuning with QLoRA

This notebook fine-tunes a Large Language Model using **QLoRA (4-bit quantization + LoRA)** to classify C++ code errors based on the difference between "Original" (buggy) and "Fixed" code.

**Environment:** Google Colab Free Tier (T4 GPU, ~15GB VRAM)

**Technique:** 
- 4-bit quantization with `bitsandbytes`
- LoRA adapters via `peft`
- `unsloth` for optimized training speed

**Task:** Given a code diff, predict the error category:
- Memory Management
- Invalid Access
- Uninitialized
- Concurrency
- Logic Error
- Resource Leak
- Security/Portability
- Code Quality/Performance

## 1. Setup & Installation

Install all required dependencies for Google Colab. This uses Unsloth's recommended installation procedure for maximum compatibility and speed.

In [None]:
%%capture
# Install Unsloth (optimized for Colab) - this handles all dependencies
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install additional required packages
# --no-deps installs only the packages named here without resolving their own
# dependencies; xformers and trl are capped below the versions given.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# Install visualization and ML utilities
# (used later for the train/test split, metrics and plots)
!pip install scikit-learn matplotlib seaborn

## 2. Import Libraries

In [None]:
import os
import json
import glob
import random
from typing import List, Dict, Tuple

import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer

# Seed Python's and PyTorch's random number generators for repeatable runs
random.seed(42)
torch.manual_seed(42)

# Report whether a CUDA device is visible and, if so, its name and memory size
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Total VRAM: {gpu_props.total_memory / 1e9:.2f} GB")

## 3. Configuration & Constants

Define the error categories and label mappings.

In [None]:
# -----------------------------------------------------------------------------
# NOTEBOOK SETTINGS - data location, label vocabulary, model and training
# hyperparameters. Adjust values here rather than in later cells.
# -----------------------------------------------------------------------------

# Directory holding the exported .json records (point this at your own data)
DATA_FOLDER = "/content/drive/MyDrive/fixmycodedb_export"

# snake_case group key -> display name used in prompts and reports.
# Insertion order is preserved, so VALID_LABELS below follows this order.
LABEL_MAPPING = dict(
    memory_management="Memory Management",
    invalid_access="Invalid Access",
    uninitialized="Uninitialized",
    concurrency="Concurrency",
    logic_error="Logic Error",
    resource_leak="Resource Leak",
    security_portability="Security/Portability",
    code_quality_performance="Code Quality/Performance",
)

# Display names the classifier may output
VALID_LABELS = [*LABEL_MAPPING.values()]

# Base model (pre-quantized 4-bit checkpoint) and context window.
# Alternative checkpoint: "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
MODEL_NAME = "unsloth/llama-3-8b-Instruct-bnb-4bit"
MAX_SEQ_LENGTH = 2048

# Optimizer schedule; effective batch = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
MAX_STEPS = 60  # raise or lower to match the dataset size
LEARNING_RATE = 2e-4

# LoRA adapter shape
LORA_R = 16
LORA_ALPHA = 16
LORA_DROPOUT = 0

print("Configuration loaded successfully!")
print(f"Target labels: {VALID_LABELS}")

## 4. Data Loading Function

Load all JSON files from the specified directory and extract the relevant fields.

In [None]:
def load_json_files(folder_path: str) -> List[Dict]:
    """
    Load all .json files from a directory and extract relevant fields.

    Files are processed in sorted path order so that the returned list (and any
    seeded split computed from it) is the same on every run and filesystem.
    Unreadable, undecodable or malformed files are reported and skipped rather
    than aborting the whole load.

    Args:
        folder_path: Path to the folder containing .json files

    Returns:
        List of dictionaries with code_original, code_fixed, labels_groups and
        file_name. Records without code_original or without a non-empty
        labels.groups mapping are omitted.
    """
    data = []
    # glob yields entries in arbitrary directory order; sort for reproducibility
    json_files = sorted(glob.glob(os.path.join(folder_path, "*.json")))

    print(f"Found {len(json_files)} JSON files in {folder_path}")

    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                record = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
            # UnicodeDecodeError is not a JSONDecodeError, so it is listed explicitly
            print(f"Error reading {file_path}: {e}")
            continue

        # A valid JSON file may still hold a list/number/string at top level
        if not isinstance(record, dict):
            print(f"Error reading {file_path}: top-level JSON value is not an object")
            continue

        # Extract required fields
        code_original = record.get("code_original", "")
        code_fixed = record.get("code_fixed", "")
        labels = record.get("labels")
        labels_groups = labels.get("groups") if isinstance(labels, dict) else None

        # Skip if essential fields are missing or have the wrong type
        if not code_original or not isinstance(code_original, str):
            continue
        if not labels_groups or not isinstance(labels_groups, dict):
            continue

        # Fall back to the original code when no usable fix is present
        if not code_fixed or not isinstance(code_fixed, str):
            code_fixed = code_original

        data.append({
            "code_original": code_original,
            "code_fixed": code_fixed,
            "labels_groups": labels_groups,
            "file_name": os.path.basename(file_path)
        })

    print(f"Successfully loaded {len(data)} records")
    return data


# Built-in fixture so the pipeline can be exercised without any files on disk
def create_sample_data() -> List[Dict]:
    """Return 100 synthetic records: five hand-written cases repeated 20 times."""
    # (buggy code, fixed code, label flags) - exactly one flag is True per case
    cases = (
        (
            "int* ptr = malloc(sizeof(int));\n*ptr = 5;\nreturn 0;",
            "int* ptr = malloc(sizeof(int));\n*ptr = 5;\nfree(ptr);\nreturn 0;",
            {"memory_management": False, "resource_leak": True, "logic_error": False},
        ),
        (
            "int arr[5];\nfor(int i=0; i<=5; i++) arr[i] = i;",
            "int arr[5];\nfor(int i=0; i<5; i++) arr[i] = i;",
            {"invalid_access": True, "logic_error": False, "memory_management": False},
        ),
        (
            "int x;\nprintf(\"%d\", x);",
            "int x = 0;\nprintf(\"%d\", x);",
            {"uninitialized": True, "logic_error": False},
        ),
        (
            "if (a = 5) { doSomething(); }",
            "if (a == 5) { doSomething(); }",
            {"logic_error": True, "memory_management": False},
        ),
        (
            "pthread_mutex_lock(&m);\nif(cond) return;\npthread_mutex_unlock(&m);",
            "pthread_mutex_lock(&m);\nif(cond) { pthread_mutex_unlock(&m); return; }\npthread_mutex_unlock(&m);",
            {"concurrency": True, "resource_leak": False},
        ),
    )
    base = [
        {"code_original": buggy, "code_fixed": fixed, "labels_groups": flags}
        for buggy, fixed, flags in cases
    ]
    # List repetition reuses the same five dict objects; enough for a smoke test
    return base * 20


# Load data - uncomment the appropriate line
# NOTE(review): the path below is hard-coded to './exported' and does not use
# DATA_FOLDER from the configuration cell - confirm which location is intended.
raw_data = load_json_files('./exported')  # Use this for real data
# raw_data = create_sample_data()  # Use this for testing

# Later cells index into the processed data, so an empty load will fail there
print(f"\nTotal samples: {len(raw_data)}")

## 5. Data Preprocessing & Label Extraction

Extract the active label from each record and convert it to a human-readable format.

In [None]:
def extract_active_label(labels_groups: Dict) -> str:
    """
    Pick the display name of the first flagged label group.

    Entries are scanned in the mapping's own iteration order; the first key
    whose value is exactly True and that is known to LABEL_MAPPING wins.

    Args:
        labels_groups: Dictionary of label_name -> boolean

    Returns:
        Human-readable label string or "Unknown" if none found
    """
    flagged = (
        LABEL_MAPPING[name]
        for name, flag in labels_groups.items()
        if flag is True and name in LABEL_MAPPING
    )
    return next(flagged, "Unknown")


def preprocess_data(raw_data: List[Dict]) -> List[Dict]:
    """
    Reduce raw records to (code_original, code_fixed, label) triples.

    Records whose label resolves to "Unknown" are dropped. The per-label
    counts of the kept records are printed in alphabetical label order.

    Args:
        raw_data: List of raw records from JSON files

    Returns:
        List of processed records with extracted labels
    """
    kept = []
    tally = {}

    for entry in raw_data:
        readable = extract_active_label(entry["labels_groups"])
        if readable != "Unknown":
            kept.append({
                "code_original": entry["code_original"],
                "code_fixed": entry["code_fixed"],
                "label": readable
            })
            tally[readable] = tally.get(readable, 0) + 1

    print("\nLabel Distribution:")
    for name in sorted(tally):
        print(f"  {name}: {tally[name]}")

    return kept


# Preprocess the data: each raw record is reduced to its code pair plus one
# label; records whose label resolves to "Unknown" are dropped.
processed_data = preprocess_data(raw_data)
print(f"\nProcessed samples: {len(processed_data)}")

## 6. Prompt Formatting (Alpaca Style)

Format the data into instruction-response pairs suitable for supervised fine-tuning.

In [None]:
# Alpaca-style prompt template (instruction / input / response sections).
# The instruction lists every category with a one-line description; it does not
# contain worked examples. Placeholders filled via str.format:
#   {code_original} - buggy code, {code_fixed} - corrected code,
#   {label} - category name (empty string when building an inference prompt).
ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a C++ code error classifier. Analyze the difference between the original (buggy) code and the fixed code, then classify the error into exactly ONE of these categories:
- Memory Management: Issues with malloc/free, new/delete, memory allocation
- Invalid Access: Array out of bounds, null pointer dereference, invalid memory access
- Uninitialized: Using uninitialized variables
- Concurrency: Race conditions, deadlocks, thread safety issues
- Logic Error: Incorrect conditions, off-by-one errors, wrong operators
- Resource Leak: Unclosed files, sockets, unreleased resources
- Security/Portability: Security vulnerabilities, platform-specific issues
- Code Quality/Performance: Style issues, inefficient code

Respond with ONLY the category name, nothing else.

### Input:
Original Code:
```cpp
{code_original}
```

Fixed Code:
```cpp
{code_fixed}
```

### Response:
{label}"""


def format_prompt(sample: Dict, include_response: bool = True) -> str:
    """
    Render one sample through ALPACA_PROMPT_TEMPLATE.

    Both code snippets are stripped of surrounding whitespace before being
    inserted. The label is only read from the sample when it is requested.

    Args:
        sample: Dictionary with code_original, code_fixed, and label
        include_response: Whether to include the label (for training) or not (for inference)

    Returns:
        Formatted prompt string
    """
    buggy_src = sample["code_original"].strip()
    fixed_src = sample["code_fixed"].strip()

    if include_response:
        answer = sample["label"]
    else:
        answer = ""

    return ALPACA_PROMPT_TEMPLATE.format(
        code_original=buggy_src,
        code_fixed=fixed_src,
        label=answer,
    )


def format_for_inference(sample: Dict) -> str:
    """Build the prompt without a label, with trailing whitespace removed."""
    unanswered = format_prompt(sample, include_response=False)
    return unanswered.rstrip()


# Preview: render the first processed sample and show at most 1500 characters
print("Example formatted prompt:")
print("=" * 80)
example_prompt = format_prompt(processed_data[0])
if len(example_prompt) > 1500:
    print(example_prompt[:1500] + "...")
else:
    print(example_prompt)

## 7. Train/Test Split

Split the data with stratification to ensure balanced label representation.

In [None]:
# Labels drive the stratified split so each class keeps its share in both sets
labels = [sample["label"] for sample in processed_data]

# Split 80% train, 20% test with stratification (seeded for repeatability).
# NOTE: stratification needs at least two samples of every class; scikit-learn
# raises ValueError otherwise.
train_data, test_data = train_test_split(
    processed_data,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

print(f"Training samples: {len(train_data)}")
print(f"Testing samples: {len(test_data)}")

# Verify stratification
train_labels = [s["label"] for s in train_data]
test_labels = [s["label"] for s in test_data]

# Labels are sorted before printing: iterating a bare set gives a different
# row order from run to run, which makes the two tables hard to compare.
print("\nTraining set distribution:")
for label in sorted(set(train_labels)):
    print(f"  {label}: {train_labels.count(label)}")

print("\nTest set distribution:")
for label in sorted(set(test_labels)):
    print(f"  {label}: {test_labels.count(label)}")

## 8. Load Base Model with 4-bit Quantization

Load the pre-quantized model using Unsloth for optimized memory usage and speed.

In [None]:
# Fetch the pre-quantized checkpoint together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # let the loader choose the compute dtype
    load_in_4bit=True,  # 4-bit weights (the "Q" in QLoRA)
)

print(f"Model loaded: {MODEL_NAME}")
print(f"Max sequence length: {MAX_SEQ_LENGTH}")

# Snapshot of GPU memory right after loading
if torch.cuda.is_available():
    allocated_gb = torch.cuda.memory_allocated() / 1e9
    reserved_gb = torch.cuda.memory_reserved() / 1e9
    print(f"GPU memory allocated: {allocated_gb:.2f} GB")
    print(f"GPU memory reserved: {reserved_gb:.2f} GB")

## 9. Configure LoRA Adapters

Apply LoRA configuration to enable efficient fine-tuning with minimal trainable parameters.

In [None]:
# Wrap the base model with LoRA adapters on the attention and MLP projections
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,  # adapter rank
    lora_alpha=LORA_ALPHA,  # adapter scaling factor
    lora_dropout=LORA_DROPOUT,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",  # MLP projections
    ],
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Count parameters in a single pass over the model.
# NOTE(review): with 4-bit weights numel() may count packed storage elements,
# so the percentage is presumably only indicative - confirm before quoting it.
trainable_params = 0
total_params = 0
for param in model.parameters():
    size = param.numel()
    total_params += size
    if param.requires_grad:
        trainable_params += size

print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
print(f"Total parameters: {total_params:,}")

## 10. Prepare Dataset for Training

Convert the formatted training data into a Hugging Face Dataset object.

In [None]:
# Format training data.
# Every example ends with the tokenizer's end-of-sequence token so the model
# learns to stop right after the category name. Without it the fine-tuned model
# has no stop signal and keeps generating until max_new_tokens is exhausted.
eos_token = tokenizer.eos_token or ""
formatted_train_data = [
    {"text": format_prompt(sample) + eos_token} for sample in train_data
]

# Create Hugging Face Dataset with a single "text" column
train_dataset = Dataset.from_list(formatted_train_data)

print(f"Training dataset size: {len(train_dataset)}")
print("\nSample formatted text (first 500 chars):")
print(train_dataset[0]["text"][:500] + "...")

## 10.5 Baseline Evaluation (Before Fine-Tuning)

Run inference on the test set using the base model BEFORE training to establish a baseline for comparison.

In [None]:
# Put the model into Unsloth's inference mode before the baseline pass
FastLanguageModel.for_inference(model)

def generate_prediction_baseline(sample: Dict) -> str:
    """
    Greedy-decode the model's answer for one sample (used before fine-tuning).

    Args:
        sample: Dictionary with code_original and code_fixed

    Returns:
        The newly generated text (prompt excluded), whitespace-stripped
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Prompt without the response; capped so 50 new tokens still fit
    encoded = tokenizer(
        format_for_inference(sample),
        return_tensors="pt",
        truncation=True,
        max_length=MAX_SEQ_LENGTH - 50
    ).to(device)
    prompt_len = encoded["input_ids"].shape[1]

    with torch.no_grad():
        generated_ids = model.generate(
            **encoded,
            max_new_tokens=50,
            use_cache=True,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the echoed prompt tokens and keep only the continuation
    completion = tokenizer.decode(generated_ids[0][prompt_len:], skip_special_tokens=True)
    return completion.strip()


# parse_prediction is needed here, but the notebook only defines it in
# section 14, which runs later. It is defined in this cell (same logic) so the
# notebook works when executed top to bottom; section 14 redefines it unchanged.
def parse_prediction(raw_prediction: str) -> str:
    """
    Parse and normalize the model's prediction to match valid labels.

    Args:
        raw_prediction: Raw model output string

    Returns:
        Normalized label or "Unknown" if not recognized
    """
    # Only the first line counts; surrounding punctuation is ignored
    pred = raw_prediction.strip().split("\n")[0]
    pred = pred.strip(".,!?;:")

    # Try exact match first
    for label in VALID_LABELS:
        if label.lower() == pred.lower():
            return label

    # Try partial match (label contained in prediction)
    for label in VALID_LABELS:
        if label.lower() in pred.lower():
            return label

    # Try matching key words (checked in this order)
    keyword_map = {
        "memory": "Memory Management",
        "invalid": "Invalid Access",
        "access": "Invalid Access",
        "uninitialized": "Uninitialized",
        "uninit": "Uninitialized",
        "concurrent": "Concurrency",
        "race": "Concurrency",
        "thread": "Concurrency",
        "logic": "Logic Error",
        "resource": "Resource Leak",
        "leak": "Resource Leak",
        "security": "Security/Portability",
        "portability": "Security/Portability",
        "quality": "Code Quality/Performance",
        "performance": "Code Quality/Performance",
    }

    pred_lower = pred.lower()
    for keyword, label in keyword_map.items():
        if keyword in pred_lower:
            return label

    return "Unknown"


# Run baseline inference on test set
print("Running BASELINE inference (before fine-tuning)...")
print("=" * 50)

baseline_predictions = []
baseline_true_labels = []

for i, sample in enumerate(test_data):
    pred = generate_prediction_baseline(sample)
    baseline_predictions.append(pred)
    baseline_true_labels.append(sample["label"])

    if (i + 1) % 5 == 0:
        print(f"Processed {i + 1}/{len(test_data)} samples...")

# Parse baseline predictions
baseline_parsed = [parse_prediction(p) for p in baseline_predictions]

# Calculate baseline accuracy (0.0 for an empty test set instead of dividing by zero)
baseline_correct = sum(1 for pred, true in zip(baseline_parsed, baseline_true_labels) if pred == true)
baseline_total = len(baseline_true_labels)
baseline_accuracy = (baseline_correct / baseline_total) * 100 if baseline_total else 0.0

print(f"\n{'=' * 50}")
print(f"BASELINE ACCURACY (Before Fine-Tuning): {baseline_accuracy:.2f}%")
print(f"Correct: {baseline_correct} / {baseline_total}")
print(f"{'=' * 50}")

# Show some baseline predictions
print("\nSample baseline predictions:")
for i in range(min(5, len(test_data))):
    print(f"True: {baseline_true_labels[i]:<25} | Pred: {baseline_parsed[i]}")

# Switch back to training mode for fine-tuning.
# for_inference() reconfigures the model for fast generation; Unsloth's
# for_training() is the matching call that undoes it, which a plain
# model.train() does not do.
FastLanguageModel.for_training(model)
print("\nModel set back to training mode. Proceeding to fine-tuning...")

## 11. Configure SFTTrainer

Set up the Supervised Fine-tuning Trainer with hyperparameters optimized for Colab Free Tier.

In [None]:
# Mixed precision: prefer bf16 where the GPU supports it, otherwise fp16
use_bf16 = torch.cuda.is_bf16_supported()

# Hyperparameters for the run
training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=5,
    max_steps=MAX_STEPS,  # alternatively use num_train_epochs=1 for a full epoch
    learning_rate=LEARNING_RATE,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    report_to="none",  # no external experiment tracker
)

# Supervised fine-tuning on the "text" column, one example per sequence
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

print("SFTTrainer configured successfully!")
print(f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"Max training steps: {MAX_STEPS}")

## 12. Run Training

Execute the fine-tuning process and monitor memory usage.

In [None]:
# Reset the peak-memory counter so the figure below reflects training only
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    start_memory = torch.cuda.memory_allocated()

print("Starting training...")
print("=" * 50)

# Run the fine-tuning loop; the returned object carries runtime metrics
trainer_stats = trainer.train()

# The summary (including training time) is only printed on a CUDA machine
if torch.cuda.is_available():
    peak_memory = torch.cuda.max_memory_allocated() / 1e9
    end_memory = torch.cuda.memory_allocated() / 1e9
    train_seconds = trainer_stats.metrics['train_runtime']
    print("=" * 50)
    print("Training completed!")
    print(f"Peak GPU memory used: {peak_memory:.2f} GB")
    print(f"Current GPU memory: {end_memory:.2f} GB")
    print(f"Training time: {train_seconds:.2f} seconds")

## 13. Inference on Test Set

Run predictions on the test set using the fine-tuned model.

In [None]:
# Set model to inference mode
FastLanguageModel.for_inference(model)

def generate_prediction(sample: Dict) -> str:
    """
    Generate a prediction for a single sample.

    Uses greedy decoding and returns only the text produced after the prompt.

    Args:
        sample: Dictionary with code_original and code_fixed

    Returns:
        Model's predicted label text (raw, whitespace-stripped)
    """
    # Format the prompt (without response)
    prompt = format_for_inference(sample)

    # Same device selection as generate_prediction_baseline, so both
    # evaluation passes run under identical conditions and the cell does not
    # fail on a machine without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Tokenize.
    # NOTE(review): prompts longer than the limit are truncated; presumably
    # from the right, which would cut off the "### Response:" marker - confirm
    # the tokenizer's truncation side.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_SEQ_LENGTH - 50  # Leave room for generation
    ).to(device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            use_cache=True,
            do_sample=False,  # Greedy decoding for classification
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the new tokens
    prompt_len = inputs["input_ids"].shape[1]
    generated = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return generated.strip()


# Predict every held-out sample, collecting raw outputs and ground truth
print("Running inference on test set...")
print("=" * 50)

predictions, true_labels = [], []

for done, sample in enumerate(test_data, start=1):
    predictions.append(generate_prediction(sample))
    true_labels.append(sample["label"])

    # Progress line after every fifth sample
    if done % 5 == 0:
        print(f"Processed {done}/{len(test_data)} samples...")

print(f"\nInference complete! Processed {len(test_data)} samples.")

## 14. Parse Predictions & Calculate Accuracy

Extract and normalize predictions, then calculate classification metrics.

In [None]:
def parse_prediction(raw_prediction: str) -> str:
    """
    Map free-form model output onto one of VALID_LABELS.

    Only the first line of the output is considered, with surrounding
    punctuation removed. Matching is case-insensitive and tried in three
    stages: exact label, label contained in the text, then keyword lookup.

    Args:
        raw_prediction: Raw model output string

    Returns:
        Normalized label or "Unknown" if not recognized
    """
    first_line = raw_prediction.strip().split("\n")[0]
    candidate = first_line.strip(".,!?;:").lower()

    # Stage 1: the text is exactly a label
    exact = next((name for name in VALID_LABELS if name.lower() == candidate), None)
    if exact is not None:
        return exact

    # Stage 2: a label appears somewhere inside the text
    embedded = next((name for name in VALID_LABELS if name.lower() in candidate), None)
    if embedded is not None:
        return embedded

    # Stage 3: keyword hints, checked in the order listed
    keyword_hints = (
        ("memory", "Memory Management"),
        ("invalid", "Invalid Access"),
        ("access", "Invalid Access"),
        ("uninitialized", "Uninitialized"),
        ("uninit", "Uninitialized"),
        ("concurrent", "Concurrency"),
        ("race", "Concurrency"),
        ("thread", "Concurrency"),
        ("logic", "Logic Error"),
        ("resource", "Resource Leak"),
        ("leak", "Resource Leak"),
        ("security", "Security/Portability"),
        ("portability", "Security/Portability"),
        ("quality", "Code Quality/Performance"),
        ("performance", "Code Quality/Performance"),
    )
    for hint, mapped in keyword_hints:
        if hint in candidate:
            return mapped

    return "Unknown"


# Normalize every raw model output to a label
parsed_predictions = list(map(parse_prediction, predictions))

# Preview the first few rows: ground truth, raw output (30 chars), parsed label
print("Sample predictions:")
print("=" * 80)
preview_rows = list(zip(true_labels, predictions, parsed_predictions))[:5]
for truth, raw_text, parsed in preview_rows:
    print(f"True: {truth:<25} | Raw: {raw_text[:30]:<30} | Parsed: {parsed}")
print("=" * 80)

# Share of samples whose parsed label equals the ground truth
correct = sum(guess == truth for guess, truth in zip(parsed_predictions, true_labels))
total = len(true_labels)
accuracy = correct / total * 100

banner = "=" * 50
print(f"\n{banner}")
print(f"CLASSIFICATION ACCURACY: {accuracy:.2f}%")
print(f"Correct: {correct} / {total}")
print(banner)

## 15. Generate Confusion Matrix Visualization

Create visualizations to understand model performance across categories.

In [None]:
# Axis labels: every class seen as ground truth or as a parsed prediction
all_labels = sorted(set(true_labels) | set(parsed_predictions))

# Rows are true labels, columns are predicted labels
cm = confusion_matrix(true_labels, parsed_predictions, labels=all_labels)

# One figure, two panels side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax1, ax2 = axes

# Panel 1: confusion matrix heatmap
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=all_labels,
    yticklabels=all_labels,
    ax=ax1
)
ax1.set_xlabel('Predicted Label', fontsize=12)
ax1.set_ylabel('True Label', fontsize=12)
ax1.set_title(f'Confusion Matrix\n(Accuracy: {accuracy:.2f}%)', fontsize=14)
plt.setp(ax1.get_xticklabels(), rotation=45, ha='right', fontsize=9)
plt.setp(ax1.get_yticklabels(), rotation=0, fontsize=9)

# Panel 2: how often each class occurs in the truth vs. in the predictions
true_counts = {name: true_labels.count(name) for name in all_labels}
pred_counts = {name: parsed_predictions.count(name) for name in all_labels}

x = range(len(all_labels))
width = 0.35
left_edges = [pos - width/2 for pos in x]
right_edges = [pos + width/2 for pos in x]

bars1 = ax2.bar(left_edges, [true_counts[name] for name in all_labels],
                width, label='Actual', color='steelblue', alpha=0.8)
bars2 = ax2.bar(right_edges, [pred_counts[name] for name in all_labels],
                width, label='Predicted', color='coral', alpha=0.8)

ax2.set_xlabel('Error Category', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)
ax2.set_title('Actual vs Predicted Distribution', fontsize=14)
ax2.set_xticks(x)
ax2.set_xticklabels(all_labels, rotation=45, ha='right', fontsize=9)
ax2.legend()

# Write the count above every bar (actual bars first, then predicted)
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    ax2.annotate(f'{int(height)}',
                 xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.savefig('classification_results.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nVisualization saved to 'classification_results.png'")

## 16. Display Results Summary

Final summary with classification report and memory statistics.

In [None]:
heavy_rule = "=" * 60
light_rule = "-" * 60

# Per-class precision / recall / F1 over the same label set as the plots
print(heavy_rule)
print("CLASSIFICATION REPORT")
print(heavy_rule)
print(classification_report(true_labels, parsed_predictions, labels=all_labels, zero_division=0))

# Run configuration and headline numbers
print(heavy_rule)
print("FINAL SUMMARY")
print(heavy_rule)
print(f"Model: {MODEL_NAME}")
print(f"LoRA rank: {LORA_R}, alpha: {LORA_ALPHA}")
print(f"Training steps: {MAX_STEPS}")
print(f"Batch size (effective): {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"Learning rate: {LEARNING_RATE}")
print(light_rule)
print(f"Test set size: {len(test_data)}")
print(f"Accuracy: {accuracy:.2f}%")
print(f"Correct predictions: {correct}/{total}")

# GPU memory figures, only when a CUDA device is present
if torch.cuda.is_available():
    peak_gb = torch.cuda.max_memory_allocated() / 1e9
    current_gb = torch.cuda.memory_allocated() / 1e9
    print(light_rule)
    print("GPU Memory Usage:")
    print(f"  Peak memory: {peak_gb:.2f} GB")
    print(f"  Current memory: {current_gb:.2f} GB")
    print(f"  GPU: {torch.cuda.get_device_name(0)}")

print(heavy_rule)

## 16.5 Baseline vs Fine-Tuned Comparison

Compare the performance of the base model (before training) with the fine-tuned model to evaluate the effectiveness of the training data and LoRA fine-tuning.

In [None]:
# -----------------------------------------------------------------------------
# Baseline vs fine-tuned: numeric comparison and a plain-language verdict
# -----------------------------------------------------------------------------

wide_rule = "=" * 70

print(wide_rule)
print("BASELINE VS FINE-TUNED MODEL COMPARISON")
print(wide_rule)

# Absolute gain in percentage points, and gain relative to the baseline
# (the baseline is floored at 0.01 to avoid dividing by zero)
improvement = accuracy - baseline_accuracy
relative_improvement = (improvement / max(baseline_accuracy, 0.01)) * 100

correct_delta = correct - baseline_correct
acc_sign = '+' if improvement >= 0 else ''
count_sign = '+' if correct_delta >= 0 else ''

print(f"\n{'Metric':<30} {'Baseline':<15} {'Fine-Tuned':<15} {'Change':<15}")
print("-" * 70)
print(f"{'Accuracy (%)':<30} {baseline_accuracy:<15.2f} {accuracy:<15.2f} {acc_sign}{improvement:.2f}")
print(f"{'Correct Predictions':<30} {baseline_correct:<15} {correct:<15} {count_sign}{correct_delta}")
print(f"{'Total Samples':<30} {baseline_total:<15} {total:<15} {'N/A':<15}")

# Verdict: more than 5 points = effective, any gain = marginal,
# no change = neutral, otherwise degraded
if improvement > 5:
    verdict_lines = [
        "✅ FINE-TUNING EFFECTIVE: Significant accuracy improvement!",
        f"   The model improved by {improvement:.2f} percentage points ({relative_improvement:.1f}% relative).",
        "   The training dataset appears to be effective for this task.",
    ]
elif improvement > 0:
    verdict_lines = [
        "⚠️ MARGINAL IMPROVEMENT: Fine-tuning showed slight gains.",
        f"   The model improved by {improvement:.2f} percentage points.",
        "   Consider: More training data, more training steps, or hyperparameter tuning.",
    ]
elif improvement == 0:
    verdict_lines = [
        "⚪ NO CHANGE: Fine-tuning did not affect accuracy.",
        "   Consider: Different learning rate, more diverse training data, or longer training.",
    ]
else:
    verdict_lines = [
        "❌ PERFORMANCE DEGRADATION: Fine-tuning reduced accuracy!",
        f"   The model degraded by {abs(improvement):.2f} percentage points.",
        "   Possible causes: Overfitting, poor data quality, or suboptimal hyperparameters.",
    ]

print("\n" + wide_rule)
for verdict_line in verdict_lines:
    print(verdict_line)
print(wide_rule)

# Create comparison visualization: overall accuracy, per-category accuracy,
# and a text panel summarising the run
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Plot 1: Accuracy comparison bar chart
ax1 = axes[0]
models = ['Baseline\n(No Fine-Tuning)', 'Fine-Tuned\n(LoRA)']
accuracies = [baseline_accuracy, accuracy]
# Coral/green when fine-tuning helped, neutral blue for both bars otherwise
colors = ['lightcoral' if baseline_accuracy < accuracy else 'steelblue',
          'forestgreen' if accuracy > baseline_accuracy else 'steelblue']
bars = ax1.bar(models, accuracies, color=colors, edgecolor='black', linewidth=1.5)
ax1.set_ylabel('Accuracy (%)', fontsize=12)
ax1.set_title('Model Accuracy Comparison', fontsize=14)
ax1.set_ylim(0, 100)
for bar, acc in zip(bars, accuracies):
    ax1.annotate(f'{acc:.1f}%',
                 xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                 xytext=(0, 5), textcoords="offset points",
                 ha='center', va='bottom', fontsize=14, fontweight='bold')

# Add improvement arrow from the baseline bar top to the fine-tuned bar top
if improvement != 0:
    trend_color = 'green' if improvement > 0 else 'red'
    ax1.annotate('', xy=(1, accuracy), xytext=(0, baseline_accuracy),
                 arrowprops=dict(arrowstyle='->', color=trend_color, lw=2))
    ax1.text(0.5, (baseline_accuracy + accuracy) / 2,
             f'{improvement:+.1f}%',  # explicit sign for gains and losses
             ha='center', va='center', fontsize=12, fontweight='bold',
             color=trend_color,
             bbox=dict(boxstyle='round', facecolor='white', edgecolor='gray'))

# Plot 2: Per-category comparison
ax2 = axes[1]
categories = sorted(set(baseline_true_labels))


def _per_category_accuracy(preds, truths, cats):
    """Return {category: % of its samples predicted correctly} in one pass."""
    seen = dict.fromkeys(cats, 0)
    hits = dict.fromkeys(cats, 0)
    for guess, truth in zip(preds, truths):
        if truth in seen:
            seen[truth] += 1
            if guess == truth:
                hits[truth] += 1
    # max(..., 1) keeps categories with no samples at 0% instead of dividing by zero
    return {cat: hits[cat] / max(seen[cat], 1) * 100 for cat in cats}


baseline_per_cat = _per_category_accuracy(baseline_parsed, baseline_true_labels, categories)
finetuned_per_cat = _per_category_accuracy(parsed_predictions, true_labels, categories)

x = range(len(categories))
width = 0.35
bars1 = ax2.bar([i - width/2 for i in x], [baseline_per_cat.get(c, 0) for c in categories],
                width, label='Baseline', color='lightcoral', alpha=0.8)
bars2 = ax2.bar([i + width/2 for i in x], [finetuned_per_cat.get(c, 0) for c in categories],
                width, label='Fine-Tuned', color='forestgreen', alpha=0.8)
ax2.set_ylabel('Accuracy (%)', fontsize=12)
ax2.set_title('Per-Category Accuracy Comparison', fontsize=14)
ax2.set_xticks(x)
ax2.set_xticklabels(categories, rotation=45, ha='right', fontsize=9)
ax2.legend()
ax2.set_ylim(0, 110)

# Plot 3: Text summary panel (no axes drawn)
ax3 = axes[2]
ax3.axis('off')

# Create text summary. The improvement uses a signed, right-aligned field so
# the sign sits next to the number ("  +5.00%") and lines up with the rows above.
summary_text = f"""
TRAINING EFFECTIVENESS SUMMARY
{'=' * 40}

Base Model: {MODEL_NAME.split('/')[-1]}
Training Steps: {MAX_STEPS}
LoRA Rank: {LORA_R}
Learning Rate: {LEARNING_RATE}

{'=' * 40}
RESULTS
{'=' * 40}

Baseline Accuracy:  {baseline_accuracy:>6.2f}%
Fine-Tuned Accuracy:{accuracy:>6.2f}%
                    ────────
Improvement:       {improvement:>+7.2f}%

{'=' * 40}
VERDICT: {'✅ EFFECTIVE' if improvement > 5 else '⚠️ MARGINAL' if improvement > 0 else '❌ INEFFECTIVE'}
{'=' * 40}

Recommendations:
{'• Training data is effective!' if improvement > 5 else '• Consider more training data' if improvement > 0 else '• Review data quality'}
{'• Model learned task-specific patterns' if improvement > 5 else '• Try more training steps' if improvement >= 0 else '• Check for overfitting'}
{'• Ready for deployment' if improvement > 10 else '• May need hyperparameter tuning' if improvement > 0 else '• Reconsider approach'}
"""

ax3.text(0.1, 0.95, summary_text, transform=ax3.transAxes, fontsize=10,
         verticalalignment='top', fontfamily='monospace',
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.savefig('baseline_vs_finetuned_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nComparison visualization saved to 'baseline_vs_finetuned_comparison.png'")

## 17. Save the Fine-tuned Model (Optional)

Save the LoRA adapters for future use.

In [None]:
# Write the adapter weights and the tokenizer files into one directory
adapter_dir = "cpp_error_classifier_lora"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)
print("LoRA adapters saved to 'cpp_error_classifier_lora/'")

# Optional: Save to Google Drive (uncomment if needed)
# from google.colab import drive
# drive.mount('/content/drive')
# !cp -r cpp_error_classifier_lora /content/drive/MyDrive/

# Optional: Merge and save full model (requires more storage)
# model.save_pretrained_merged("cpp_error_classifier_merged", tokenizer, save_method="merged_16bit")

# Print a copy-pasteable snippet for reloading the adapters later
reload_hint = (
    "\nTo load the saved model later:",
    "```python",
    "from unsloth import FastLanguageModel",
    "model, tokenizer = FastLanguageModel.from_pretrained('cpp_error_classifier_lora')",
    "```",
)
for hint_line in reload_hint:
    print(hint_line)