In [None]:
# Adjusted Fine-tuning Code for Numeric Classification
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Improved Fine-tuning Code for NACC Complaint Classification

# Install required packages
import subprocess
import sys

def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install required packages
print("Installing required packages...")
try:
    install_package("bitsandbytes")
    install_package("peft")
    install_package("accelerate")
    install_package("transformers>=4.35.0")
    install_package("datasets")
    install_package("scikit-learn")
    install_package("matplotlib")
    install_package("seaborn")
    print("All packages installed successfully!")
except Exception as e:
    print(f"Error installing packages: {e}")

import os
import json
import pandas as pd
import numpy as np
import torch
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Paths
DATASET_PATH = "/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/data/Trainset.csv"
OUTPUT_DIR = "/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/finetune_new"

# Model configuration
MODEL_ID = "scb10x/llama3.2-typhoon2-3b-instruct"
MAX_LENGTH = 512  # Shorter sequences for efficiency
BATCH_SIZE = 8    # Larger batch size
GRADIENT_ACCUMULATION = 2
NUM_EPOCHS = 5    # More epochs for gradual learning
LEARNING_RATE = 1e-5  # Even lower learning rate
LORA_R = 8        # Smaller rank to reduce overfitting
LORA_ALPHA = 16   # Adjusted alpha
LORA_DROPOUT = 0.2  # Higher dropout
WEIGHT_DECAY = 0.1  # Strong regularization

# Data augmentation parameters
AUG_RATIO = 0.3   # 30% of data will be augmented
USE_AUGMENTATION = True
USE_CROSS_VALIDATION = True
CV_FOLDS = 5

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("=" * 80)
print("IMPROVED NACC COMPLAINT CLASSIFICATION FINE-TUNING")
print("=" * 80)

# =============================================================================
# 1. CATEGORY MAPPING
# =============================================================================

print("\n1. Setting up category mapping...")

CATEGORY_MAPPING = {
    "ปฏิบัติหรือละเว้นการปฏิบัติหน้าที่โดยมิชอบ": 0,
    "ทุจริตในการจัดทำงบประมาณ/โครงการ/เบิกจ่ายเงินในโครงการเป็นเท็จ": 1,
    "จัดซื้อจัดจ้าง": 2,
    "ออกเอกสารสิทธิที่ดิน": 3,
    "ยักยอก/เบียดบังเงินหรือทรัพย์สินของราชการ": 4,
    "การบริหารงานบุคคล (การบรรจุ/แต่งตั้ง/เลื่อนตำแหน่ง/โยกย้าย/ลงโทษวินัย)": 5,
    "ร่ำรวยผิดปกติ": 6,
    "เรียกรับสินบน": 7,
    "การขัดกันระหว่างประโยชน์ส่วนบุคคลกับประโยชน์ส่วนรวม": 8,
    "ก่าเกื้นจริยธรรม": 9
}

NUMERIC_TO_CATEGORY = {v: k for k, v in CATEGORY_MAPPING.items()}

ENGLISH_NAMES = {
    0: "Abuse of Power",
    1: "Budget/Project Fraud",
    2: "Procurement Fraud",
    3: "Fraudulent Land Title",
    4: "Embezzlement",
    5: "Personnel Misconduct",
    6: "Unusual Wealth",
    7: "Bribery",
    8: "Conflict of Interest",
    9: "Ethical Misconduct"
}

# Save mapping
mapping_file = os.path.join(OUTPUT_DIR, "category_mapping.json")
with open(mapping_file, 'w', encoding='utf-8') as f:
    json.dump({
        "thai_to_numeric": CATEGORY_MAPPING,
        "numeric_to_thai": NUMERIC_TO_CATEGORY,
        "english_names": ENGLISH_NAMES
    }, f, ensure_ascii=False, indent=2)

print(f"Category mapping saved to: {mapping_file}")

# =============================================================================
# 2. DATA LOADING AND PREPROCESSING
# =============================================================================

print("\n2. Loading and preprocessing data...")

# Load data
try:
    df = pd.read_csv(DATASET_PATH, encoding='utf-8')
    print(f"Loaded dataset with {len(df)} samples")
    print(f"Columns: {list(df.columns)}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Check data structure
print("\nData structure:")
print(df.head())
print(f"\nDataset shape: {df.shape}")

# Map categories to numeric
df['category_numeric'] = df['category'].map(CATEGORY_MAPPING)

# Check for unmapped categories
unmapped = df[df['category_numeric'].isna()]
if len(unmapped) > 0:
    print(f"Warning: {len(unmapped)} unmapped categories found:")
    print(unmapped['category'].unique())
    df = df.dropna(subset=['category_numeric'])

# Convert to int
df['category_numeric'] = df['category_numeric'].astype(int)

print(f"Final dataset size: {len(df)}")

# Check class distribution
class_dist = df['category_numeric'].value_counts().sort_index()
print("\nClass distribution:")
for cat_id, count in class_dist.items():
    english_name = ENGLISH_NAMES[cat_id]
    percentage = (count / len(df)) * 100
    print(f"{cat_id}: {english_name} - {count} samples ({percentage:.1f}%)")

# =============================================================================
# 3. DATA AUGMENTATION FOR SMALL DATASET
# =============================================================================

print("\n3. Implementing data augmentation...")

def simple_back_translation_augment(text):
    """Simple augmentation through word shuffling and synonym replacement"""
    import random

    # Simple word shuffling (keeping sentence structure)
    sentences = text.split('.')
    augmented_sentences = []

    for sentence in sentences:
        words = sentence.strip().split()
        if len(words) > 3:
            # Shuffle middle words occasionally
            if random.random() < 0.3:
                middle = words[1:-1]
                random.shuffle(middle)
                words = [words[0]] + middle + [words[-1]]
        augmented_sentences.append(' '.join(words))

    return '.'.join(augmented_sentences)

def character_level_augment(text):
    """Character-level augmentation for Thai text"""
    import random

    chars = list(text)
    if len(chars) > 10:
        # Random character swapping (very conservative)
        if random.random() < 0.1:
            idx1, idx2 = random.sample(range(1, len(chars)-1), 2)
            chars[idx1], chars[idx2] = chars[idx2], chars[idx1]

    return ''.join(chars)

def augment_text(text, method='shuffle'):
    """Apply text augmentation"""
    if method == 'shuffle':
        return simple_back_translation_augment(text)
    elif method == 'char':
        return character_level_augment(text)
    else:
        return text

def create_augmented_data(X, y, augmentation_ratio=0.3):
    """Create augmented dataset"""
    print(f"Creating augmented data with ratio: {augmentation_ratio}")

    # Calculate samples to augment per class
    unique_classes, class_counts = np.unique(y, return_counts=True)
    min_samples = min(class_counts)

    X_aug = []
    y_aug = []

    for class_id in unique_classes:
        class_indices = np.where(y == class_id)[0]
        class_texts = X[class_indices]

        # Calculate how many samples to augment for this class
        current_count = len(class_indices)
        target_augment = int(current_count * augmentation_ratio)

        # Randomly select samples to augment
        aug_indices = np.random.choice(len(class_texts), size=target_augment, replace=True)

        for idx in aug_indices:
            original_text = class_texts[idx]

            # Apply different augmentation methods
            aug_method = np.random.choice(['shuffle', 'char'], p=[0.7, 0.3])
            augmented_text = augment_text(original_text, method=aug_method)

            # Only add if augmentation actually changed the text
            if augmented_text != original_text:
                X_aug.append(augmented_text)
                y_aug.append(class_id)

    print(f"Generated {len(X_aug)} augmented samples")
    return np.array(X_aug), np.array(y_aug)

# Apply data augmentation if enabled
if USE_AUGMENTATION and len(df) < 1000:  # Only for small datasets
    print("Dataset is small, applying data augmentation...")
    X_aug, y_aug = create_augmented_data(X, y, AUG_RATIO)

    # Combine original and augmented data
    X_combined = np.concatenate([X, X_aug])
    y_combined = np.concatenate([y, y_aug])

    print(f"Original dataset: {len(X)} samples")
    print(f"Augmented dataset: {len(X_combined)} samples")

    # Use combined data for training
    X, y = X_combined, y_combined
else:
    print("Skipping augmentation (dataset size sufficient or disabled)")

# =============================================================================
# 4. STRATIFIED TRAIN-TEST SPLIT WITH CROSS-VALIDATION
# =============================================================================

print("\n4. Creating stratified splits...")

# First, create a hold-out test set (20%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Then split remaining data for train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2,  # 20% of remaining 80% = 16% of total
    stratify=y_temp,
    random_state=42
)

print(f"Train set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Validation set: {len(X_val)} samples ({len(X_val)/len(X)*100:.1f}%)")
print(f"Test set: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")

# Check distributions
train_dist = pd.Series(y_train).value_counts().sort_index()
val_dist = pd.Series(y_val).value_counts().sort_index()
test_dist = pd.Series(y_test).value_counts().sort_index()

print("\nClass distributions:")
for cat_id in range(10):
    train_count = train_dist.get(cat_id, 0)
    val_count = val_dist.get(cat_id, 0)
    test_count = test_dist.get(cat_id, 0)
    print(f"  Class {cat_id}: Train={train_count}, Val={val_count}, Test={test_count}")

# =============================================================================
# 5. COMPUTE CLASS WEIGHTS FOR IMBALANCED DATA
# =============================================================================

print("\n5. Computing class weights...")

class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
print("Class weights:")
for cat_id, weight in class_weight_dict.items():
    if cat_id < len(ENGLISH_NAMES):
        print(f"  {cat_id} ({ENGLISH_NAMES[cat_id]}): {weight:.3f}")

# =============================================================================
# 6. CREATE DATASETS WITH VALIDATION
# =============================================================================

print("\n6. Creating datasets...")

# Create datasets
train_dataset = Dataset.from_dict({
    'text': X_train,
    'labels': y_train
})

val_dataset = Dataset.from_dict({
    'text': X_val,
    'labels': y_val
})

test_dataset = Dataset.from_dict({
    'text': X_test,
    'labels': y_test
})

# Create DatasetDict
datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print("Datasets created successfully")

# =============================================================================
# 6. MODEL AND TOKENIZER SETUP
# =============================================================================

print("\n6. Setting up model and tokenizer...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Tokenizer loaded")

# Setup quantization config with fallback
try:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
    use_quantization = True
    print("4-bit quantization enabled")
except Exception as e:
    print(f"Quantization not available: {e}")
    print("Loading model without quantization...")
    quantization_config = None
    use_quantization = False

# Load model for classification
model_kwargs = {
    "num_labels": 10,  # Number of categories
    "device_map": "auto",
    "trust_remote_code": True
}

if use_quantization and quantization_config is not None:
    model_kwargs["quantization_config"] = quantization_config

try:
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        **model_kwargs
    )
except Exception as e:
    print(f"Error loading model with quantization: {e}")
    print("Retrying without quantization...")
    # Fallback without quantization
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        num_labels=10,
        trust_remote_code=True
    )

print("Base model loaded")

# Apply LoRA with stronger regularization for small datasets
peft_config = LoraConfig(
    r=LORA_R,           # Smaller rank reduces parameters
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,  # Higher dropout
    bias="none",
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

model = get_peft_model(model, peft_config)

# Freeze more layers for small dataset
if len(X_train) < 1000:
    print("Small dataset detected. Applying additional regularization...")

    # Freeze embedding layers
    for name, param in model.named_parameters():
        if 'embed' in name.lower():
            param.requires_grad = False

    # Freeze some transformer layers (freeze bottom layers, train top layers)
    total_layers = len([n for n, p in model.named_parameters() if 'layers.' in n and 'weight' in n])
    layers_to_freeze = total_layers // 3  # Freeze bottom 1/3 of layers

    for name, param in model.named_parameters():
        if 'layers.' in name:
            layer_num = int(name.split('layers.')[1].split('.')[0])
            if layer_num < layers_to_freeze:
                param.requires_grad = False

model.print_trainable_parameters()

# =============================================================================
# 7. TOKENIZATION
# =============================================================================

print("\n7. Tokenizing data...")

def tokenize_function(examples):
    """Tokenize complaints for classification"""
    return tokenizer(
        examples['text'],
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors=None
    )

# Tokenize datasets
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']
)

print("Tokenization completed")

# =============================================================================
# 8. EVALUATION METRICS
# =============================================================================

def compute_metrics(eval_pred):
    """Compute comprehensive evaluation metrics"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Basic metrics
    accuracy = accuracy_score(labels, predictions)

    # Per-class metrics
    precision, recall, f1, support = precision_recall_fscore_support(
        labels, predictions, average=None, zero_division=0
    )

    # Macro and weighted averages
    f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(labels, predictions, average='weighted', zero_division=0)

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'precision_macro': np.mean(precision),
        'recall_macro': np.mean(recall)
    }

# =============================================================================
# 9. TRAINING SETUP
# =============================================================================

print("\n9. Setting up training...")

# Training arguments
training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "checkpoints"),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
    report_to="none",
    dataloader_pin_memory=False,
    bf16=torch.cuda.is_available(),
    optim="paged_adamw_8bit",
    run_name=f"nacc-classification-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
)

# Custom trainer with advanced regularization for small datasets
class AdvancedRegularizedTrainer(Trainer):
    def __init__(self, mixup_alpha=0.2, label_smoothing=0.1, **kwargs):
        super().__init__(**kwargs)
        self.mixup_alpha = mixup_alpha
        self.label_smoothing = label_smoothing

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Apply class weights
        weight_tensor = torch.tensor(list(class_weight_dict.values()),
                                   dtype=torch.float32, device=labels.device)

        # Label smoothing for regularization
        if self.label_smoothing > 0:
            loss_fct = torch.nn.CrossEntropyLoss(
                weight=weight_tensor,
                label_smoothing=self.label_smoothing
            )
        else:
            loss_fct = torch.nn.CrossEntropyLoss(weight=weight_tensor)

        loss = loss_fct(logits, labels)

        # Add L2 regularization to LoRA parameters
        l2_reg = 0
        for name, param in model.named_parameters():
            if 'lora_' in name and param.requires_grad:
                l2_reg += torch.norm(param, p=2)

        loss = loss + 0.01 * l2_reg  # L2 regularization coefficient

        return (loss, outputs) if return_outputs else loss

    def training_step(self, model, inputs):
        """Custom training step with mixup augmentation"""
        model.train()
        inputs = self._prepare_inputs(inputs)

        # Apply mixup with small probability for regularization
        if np.random.random() < 0.1 and self.mixup_alpha > 0:
            inputs = self._apply_mixup(inputs)

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        if self.args.n_gpu > 1:
            loss = loss.mean()

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        loss.backward()
        return loss.detach()

    def _apply_mixup(self, inputs):
        """Apply mixup augmentation"""
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels = inputs['labels']

        batch_size = input_ids.size(0)

        # Generate random indices for mixing
        indices = torch.randperm(batch_size)
        lam = np.random.beta(self.mixup_alpha, self.mixup_alpha)

        # Mix inputs (this is simplified - real mixup for text is complex)
        # For simplicity, we'll just mix the labels and apply stronger regularization
        mixed_labels = lam * labels + (1 - lam) * labels[indices]

        inputs['labels'] = mixed_labels.long()  # Convert back to long for classification
        return inputs

# Enhanced training arguments for small datasets
training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "checkpoints"),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,  # Strong weight decay
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",  # Cosine annealing
    logging_steps=10,
    evaluation_strategy="steps",  # More frequent evaluation
    eval_steps=50,  # Evaluate every 50 steps
    save_strategy="steps",
    save_steps=50,
    save_total_limit=5,  # Keep more checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
    report_to="none",
    dataloader_pin_memory=False,
    bf16=torch.cuda.is_available(),
    fp16=not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8,
    optim="adamw_torch",
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,  # Gradient clipping
    dataloader_drop_last=True,  # Drop incomplete batches
    run_name=f"nacc-classification-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
)

# Create trainer with advanced regularization
trainer = AdvancedRegularizedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use validation set
    compute_metrics=compute_metrics,
    mixup_alpha=0.2 if len(X_train) < 1000 else 0.0,  # Enable mixup for small datasets
    label_smoothing=0.1,  # Label smoothing for regularization
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.001)
    ]
)

print("Trainer created successfully")

# =============================================================================
# 10. TRAINING
# =============================================================================

print("\n10. Starting training...")
print("=" * 50)

# Train the model
training_result = trainer.train()

print("Training completed!")
print(f"Final training loss: {training_result.training_loss:.4f}")

# =============================================================================
# 11. EVALUATION WITH FINAL TEST SET
# =============================================================================

print("\n11. Evaluating model on validation and test sets...")

# Evaluate on validation set (used during training)
val_results = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print("Validation Results:")
for key, value in val_results.items():
    if key.startswith('eval_'):
        print(f"  {key}: {value:.4f}")

# Final evaluation on hold-out test set
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("\nFinal Test Results:")
for key, value in test_results.items():
    if key.startswith('eval_'):
        print(f"  {key}: {value:.4f}")

# Detailed predictions on test set
test_predictions = trainer.predict(tokenized_datasets["test"])
y_pred = np.argmax(test_predictions.predictions, axis=1)
y_true = test_predictions.label_ids

# Classification report
report = classification_report(
    y_true, y_pred,
    target_names=[ENGLISH_NAMES[i] for i in range(10)],
    output_dict=True,
    zero_division=0
)

print("\nDetailed Test Set Classification Report:")
print(classification_report(
    y_true, y_pred,
    target_names=[ENGLISH_NAMES[i] for i in range(10)],
    zero_division=0
))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)

# =============================================================================
# 12. CROSS-VALIDATION FOR ROBUST EVALUATION (OPTIONAL)
# =============================================================================

if USE_CROSS_VALIDATION and len(X_train) < 1000:
    print("\n12. Performing cross-validation for robust evaluation...")

    from sklearn.model_selection import StratifiedKFold

    # Use original training + validation data for CV
    X_cv = np.concatenate([X_train, X_val])
    y_cv = np.concatenate([y_train, y_val])

    skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=42)
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_cv, y_cv)):
        print(f"Training fold {fold+1}/{CV_FOLDS}...")

        # Create fold datasets
        X_fold_train, X_fold_val = X_cv[train_idx], X_cv[val_idx]
        y_fold_train, y_fold_val = y_cv[train_idx], y_cv[val_idx]

        # Create datasets for this fold
        fold_train_dataset = Dataset.from_dict({
            'text': X_fold_train,
            'labels': y_fold_train
        })

        fold_val_dataset = Dataset.from_dict({
            'text': X_fold_val,
            'labels': y_fold_val
        })

        # Tokenize
        fold_train_tokenized = fold_train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
        fold_val_tokenized = fold_val_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

        # Create new model for this fold (reset weights)
        fold_model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_ID,
            num_labels=10,
            trust_remote_code=True
        )
        fold_model = get_peft_model(fold_model, peft_config)

        # Create trainer for this fold
        fold_trainer = AdvancedRegularizedTrainer(
            model=fold_model,
            args=TrainingArguments(
                output_dir=os.path.join(OUTPUT_DIR, f"cv_fold_{fold}"),
                num_train_epochs=NUM_EPOCHS,
                per_device_train_batch_size=BATCH_SIZE,
                per_device_eval_batch_size=BATCH_SIZE,
                learning_rate=LEARNING_RATE,
                weight_decay=WEIGHT_DECAY,
                evaluation_strategy="no",  # No evaluation during CV training
                save_strategy="no",        # Don't save CV models
                logging_steps=1000,        # Reduce logging
                report_to="none",
                bf16=torch.cuda.is_available(),
            ),
            train_dataset=fold_train_tokenized,
            eval_dataset=fold_val_tokenized,
            compute_metrics=compute_metrics,
        )

        # Train fold
        fold_trainer.train()

        # Evaluate fold
        fold_results = fold_trainer.evaluate()
        cv_scores.append(fold_results['eval_f1_macro'])

        print(f"Fold {fold+1} F1-Macro: {fold_results['eval_f1_macro']:.4f}")

        # Clean up memory
        del fold_model, fold_trainer
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

    print(f"\nCross-validation results:")
    print(f"Mean F1-Macro: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
    print(f"Individual folds: {cv_scores}")

    # Add CV results to final results
    cv_results = {
        "cv_f1_macro_mean": np.mean(cv_scores),
        "cv_f1_macro_std": np.std(cv_scores),
        "cv_scores": cv_scores
    }
else:
    print("Skipping cross-validation")
    cv_results = {}

# =============================================================================
# 12. VISUALIZATION AND SAVING RESULTS
# =============================================================================

print("\n12. Saving results and visualizations...")

# Save comprehensive results
results = {
    "training_args": training_args.to_dict(),
    "model_config": {
        "base_model": MODEL_ID,
        "max_length": MAX_LENGTH,
        "lora_config": {
            "r": LORA_R,
            "alpha": LORA_ALPHA,
            "dropout": LORA_DROPOUT
        },
        "regularization": {
            "weight_decay": WEIGHT_DECAY,
            "label_smoothing": 0.1,
            "gradient_clipping": 1.0,
            "layer_freezing": len(X_train) < 1000
        }
    },
    "dataset_info": {
        "total_samples": len(X),
        "train_samples": len(X_train),
        "val_samples": len(X_val),
        "test_samples": len(X_test),
        "augmentation_applied": USE_AUGMENTATION,
        "augmentation_ratio": AUG_RATIO if USE_AUGMENTATION else 0
    },
    "training_results": {
        "final_loss": training_result.training_loss,
        "train_steps": training_result.global_step
    },
    "validation_results": val_results,
    "test_results": test_results,
    "classification_report": report,
    "confusion_matrix": cm.tolist(),
    "class_distribution": {
        "train": train_dist.to_dict(),
        "validation": val_dist.to_dict(),
        "test": test_dist.to_dict()
    },
    "class_weights": class_weight_dict,
    "cross_validation": cv_results,
    "regularization_techniques": [
        "LoRA with reduced rank",
        "Higher dropout rate",
        "Strong weight decay",
        "Label smoothing",
        "Gradient clipping",
        "Early stopping",
        "Layer freezing for small datasets",
        "Data augmentation",
        "Class balancing",
        "Cosine learning rate schedule"
    ],
    "timestamp": datetime.now().isoformat()
}

# Save results
results_file = os.path.join(OUTPUT_DIR, "training_results.json")
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

# Create confusion matrix plot
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=[ENGLISH_NAMES[i] for i in range(10)],
    yticklabels=[ENGLISH_NAMES[i] for i in range(10)]
)
plt.title('Confusion Matrix - NACC Complaint Classification')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=300, bbox_inches='tight')
plt.close()

# Create class distribution plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Train distribution
train_dist.plot(kind='bar', ax=ax1, color='skyblue')
ax1.set_title('Training Set Class Distribution')
ax1.set_xlabel('Category ID')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=0)

# Test distribution
test_dist.plot(kind='bar', ax=ax2, color='lightcoral')
ax2.set_title('Test Set Class Distribution')
ax2.set_xlabel('Category ID')
ax2.set_ylabel('Count')
ax2.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "class_distribution.png"), dpi=300, bbox_inches='tight')
plt.close()

print(f"Visualizations saved to: {OUTPUT_DIR}")

# =============================================================================
# 13. SAVE MODEL
# =============================================================================

print("\n13. Saving trained model...")

# Save model and tokenizer
model_save_path = os.path.join(OUTPUT_DIR, "model")
tokenizer_save_path = os.path.join(OUTPUT_DIR, "tokenizer")

trainer.save_model(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved to: {model_save_path}")
print(f"Tokenizer saved to: {tokenizer_save_path}")

# =============================================================================
# 14. SAVE TEST SET
# =============================================================================

print("\n14. Saving test set for future evaluation...")

# Create test set with predictions
test_df = pd.DataFrame({
    'complaint': X_test,
    'true_category': y_true,
    'predicted_category': y_pred,
    'true_category_thai': [NUMERIC_TO_CATEGORY[cat] for cat in y_true],
    'predicted_category_thai': [NUMERIC_TO_CATEGORY[cat] for cat in y_pred],
    'true_category_english': [ENGLISH_NAMES[cat] for cat in y_true],
    'predicted_category_english': [ENGLISH_NAMES[cat] for cat in y_pred],
    'correct_prediction': y_true == y_pred
})

test_csv_path = os.path.join(OUTPUT_DIR, "test_set_with_predictions.csv")
test_df.to_csv(test_csv_path, index=False, encoding='utf-8')

print(f"Test set with predictions saved to: {test_csv_path}")

# =============================================================================
# 15. FINAL SUMMARY
# =============================================================================

print("\n" + "=" * 80)
print("TRAINING COMPLETED SUCCESSFULLY!")
print("=" * 80)

print(f"\nKey Results:")
print(f"  Accuracy: {eval_results['eval_accuracy']:.3f}")
print(f"  Macro F1: {eval_results['eval_f1_macro']:.3f}")
print(f"  Weighted F1: {eval_results['eval_f1_weighted']:.3f}")

print(f"\nFiles saved to: {OUTPUT_DIR}")
print(f"  - Model: {model_save_path}")
print(f"  - Tokenizer: {tokenizer_save_path}")
print(f"  - Results: {results_file}")
print(f"  - Test set: {test_csv_path}")
print(f"  - Confusion matrix: confusion_matrix.png")
print(f"  - Class distribution: class_distribution.png")

print(f"\nModel ready for deployment and evaluation!")

Installing required packages...
All packages installed successfully!
IMPROVED NACC COMPLAINT CLASSIFICATION FINE-TUNING

1. Setting up category mapping...
Category mapping saved to: /content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/finetune_new/category_mapping.json

2. Loading and preprocessing data...
Error loading dataset: [Errno 2] No such file or directory: '/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/data/Trainset.csv'


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/data/Trainset.csv'

In [None]:
# Improved Fine-tuning Code for NACC Complaint Classification

# Install required packages
import subprocess
import sys

def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install required packages
print("Installing required packages...")
try:
    install_package("bitsandbytes")
    install_package("peft")
    install_package("accelerate")
    install_package("transformers>=4.35.0")
    install_package("datasets")
    install_package("scikit-learn")
    install_package("matplotlib")
    install_package("seaborn")
    print("All packages installed successfully!")
except Exception as e:
    print(f"Error installing packages: {e}")

import os
import json
import pandas as pd
import numpy as np
import torch
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Paths
DATASET_PATH = "/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/data/Trainset.csv"
OUTPUT_DIR = "/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/finetune_new"

# Model configuration
MODEL_ID = "scb10x/llama3.2-typhoon2-3b-instruct"
MAX_LENGTH = 512  # Shorter sequences for efficiency
BATCH_SIZE = 8    # Larger batch size
GRADIENT_ACCUMULATION = 2
NUM_EPOCHS = 5    # More epochs for gradual learning
LEARNING_RATE = 1e-5  # Even lower learning rate
LORA_R = 8        # Smaller rank to reduce overfitting
LORA_ALPHA = 16   # Adjusted alpha
LORA_DROPOUT = 0.2  # Higher dropout
WEIGHT_DECAY = 0.1  # Strong regularization

# Data augmentation parameters
AUG_RATIO = 0.3   # 30% of data will be augmented
USE_AUGMENTATION = True
USE_CROSS_VALIDATION = True
CV_FOLDS = 5

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("=" * 80)
print("IMPROVED NACC COMPLAINT CLASSIFICATION FINE-TUNING")
print("=" * 80)

# =============================================================================
# 1. CATEGORY MAPPING
# =============================================================================

print("\n1. Setting up category mapping...")

CATEGORY_MAPPING = {
    "ปฏิบัติหรือละเว้นการปฏิบัติหน้าที่โดยมิชอบ": 0,
    "ทุจริตในการจัดทำงบประมาณ/โครงการ/เบิกจ่ายเงินในโครงการเป็นเท็จ": 1,
    "จัดซื้อจัดจ้าง": 2,
    "ออกเอกสารสิทธิที่ดิน": 3,
    "ยักยอก/เบียดบังเงินหรือทรัพย์สินของราชการ": 4,
    "การบริหารงานบุคคล (การบรรจุ/แต่งตั้ง/เลื่อนตำแหน่ง/โยกย้าย/ลงโทษวินัย)": 5,
    "ร่ำรวยผิดปกติ": 6,
    "เรียกรับสินบน": 7,
    "การขัดกันระหว่างประโยชน์ส่วนบุคคลกับประโยชน์ส่วนรวม": 8,
    "ก่าเกื้นจริยธรรม": 9
}

NUMERIC_TO_CATEGORY = {v: k for k, v in CATEGORY_MAPPING.items()}

ENGLISH_NAMES = {
    0: "Abuse of Power",
    1: "Budget/Project Fraud",
    2: "Procurement Fraud",
    3: "Fraudulent Land Title",
    4: "Embezzlement",
    5: "Personnel Misconduct",
    6: "Unusual Wealth",
    7: "Bribery",
    8: "Conflict of Interest",
    9: "Ethical Misconduct"
}

# Save mapping
mapping_file = os.path.join(OUTPUT_DIR, "category_mapping.json")
with open(mapping_file, 'w', encoding='utf-8') as f:
    json.dump({
        "thai_to_numeric": CATEGORY_MAPPING,
        "numeric_to_thai": NUMERIC_TO_CATEGORY,
        "english_names": ENGLISH_NAMES
    }, f, ensure_ascii=False, indent=2)

print(f"Category mapping saved to: {mapping_file}")

# =============================================================================
# 2. DATA LOADING AND PREPROCESSING
# =============================================================================

print("\n2. Loading and preprocessing data...")

# Load data
try:
    df = pd.read_csv(DATASET_PATH, encoding='utf-8')
    print(f"Loaded dataset with {len(df)} samples")
    print(f"Columns: {list(df.columns)}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Check data structure
print("\nData structure:")
print(df.head())
print(f"\nDataset shape: {df.shape}")

# Map categories to numeric
df['category_numeric'] = df['category'].map(CATEGORY_MAPPING)

# Check for unmapped categories
unmapped = df[df['category_numeric'].isna()]
if len(unmapped) > 0:
    print(f"Warning: {len(unmapped)} unmapped categories found:")
    print(unmapped['category'].unique())
    df = df.dropna(subset=['category_numeric'])

# Convert to int
df['category_numeric'] = df['category_numeric'].astype(int)

print(f"Final dataset size: {len(df)}")

# Extract features and labels
X = df['complaint'].values
y = df['category_numeric'].values

# Check class distribution
class_dist = df['category_numeric'].value_counts().sort_index()
print("\nClass distribution:")
for cat_id, count in class_dist.items():
    english_name = ENGLISH_NAMES[cat_id]
    percentage = (count / len(df)) * 100
    print(f"{cat_id}: {english_name} - {count} samples ({percentage:.1f}%)")

# =============================================================================
# 3. DATA AUGMENTATION FOR SMALL DATASET
# =============================================================================

print("\n3. Implementing data augmentation...")

def simple_back_translation_augment(text):
    """Simple augmentation through word shuffling and synonym replacement"""
    import random

    # Simple word shuffling (keeping sentence structure)
    sentences = text.split('.')
    augmented_sentences = []

    for sentence in sentences:
        words = sentence.strip().split()
        if len(words) > 3:
            # Shuffle middle words occasionally
            if random.random() < 0.3:
                middle = words[1:-1]
                random.shuffle(middle)
                words = [words[0]] + middle + [words[-1]]
        augmented_sentences.append(' '.join(words))

    return '.'.join(augmented_sentences)

def character_level_augment(text):
    """Character-level augmentation for Thai text"""
    import random

    chars = list(text)
    if len(chars) > 10:
        # Random character swapping (very conservative)
        if random.random() < 0.1:
            idx1, idx2 = random.sample(range(1, len(chars)-1), 2)
            chars[idx1], chars[idx2] = chars[idx2], chars[idx1]

    return ''.join(chars)

def augment_text(text, method='shuffle'):
    """Apply text augmentation"""
    if method == 'shuffle':
        return simple_back_translation_augment(text)
    elif method == 'char':
        return character_level_augment(text)
    else:
        return text

def create_augmented_data(X, y, augmentation_ratio=0.3):
    """Create augmented dataset"""
    print(f"Creating augmented data with ratio: {augmentation_ratio}")

    # Calculate samples to augment per class
    unique_classes, class_counts = np.unique(y, return_counts=True)

    X_aug = []
    y_aug = []

    for class_id in unique_classes:
        class_indices = np.where(y == class_id)[0]
        class_texts = X[class_indices]

        # Calculate how many samples to augment for this class
        current_count = len(class_indices)
        target_augment = int(current_count * augmentation_ratio)

        # Randomly select samples to augment
        aug_indices = np.random.choice(len(class_texts), size=target_augment, replace=True)

        for idx in aug_indices:
            original_text = class_texts[idx]

            # Apply different augmentation methods
            aug_method = np.random.choice(['shuffle', 'char'], p=[0.7, 0.3])
            augmented_text = augment_text(original_text, method=aug_method)

            # Only add if augmentation actually changed the text
            if augmented_text != original_text:
                X_aug.append(augmented_text)
                y_aug.append(class_id)

    print(f"Generated {len(X_aug)} augmented samples")
    return np.array(X_aug), np.array(y_aug)

# Apply data augmentation if enabled
if USE_AUGMENTATION and len(df) < 1000:  # Only for small datasets
    print("Dataset is small, applying data augmentation...")
    X_aug, y_aug = create_augmented_data(X, y, AUG_RATIO)

    # Combine original and augmented data
    X_combined = np.concatenate([X, X_aug])
    y_combined = np.concatenate([y, y_aug])

    print(f"Original dataset: {len(X)} samples")
    print(f"Augmented dataset: {len(X_combined)} samples")

    # Use combined data for training
    X, y = X_combined, y_combined
else:
    print("Skipping augmentation (dataset size sufficient or disabled)")

# =============================================================================
# 4. STRATIFIED TRAIN-TEST SPLIT WITH CROSS-VALIDATION
# =============================================================================

print("\n4. Creating stratified splits...")

# First, create a hold-out test set (20%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Then split remaining data for train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2,  # 20% of remaining 80% = 16% of total
    stratify=y_temp,
    random_state=42
)

print(f"Train set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Validation set: {len(X_val)} samples ({len(X_val)/len(X)*100:.1f}%)")
print(f"Test set: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")

# Check distributions
train_dist = pd.Series(y_train).value_counts().sort_index()
val_dist = pd.Series(y_val).value_counts().sort_index()
test_dist = pd.Series(y_test).value_counts().sort_index()

print("\nClass distributions:")
for cat_id in range(10):
    train_count = train_dist.get(cat_id, 0)
    val_count = val_dist.get(cat_id, 0)
    test_count = test_dist.get(cat_id, 0)
    print(f"  Class {cat_id}: Train={train_count}, Val={val_count}, Test={test_count}")

# =============================================================================
# 5. COMPUTE CLASS WEIGHTS FOR IMBALANCED DATA
# =============================================================================

print("\n5. Computing class weights...")

class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
print("Class weights:")
for cat_id, weight in class_weight_dict.items():
    if cat_id < len(ENGLISH_NAMES):
        print(f"  {cat_id} ({ENGLISH_NAMES[cat_id]}): {weight:.3f}")

# =============================================================================
# 6. CREATE DATASETS WITH VALIDATION
# =============================================================================

print("\n6. Creating datasets...")

# Create datasets
train_dataset = Dataset.from_dict({
    'text': X_train,
    'labels': y_train
})

val_dataset = Dataset.from_dict({
    'text': X_val,
    'labels': y_val
})

test_dataset = Dataset.from_dict({
    'text': X_test,
    'labels': y_test
})

# Create DatasetDict
datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print("Datasets created successfully")

# =============================================================================
# 7. MODEL AND TOKENIZER SETUP
# =============================================================================

print("\n7. Setting up model and tokenizer...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Tokenizer loaded")

# Setup quantization config with fallback
try:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
    use_quantization = True
    print("4-bit quantization enabled")
except Exception as e:
    print(f"Quantization not available: {e}")
    print("Loading model without quantization...")
    quantization_config = None
    use_quantization = False

# Load model for classification
model_kwargs = {
    "num_labels": 10,  # Number of categories
    "device_map": "auto",
    "trust_remote_code": True
}

if use_quantization and quantization_config is not None:
    model_kwargs["quantization_config"] = quantization_config

try:
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        **model_kwargs
    )
except Exception as e:
    print(f"Error loading model with quantization: {e}")
    print("Retrying without quantization...")
    # Fallback without quantization
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        num_labels=10,
        trust_remote_code=True
    )

print("Base model loaded")

# Apply LoRA with stronger regularization for small datasets
peft_config = LoraConfig(
    r=LORA_R,           # Smaller rank reduces parameters
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,  # Higher dropout
    bias="none",
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

model = get_peft_model(model, peft_config)

# Freeze more layers for small dataset
if len(X_train) < 1000:
    print("Small dataset detected. Applying additional regularization...")

    # Freeze embedding layers
    for name, param in model.named_parameters():
        if 'embed' in name.lower():
            param.requires_grad = False

    # Freeze some transformer layers (freeze bottom layers, train top layers)
    total_layers = len([n for n, p in model.named_parameters() if 'layers.' in n and 'weight' in n])
    layers_to_freeze = total_layers // 3  # Freeze bottom 1/3 of layers

    for name, param in model.named_parameters():
        if 'layers.' in name:
            layer_num = int(name.split('layers.')[1].split('.')[0])
            if layer_num < layers_to_freeze:
                param.requires_grad = False

model.print_trainable_parameters()

# =============================================================================
# 8. TOKENIZATION
# =============================================================================

print("\n8. Tokenizing data...")

def tokenize_function(examples):
    """Tokenize complaints for classification"""
    return tokenizer(
        examples['text'],
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors=None
    )

# Tokenize datasets
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']
)

print("Tokenization completed")

# =============================================================================
# 9. EVALUATION METRICS
# =============================================================================

def compute_metrics(eval_pred):
    """Compute comprehensive evaluation metrics"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Basic metrics
    accuracy = accuracy_score(labels, predictions)

    # Per-class metrics
    precision, recall, f1, support = precision_recall_fscore_support(
        labels, predictions, average=None, zero_division=0
    )

    # Macro and weighted averages
    f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(labels, predictions, average='weighted', zero_division=0)

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'precision_macro': np.mean(precision),
        'recall_macro': np.mean(recall)
    }

# =============================================================================
# 10. TRAINING SETUP
# =============================================================================

print("\n10. Setting up training...")

# Custom trainer with advanced regularization for small datasets
class AdvancedRegularizedTrainer(Trainer):
    def __init__(self, class_weights=None, mixup_alpha=0.2, label_smoothing=0.1, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights
        self.mixup_alpha = mixup_alpha
        self.label_smoothing = label_smoothing

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Apply class weights
        if self.class_weights is not None:
            weight_tensor = torch.tensor(list(self.class_weights.values()),
                                       dtype=torch.float32, device=labels.device)
        else:
            weight_tensor = None

        # Label smoothing for regularization
        if self.label_smoothing > 0:
            loss_fct = torch.nn.CrossEntropyLoss(
                weight=weight_tensor,
                label_smoothing=self.label_smoothing
            )
        else:
            loss_fct = torch.nn.CrossEntropyLoss(weight=weight_tensor)

        loss = loss_fct(logits, labels)

        # Add L2 regularization to LoRA parameters
        l2_reg = 0
        for name, param in model.named_parameters():
            if 'lora_' in name and param.requires_grad:
                l2_reg += torch.norm(param, p=2)

        loss = loss + 0.01 * l2_reg  # L2 regularization coefficient

        return (loss, outputs) if return_outputs else loss

# Enhanced training arguments for small datasets
# Using eval_strategy instead of evaluation_strategy for compatibility
training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "checkpoints"),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=10,
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
    report_to="none",
    dataloader_pin_memory=False,
    bf16=torch.cuda.is_available(),
    fp16=not torch.cuda.is_available(),
    optim="adamw_torch",
    max_grad_norm=1.0,
    dataloader_drop_last=True,
    run_name=f"nacc-classification-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
)

# Create trainer with advanced regularization
trainer = AdvancedRegularizedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use validation set
    compute_metrics=compute_metrics,
    class_weights=class_weight_dict,
    mixup_alpha=0.2 if len(X_train) < 1000 else 0.0,  # Enable mixup for small datasets
    label_smoothing=0.1,  # Label smoothing for regularization
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.001)
    ]
)

print("Trainer created successfully")

# =============================================================================
# 11. TRAINING
# =============================================================================

print("\n11. Starting training...")
print("=" * 50)

# Train the model
training_result = trainer.train()

print("Training completed!")
print(f"Final training loss: {training_result.training_loss:.4f}")

# =============================================================================
# 12. EVALUATION WITH FINAL TEST SET
# =============================================================================

print("\n12. Evaluating model on validation and test sets...")

# Evaluate on validation set (used during training)
val_results = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print("Validation Results:")
for key, value in val_results.items():
    if key.startswith('eval_'):
        print(f"  {key}: {value:.4f}")

# Final evaluation on hold-out test set
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("\nFinal Test Results:")
for key, value in test_results.items():
    if key.startswith('eval_'):
        print(f"  {key}: {value:.4f}")

# Detailed predictions on test set
test_predictions = trainer.predict(tokenized_datasets["test"])
y_pred = np.argmax(test_predictions.predictions, axis=1)
y_true = test_predictions.label_ids

# Classification report
report = classification_report(
    y_true, y_pred,
    target_names=[ENGLISH_NAMES[i] for i in range(10)],
    output_dict=True,
    zero_division=0
)

print("\nDetailed Test Set Classification Report:")
print(classification_report(
    y_true, y_pred,
    target_names=[ENGLISH_NAMES[i] for i in range(10)],
    zero_division=0
))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)

# =============================================================================
# 13. CROSS-VALIDATION FOR ROBUST EVALUATION (OPTIONAL)
# =============================================================================

if USE_CROSS_VALIDATION and len(X_train) < 1000:
    print("\n13. Performing cross-validation for robust evaluation...")

    # Use original training + validation data for CV
    X_cv = np.concatenate([X_train, X_val])
    y_cv = np.concatenate([y_train, y_val])

    skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=42)
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_cv, y_cv)):
        print(f"Training fold {fold+1}/{CV_FOLDS}...")

        # Create fold datasets
        X_fold_train, X_fold_val = X_cv[train_idx], X_cv[val_idx]
        y_fold_train, y_fold_val = y_cv[train_idx], y_cv[val_idx]

        # Create datasets for this fold
        fold_train_dataset = Dataset.from_dict({
            'text': X_fold_train,
            'labels': y_fold_train
        })

        fold_val_dataset = Dataset.from_dict({
            'text': X_fold_val,
            'labels': y_fold_val
        })

        # Tokenize
        fold_train_tokenized = fold_train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
        fold_val_tokenized = fold_val_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

        # Create new model for this fold (reset weights)
        fold_model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_ID,
            num_labels=10,
            trust_remote_code=True
        )
        fold_model = get_peft_model(fold_model, peft_config)

        # Create trainer for this fold
        fold_training_args = TrainingArguments(
            output_dir=os.path.join(OUTPUT_DIR, f"cv_fold_{fold}"),
            num_train_epochs=NUM_EPOCHS,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            learning_rate=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY,
            eval_strategy="no",  # Changed from evaluation_strategy
            save_strategy="no",
            logging_steps=1000,
            report_to="none",
            bf16=torch.cuda.is_available(),
        )

        fold_trainer = AdvancedRegularizedTrainer(
            model=fold_model,
            args=fold_training_args,
            train_dataset=fold_train_tokenized,
            eval_dataset=fold_val_tokenized,
            compute_metrics=compute_metrics,
            class_weights=class_weight_dict
        )

        # Train fold
        fold_trainer.train()

        # Evaluate fold
        fold_results = fold_trainer.evaluate()
        cv_scores.append(fold_results['eval_f1_macro'])

        print(f"Fold {fold+1} F1-Macro: {fold_results['eval_f1_macro']:.4f}")

        # Clean up memory
        del fold_model, fold_trainer
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

    print(f"\nCross-validation results:")
    print(f"Mean F1-Macro: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
    print(f"Individual folds: {cv_scores}")

    # Add CV results to final results
    cv_results = {
        "cv_f1_macro_mean": np.mean(cv_scores),
        "cv_f1_macro_std": np.std(cv_scores),
        "cv_scores": cv_scores
    }
else:
    print("\n13. Skipping cross-validation")
    cv_results = {}

# =============================================================================
# 14. VISUALIZATION AND SAVING RESULTS
# =============================================================================

print("\n14. Saving results and visualizations...")

# Save comprehensive results
results = {
    "training_args": training_args.to_dict(),
    "model_config": {
        "base_model": MODEL_ID,
        "max_length": MAX_LENGTH,
        "lora_config": {
            "r": LORA_R,
            "alpha": LORA_ALPHA,
            "dropout": LORA_DROPOUT
        },
        "regularization": {
            "weight_decay": WEIGHT_DECAY,
            "label_smoothing": 0.1,
            "gradient_clipping": 1.0,
            "layer_freezing": len(X_train) < 1000
        }
    },
    "dataset_info": {
        "total_samples": len(X),
        "train_samples": len(X_train),
        "val_samples": len(X_val),
        "test_samples": len(X_test),
        "augmentation_applied": USE_AUGMENTATION,
        "augmentation_ratio": AUG_RATIO if USE_AUGMENTATION else 0
    },
    "training_results": {
        "final_loss": training_result.training_loss,
        "train_steps": training_result.global_step
    },
    "validation_results": val_results,
    "test_results": test_results,
    "classification_report": report,
    "confusion_matrix": cm.tolist(),
    "class_distribution": {
        "train": train_dist.to_dict(),
        "validation": val_dist.to_dict(),
        "test": test_dist.to_dict()
    },
    "class_weights": class_weight_dict,
    "cross_validation": cv_results,
    "timestamp": datetime.now().isoformat()
}

# Save results
results_file = os.path.join(OUTPUT_DIR, "training_results.json")
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Results saved to: {results_file}")

# Create confusion matrix plot
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=[ENGLISH_NAMES[i] for i in range(10)],
    yticklabels=[ENGLISH_NAMES[i] for i in range(10)]
)
plt.title('Confusion Matrix - NACC Complaint Classification')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=300, bbox_inches='tight')
plt.close()

# Create class distribution plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Train distribution
train_dist.plot(kind='bar', ax=ax1, color='skyblue')
ax1.set_title('Training Set Class Distribution')
ax1.set_xlabel('Category ID')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=0)

# Test distribution
test_dist.plot(kind='bar', ax=ax2, color='lightcoral')
ax2.set_title('Test Set Class Distribution')
ax2.set_xlabel('Category ID')
ax2.set_ylabel('Count')
ax2.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "class_distribution.png"), dpi=300, bbox_inches='tight')
plt.close()

print(f"Visualizations saved to: {OUTPUT_DIR}")

# =============================================================================
# 15. SAVE MODEL
# =============================================================================

print("\n15. Saving trained model...")

# Save model and tokenizer
model_save_path = os.path.join(OUTPUT_DIR, "model")
tokenizer_save_path = os.path.join(OUTPUT_DIR, "tokenizer")

trainer.save_model(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved to: {model_save_path}")
print(f"Tokenizer saved to: {tokenizer_save_path}")

# =============================================================================
# 16. SAVE TEST SET WITH PREDICTIONS
# =============================================================================

print("\n16. Saving test set with predictions for future evaluation...")

# Create test set with predictions
test_df = pd.DataFrame({
    'complaint': X_test,
    'true_category': y_true,
    'predicted_category': y_pred,
    'true_category_thai': [NUMERIC_TO_CATEGORY[cat] for cat in y_true],
    'predicted_category_thai': [NUMERIC_TO_CATEGORY[cat] for cat in y_pred],
    'true_category_english': [ENGLISH_NAMES[cat] for cat in y_true],
    'predicted_category_english': [ENGLISH_NAMES[cat] for cat in y_pred],
    'correct_prediction': y_true == y_pred
})

test_csv_path = os.path.join(OUTPUT_DIR, "test_set_with_predictions.csv")
test_df.to_csv(test_csv_path, index=False, encoding='utf-8')

print(f"Test set with predictions saved to: {test_csv_path}")

# =============================================================================
# 17. FINAL SUMMARY
# =============================================================================

print("\n" + "=" * 80)
print("TRAINING COMPLETED SUCCESSFULLY!")
print("=" * 80)

print(f"\nKey Results:")
print(f"  Accuracy: {test_results['eval_accuracy']:.3f}")
print(f"  Macro F1: {test_results['eval_f1_macro']:.3f}")
print(f"  Weighted F1: {test_results['eval_f1_weighted']:.3f}")

print(f"\nFiles saved to: {OUTPUT_DIR}")
print(f"  - Model: {model_save_path}")
print(f"  - Tokenizer: {tokenizer_save_path}")
print(f"  - Results: {results_file}")
print(f"  - Test set: {test_csv_path}")
print(f"  - Confusion matrix: confusion_matrix.png")
print(f"  - Class distribution: class_distribution.png")

print(f"\nModel ready for deployment and evaluation!")

# =============================================================================
# 18. INFERENCE FUNCTION (FOR TESTING)
# =============================================================================

def predict_complaint(text, model=model, tokenizer=tokenizer):
    """
    Predict the category of a complaint text

    Args:
        text: Input complaint text
        model: Trained model
        tokenizer: Tokenizer

    Returns:
        Dictionary with prediction results
    """
    # Tokenize input
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    # Move to device
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=-1)

    # Get top prediction
    pred_idx = torch.argmax(probs, dim=-1).item()
    confidence = probs[0, pred_idx].item()

    # Get top 3 predictions
    top3_probs, top3_indices = torch.topk(probs[0], k=min(3, len(probs[0])))

    top3_predictions = []
    for prob, idx in zip(top3_probs, top3_indices):
        top3_predictions.append({
            "category_id": idx.item(),
            "category_thai": NUMERIC_TO_CATEGORY[idx.item()],
            "category_english": ENGLISH_NAMES[idx.item()],
            "confidence": prob.item()
        })

    return {
        "predicted_category_id": pred_idx,
        "predicted_category_thai": NUMERIC_TO_CATEGORY[pred_idx],
        "predicted_category_english": ENGLISH_NAMES[pred_idx],
        "confidence": confidence,
        "top3_predictions": top3_predictions
    }

# Test the inference function
print("\n" + "=" * 80)
print("TESTING INFERENCE FUNCTION")
print("=" * 80)

# Example test
test_text = "มีการเรียกรับเงินสินบนในการอนุมัติโครงการก่อสร้าง"
print(f"\nTest text: {test_text}")

result = predict_complaint(test_text)
print(f"\nPrediction: {result['predicted_category_thai']}")
print(f"English: {result['predicted_category_english']}")
print(f"Confidence: {result['confidence']:.2%}")

print("\nTop 3 predictions:")
for i, pred in enumerate(result['top3_predictions'], 1):
    print(f"  {i}. {pred['category_english']} ({pred['confidence']:.2%})")

print("\n" + "=" * 80)
print("ALL PROCESSES COMPLETED SUCCESSFULLY!")
print("=" * 80)

Installing required packages...
All packages installed successfully!
IMPROVED NACC COMPLAINT CLASSIFICATION FINE-TUNING

1. Setting up category mapping...
Category mapping saved to: /content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/finetune_new/category_mapping.json

2. Loading and preprocessing data...
Loaded dataset with 800 samples
Columns: ['complaint', 'category']

Data structure:
                                           complaint  \
0  คณะผู้บริหารองค์การบริหารและสมาชิกสภาตำบลท่าดอ...   
1  จัดซื้อจัดจ้างโดยคัดเลือกจากผู้รับจ้างที่ผู้ถู...   
2  ผู้ถูกร้องใช้อำนาจหน้าที่กระทำการเอื้อประโยชน์...   
3  เบียดบังเงินของวัดโคกเข็ม รวมจำนวน ๓,๑๔๙,๙๙๙ บ...   
4  ดำเนินการขุดบ่อน้ำ เพื่อแก้ไขปัญหาภัยแล้งในเขต...   

                                            category  
0  การขัดกันระหว่างประโยชน์ส่วนบุคคลกับประโยชน์ส่...  
1  การขัดกันระหว่างประโยชน์ส่วนบุคคลกับประโยชน์ส่...  
2  การขัดกันระหว่างประโยชน์ส่วนบุคคลกับประโยชน์ส่...  
3  การขัดกันระหว่างปร

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at scb10x/llama3.2-typhoon2-3b-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model loaded
Small dataset detected. Applying additional regularization...
trainable params: 30,720 || all params: 3,224,968,192 || trainable%: 0.0010

8. Tokenizing data...


Map:   0%|          | 0/567 [00:00<?, ? examples/s]

Map:   0%|          | 0/142 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

Tokenization completed

10. Setting up training...
Trainer created successfully

11. Starting training...


TypeError: AdvancedRegularizedTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'

In [None]:
# Improved Fine-tuning Code for NACC Complaint Classification

# Install required packages
import subprocess
import sys

def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install required packages
print("Installing required packages...")
try:
    install_package("bitsandbytes")
    install_package("peft")
    install_package("accelerate")
    install_package("transformers>=4.35.0")
    install_package("datasets")
    install_package("scikit-learn")
    install_package("matplotlib")
    install_package("seaborn")
    print("All packages installed successfully!")
except Exception as e:
    print(f"Error installing packages: {e}")

import os
import json
import pandas as pd
import numpy as np
import torch
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Paths
DATASET_PATH = "/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/data/Trainset.csv"
OUTPUT_DIR = "/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/finetune_new"

# Model configuration
MODEL_ID = "scb10x/llama3.2-typhoon2-3b-instruct"
MAX_LENGTH = 512  # Shorter sequences for efficiency
BATCH_SIZE = 8    # Larger batch size
GRADIENT_ACCUMULATION = 2
NUM_EPOCHS = 5    # More epochs for gradual learning
LEARNING_RATE = 1e-5  # Even lower learning rate
LORA_R = 8        # Smaller rank to reduce overfitting
LORA_ALPHA = 16   # Adjusted alpha
LORA_DROPOUT = 0.2  # Higher dropout
WEIGHT_DECAY = 0.1  # Strong regularization

# Data augmentation parameters
AUG_RATIO = 0.3   # 30% of data will be augmented
USE_AUGMENTATION = True
USE_CROSS_VALIDATION = True
CV_FOLDS = 5

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("=" * 80)
print("IMPROVED NACC COMPLAINT CLASSIFICATION FINE-TUNING")
print("=" * 80)

# =============================================================================
# 1. CATEGORY MAPPING
# =============================================================================

print("\n1. Setting up category mapping...")

CATEGORY_MAPPING = {
    "ปฏิบัติหรือละเว้นการปฏิบัติหน้าที่โดยมิชอบ": 0,
    "ทุจริตในการจัดทำงบประมาณ/โครงการ/เบิกจ่ายเงินในโครงการเป็นเท็จ": 1,
    "จัดซื้อจัดจ้าง": 2,
    "ออกเอกสารสิทธิที่ดิน": 3,
    "ยักยอก/เบียดบังเงินหรือทรัพย์สินของราชการ": 4,
    "การบริหารงานบุคคล (การบรรจุ/แต่งตั้ง/เลื่อนตำแหน่ง/โยกย้าย/ลงโทษวินัย)": 5,
    "ร่ำรวยผิดปกติ": 6,
    "เรียกรับสินบน": 7,
    "การขัดกันระหว่างประโยชน์ส่วนบุคคลกับประโยชน์ส่วนรวม": 8,
    "ก่าเกื้นจริยธรรม": 9
}

NUMERIC_TO_CATEGORY = {v: k for k, v in CATEGORY_MAPPING.items()}

ENGLISH_NAMES = {
    0: "Abuse of Power",
    1: "Budget/Project Fraud",
    2: "Procurement Fraud",
    3: "Fraudulent Land Title",
    4: "Embezzlement",
    5: "Personnel Misconduct",
    6: "Unusual Wealth",
    7: "Bribery",
    8: "Conflict of Interest",
    9: "Ethical Misconduct"
}

# Save mapping
mapping_file = os.path.join(OUTPUT_DIR, "category_mapping.json")
with open(mapping_file, 'w', encoding='utf-8') as f:
    json.dump({
        "thai_to_numeric": CATEGORY_MAPPING,
        "numeric_to_thai": NUMERIC_TO_CATEGORY,
        "english_names": ENGLISH_NAMES
    }, f, ensure_ascii=False, indent=2)

print(f"Category mapping saved to: {mapping_file}")

# =============================================================================
# 2. DATA LOADING AND PREPROCESSING
# =============================================================================

print("\n2. Loading and preprocessing data...")

# Load data
try:
    df = pd.read_csv(DATASET_PATH, encoding='utf-8')
    print(f"Loaded dataset with {len(df)} samples")
    print(f"Columns: {list(df.columns)}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Check data structure
print("\nData structure:")
print(df.head())
print(f"\nDataset shape: {df.shape}")

# Map categories to numeric
df['category_numeric'] = df['category'].map(CATEGORY_MAPPING)

# Check for unmapped categories
unmapped = df[df['category_numeric'].isna()]
if len(unmapped) > 0:
    print(f"Warning: {len(unmapped)} unmapped categories found:")
    print(unmapped['category'].unique())
    df = df.dropna(subset=['category_numeric'])

# Convert to int
df['category_numeric'] = df['category_numeric'].astype(int)

print(f"Final dataset size: {len(df)}")

# Extract features and labels
X = df['complaint'].values
y = df['category_numeric'].values

# Check class distribution
class_dist = df['category_numeric'].value_counts().sort_index()
print("\nClass distribution:")
for cat_id, count in class_dist.items():
    english_name = ENGLISH_NAMES[cat_id]
    percentage = (count / len(df)) * 100
    print(f"{cat_id}: {english_name} - {count} samples ({percentage:.1f}%)")

# =============================================================================
# 3. DATA AUGMENTATION FOR SMALL DATASET
# =============================================================================

print("\n3. Implementing data augmentation...")

def simple_back_translation_augment(text):
    """Simple augmentation through word shuffling and synonym replacement"""
    import random

    # Simple word shuffling (keeping sentence structure)
    sentences = text.split('.')
    augmented_sentences = []

    for sentence in sentences:
        words = sentence.strip().split()
        if len(words) > 3:
            # Shuffle middle words occasionally
            if random.random() < 0.3:
                middle = words[1:-1]
                random.shuffle(middle)
                words = [words[0]] + middle + [words[-1]]
        augmented_sentences.append(' '.join(words))

    return '.'.join(augmented_sentences)

def character_level_augment(text):
    """Character-level augmentation for Thai text"""
    import random

    chars = list(text)
    if len(chars) > 10:
        # Random character swapping (very conservative)
        if random.random() < 0.1:
            idx1, idx2 = random.sample(range(1, len(chars)-1), 2)
            chars[idx1], chars[idx2] = chars[idx2], chars[idx1]

    return ''.join(chars)

def augment_text(text, method='shuffle'):
    """Apply text augmentation"""
    if method == 'shuffle':
        return simple_back_translation_augment(text)
    elif method == 'char':
        return character_level_augment(text)
    else:
        return text

def create_augmented_data(X, y, augmentation_ratio=0.3):
    """Create augmented dataset"""
    print(f"Creating augmented data with ratio: {augmentation_ratio}")

    # Calculate samples to augment per class
    unique_classes, class_counts = np.unique(y, return_counts=True)

    X_aug = []
    y_aug = []

    for class_id in unique_classes:
        class_indices = np.where(y == class_id)[0]
        class_texts = X[class_indices]

        # Calculate how many samples to augment for this class
        current_count = len(class_indices)
        target_augment = int(current_count * augmentation_ratio)

        # Randomly select samples to augment
        aug_indices = np.random.choice(len(class_texts), size=target_augment, replace=True)

        for idx in aug_indices:
            original_text = class_texts[idx]

            # Apply different augmentation methods
            aug_method = np.random.choice(['shuffle', 'char'], p=[0.7, 0.3])
            augmented_text = augment_text(original_text, method=aug_method)

            # Only add if augmentation actually changed the text
            if augmented_text != original_text:
                X_aug.append(augmented_text)
                y_aug.append(class_id)

    print(f"Generated {len(X_aug)} augmented samples")
    return np.array(X_aug), np.array(y_aug)

# Apply data augmentation if enabled
if USE_AUGMENTATION and len(df) < 1000:  # Only for small datasets
    print("Dataset is small, applying data augmentation...")
    X_aug, y_aug = create_augmented_data(X, y, AUG_RATIO)

    # Combine original and augmented data
    X_combined = np.concatenate([X, X_aug])
    y_combined = np.concatenate([y, y_aug])

    print(f"Original dataset: {len(X)} samples")
    print(f"Augmented dataset: {len(X_combined)} samples")

    # Use combined data for training
    X, y = X_combined, y_combined
else:
    print("Skipping augmentation (dataset size sufficient or disabled)")

# =============================================================================
# 4. STRATIFIED TRAIN-TEST SPLIT WITH CROSS-VALIDATION
# =============================================================================

print("\n4. Creating stratified splits...")

# First, create a hold-out test set (20%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Then split remaining data for train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2,  # 20% of remaining 80% = 16% of total
    stratify=y_temp,
    random_state=42
)

print(f"Train set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Validation set: {len(X_val)} samples ({len(X_val)/len(X)*100:.1f}%)")
print(f"Test set: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")

# Check distributions
train_dist = pd.Series(y_train).value_counts().sort_index()
val_dist = pd.Series(y_val).value_counts().sort_index()
test_dist = pd.Series(y_test).value_counts().sort_index()

print("\nClass distributions:")
for cat_id in range(10):
    train_count = train_dist.get(cat_id, 0)
    val_count = val_dist.get(cat_id, 0)
    test_count = test_dist.get(cat_id, 0)
    print(f"  Class {cat_id}: Train={train_count}, Val={val_count}, Test={test_count}")

# =============================================================================
# 5. COMPUTE CLASS WEIGHTS FOR IMBALANCED DATA
# =============================================================================

print("\n5. Computing class weights...")

class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
print("Class weights:")
for cat_id, weight in class_weight_dict.items():
    if cat_id < len(ENGLISH_NAMES):
        print(f"  {cat_id} ({ENGLISH_NAMES[cat_id]}): {weight:.3f}")

# =============================================================================
# 6. CREATE DATASETS WITH VALIDATION
# =============================================================================

print("\n6. Creating datasets...")

# Create datasets
train_dataset = Dataset.from_dict({
    'text': X_train,
    'labels': y_train
})

val_dataset = Dataset.from_dict({
    'text': X_val,
    'labels': y_val
})

test_dataset = Dataset.from_dict({
    'text': X_test,
    'labels': y_test
})

# Create DatasetDict
datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print("Datasets created successfully")

# =============================================================================
# 7. MODEL AND TOKENIZER SETUP
# =============================================================================

print("\n7. Setting up model and tokenizer...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print("Tokenizer loaded")

# Setup quantization config with fallback
try:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
    use_quantization = True
    print("4-bit quantization enabled")
except Exception as e:
    print(f"Quantization not available: {e}")
    print("Loading model without quantization...")
    quantization_config = None
    use_quantization = False

# Load model for classification
model_kwargs = {
    "num_labels": 10,  # Number of categories
    "device_map": "auto",
    "trust_remote_code": True,
    "pad_token_id": tokenizer.pad_token_id  # Add pad_token_id to model config
}

if use_quantization and quantization_config is not None:
    model_kwargs["quantization_config"] = quantization_config

try:
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        **model_kwargs
    )
except Exception as e:
    print(f"Error loading model with quantization: {e}")
    print("Retrying without quantization...")
    # Fallback without quantization
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        num_labels=10,
        trust_remote_code=True,
        pad_token_id=tokenizer.pad_token_id  # Add pad_token_id here too
    )

print("Base model loaded")

# Set pad_token_id in model config if not already set
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id
    print(f"Set model pad_token_id to: {model.config.pad_token_id}")

# Apply LoRA with stronger regularization for small datasets
peft_config = LoraConfig(
    r=LORA_R,           # Smaller rank reduces parameters
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,  # Higher dropout
    bias="none",
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

model = get_peft_model(model, peft_config)

# Freeze more layers for small dataset
if len(X_train) < 1000:
    print("Small dataset detected. Applying additional regularization...")

    # Freeze embedding layers
    for name, param in model.named_parameters():
        if 'embed' in name.lower():
            param.requires_grad = False

    # Freeze some transformer layers (freeze bottom layers, train top layers)
    total_layers = len([n for n, p in model.named_parameters() if 'layers.' in n and 'weight' in n])
    layers_to_freeze = total_layers // 3  # Freeze bottom 1/3 of layers

    for name, param in model.named_parameters():
        if 'layers.' in name:
            layer_num = int(name.split('layers.')[1].split('.')[0])
            if layer_num < layers_to_freeze:
                param.requires_grad = False

model.print_trainable_parameters()

# =============================================================================
# 8. TOKENIZATION
# =============================================================================

print("\n8. Tokenizing data...")

def tokenize_function(examples):
    """Tokenize complaints for classification"""
    return tokenizer(
        examples['text'],
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors=None
    )

# Tokenize datasets
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']
)

print("Tokenization completed")

# =============================================================================
# 9. EVALUATION METRICS
# =============================================================================

def compute_metrics(eval_pred):
    """Compute comprehensive evaluation metrics"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Basic metrics
    accuracy = accuracy_score(labels, predictions)

    # Per-class metrics
    precision, recall, f1, support = precision_recall_fscore_support(
        labels, predictions, average=None, zero_division=0
    )

    # Macro and weighted averages
    f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(labels, predictions, average='weighted', zero_division=0)

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'precision_macro': np.mean(precision),
        'recall_macro': np.mean(recall)
    }

# =============================================================================
# 10. TRAINING SETUP
# =============================================================================

print("\n10. Setting up training...")

# Custom trainer with advanced regularization for small datasets
class AdvancedRegularizedTrainer(Trainer):
    def __init__(self, class_weights=None, mixup_alpha=0.2, label_smoothing=0.1, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights
        self.mixup_alpha = mixup_alpha
        self.label_smoothing = label_smoothing

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Apply class weights
        if self.class_weights is not None:
            weight_tensor = torch.tensor(list(self.class_weights.values()),
                                       dtype=torch.float32, device=labels.device)
        else:
            weight_tensor = None

        # Label smoothing for regularization
        if self.label_smoothing > 0:
            loss_fct = torch.nn.CrossEntropyLoss(
                weight=weight_tensor,
                label_smoothing=self.label_smoothing
            )
        else:
            loss_fct = torch.nn.CrossEntropyLoss(weight=weight_tensor)

        loss = loss_fct(logits, labels)

        # Add L2 regularization to LoRA parameters
        l2_reg = 0
        for name, param in model.named_parameters():
            if 'lora_' in name and param.requires_grad:
                l2_reg += torch.norm(param, p=2)

        loss = loss + 0.01 * l2_reg  # L2 regularization coefficient

        return (loss, outputs) if return_outputs else loss

# Enhanced training arguments for small datasets
# Using eval_strategy instead of evaluation_strategy for compatibility
training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "checkpoints"),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=10,
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
    report_to="none",
    dataloader_pin_memory=False,
    bf16=torch.cuda.is_available(),
    fp16=not torch.cuda.is_available(),
    optim="adamw_torch",
    max_grad_norm=1.0,
    dataloader_drop_last=True,
    run_name=f"nacc-classification-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
)

# Create trainer with advanced regularization
trainer = AdvancedRegularizedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use validation set
    compute_metrics=compute_metrics,
    class_weights=class_weight_dict,
    mixup_alpha=0.2 if len(X_train) < 1000 else 0.0,  # Enable mixup for small datasets
    label_smoothing=0.1,  # Label smoothing for regularization
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.001)
    ]
)

print("Trainer created successfully")

# =============================================================================
# 11. TRAINING
# =============================================================================

print("\n11. Starting training...")
print("=" * 50)

# Train the model
training_result = trainer.train()

print("Training completed!")
print(f"Final training loss: {training_result.training_loss:.4f}")

# =============================================================================
# 12. EVALUATION WITH FINAL TEST SET
# =============================================================================

print("\n12. Evaluating model on validation and test sets...")

# Evaluate on validation set (used during training)
val_results = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print("Validation Results:")
for key, value in val_results.items():
    if key.startswith('eval_'):
        print(f"  {key}: {value:.4f}")

# Final evaluation on hold-out test set
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("\nFinal Test Results:")
for key, value in test_results.items():
    if key.startswith('eval_'):
        print(f"  {key}: {value:.4f}")

# Detailed predictions on test set
test_predictions = trainer.predict(tokenized_datasets["test"])
y_pred = np.argmax(test_predictions.predictions, axis=1)
y_true = test_predictions.label_ids

# Classification report
report = classification_report(
    y_true, y_pred,
    target_names=[ENGLISH_NAMES[i] for i in range(10)],
    output_dict=True,
    zero_division=0
)

print("\nDetailed Test Set Classification Report:")
print(classification_report(
    y_true, y_pred,
    target_names=[ENGLISH_NAMES[i] for i in range(10)],
    zero_division=0
))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)

# =============================================================================
# 13. CROSS-VALIDATION FOR ROBUST EVALUATION (OPTIONAL)
# =============================================================================

if USE_CROSS_VALIDATION and len(X_train) < 1000:
    print("\n13. Performing cross-validation for robust evaluation...")

    # Use original training + validation data for CV
    X_cv = np.concatenate([X_train, X_val])
    y_cv = np.concatenate([y_train, y_val])

    skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=42)
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_cv, y_cv)):
        print(f"Training fold {fold+1}/{CV_FOLDS}...")

        # Create fold datasets
        X_fold_train, X_fold_val = X_cv[train_idx], X_cv[val_idx]
        y_fold_train, y_fold_val = y_cv[train_idx], y_cv[val_idx]

        # Create datasets for this fold
        fold_train_dataset = Dataset.from_dict({
            'text': X_fold_train,
            'labels': y_fold_train
        })

        fold_val_dataset = Dataset.from_dict({
            'text': X_fold_val,
            'labels': y_fold_val
        })

        # Tokenize
        fold_train_tokenized = fold_train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
        fold_val_tokenized = fold_val_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

        # Create new model for this fold (reset weights)
        fold_model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_ID,
            num_labels=10,
            trust_remote_code=True,
            pad_token_id=tokenizer.pad_token_id  # Add pad_token_id
        )
        # Set pad_token_id in model config if not already set
        if fold_model.config.pad_token_id is None:
            fold_model.config.pad_token_id = tokenizer.pad_token_id
        fold_model = get_peft_model(fold_model, peft_config)

        # Create trainer for this fold
        fold_training_args = TrainingArguments(
            output_dir=os.path.join(OUTPUT_DIR, f"cv_fold_{fold}"),
            num_train_epochs=NUM_EPOCHS,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            learning_rate=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY,
            eval_strategy="no",  # Changed from evaluation_strategy
            save_strategy="no",
            logging_steps=1000,
            report_to="none",
            bf16=torch.cuda.is_available(),
        )

        fold_trainer = AdvancedRegularizedTrainer(
            model=fold_model,
            args=fold_training_args,
            train_dataset=fold_train_tokenized,
            eval_dataset=fold_val_tokenized,
            compute_metrics=compute_metrics,
            class_weights=class_weight_dict
        )

        # Train fold
        fold_trainer.train()

        # Evaluate fold
        fold_results = fold_trainer.evaluate()
        cv_scores.append(fold_results['eval_f1_macro'])

        print(f"Fold {fold+1} F1-Macro: {fold_results['eval_f1_macro']:.4f}")

        # Clean up memory
        del fold_model, fold_trainer
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

    print(f"\nCross-validation results:")
    print(f"Mean F1-Macro: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
    print(f"Individual folds: {cv_scores}")

    # Add CV results to final results
    cv_results = {
        "cv_f1_macro_mean": np.mean(cv_scores),
        "cv_f1_macro_std": np.std(cv_scores),
        "cv_scores": cv_scores
    }
else:
    print("\n13. Skipping cross-validation")
    cv_results = {}

# =============================================================================
# 14. VISUALIZATION AND SAVING RESULTS
# =============================================================================

print("\n14. Saving results and visualizations...")

# Save comprehensive results
results = {
    "training_args": training_args.to_dict(),
    "model_config": {
        "base_model": MODEL_ID,
        "max_length": MAX_LENGTH,
        "lora_config": {
            "r": LORA_R,
            "alpha": LORA_ALPHA,
            "dropout": LORA_DROPOUT
        },
        "regularization": {
            "weight_decay": WEIGHT_DECAY,
            "label_smoothing": 0.1,
            "gradient_clipping": 1.0,
            "layer_freezing": len(X_train) < 1000
        }
    },
    "dataset_info": {
        "total_samples": len(X),
        "train_samples": len(X_train),
        "val_samples": len(X_val),
        "test_samples": len(X_test),
        "augmentation_applied": USE_AUGMENTATION,
        "augmentation_ratio": AUG_RATIO if USE_AUGMENTATION else 0
    },
    "training_results": {
        "final_loss": training_result.training_loss,
        "train_steps": training_result.global_step
    },
    "validation_results": val_results,
    "test_results": test_results,
    "classification_report": report,
    "confusion_matrix": cm.tolist(),
    "class_distribution": {
        "train": train_dist.to_dict(),
        "validation": val_dist.to_dict(),
        "test": test_dist.to_dict()
    },
    "class_weights": class_weight_dict,
    "cross_validation": cv_results,
    "timestamp": datetime.now().isoformat()
}

# Save results
results_file = os.path.join(OUTPUT_DIR, "training_results.json")
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Results saved to: {results_file}")

# Create confusion matrix plot
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=[ENGLISH_NAMES[i] for i in range(10)],
    yticklabels=[ENGLISH_NAMES[i] for i in range(10)]
)
plt.title('Confusion Matrix - NACC Complaint Classification')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=300, bbox_inches='tight')
plt.close()

# Create class distribution plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Train distribution
train_dist.plot(kind='bar', ax=ax1, color='skyblue')
ax1.set_title('Training Set Class Distribution')
ax1.set_xlabel('Category ID')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=0)

# Test distribution
test_dist.plot(kind='bar', ax=ax2, color='lightcoral')
ax2.set_title('Test Set Class Distribution')
ax2.set_xlabel('Category ID')
ax2.set_ylabel('Count')
ax2.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "class_distribution.png"), dpi=300, bbox_inches='tight')
plt.close()

print(f"Visualizations saved to: {OUTPUT_DIR}")

# =============================================================================
# 15. SAVE MODEL
# =============================================================================

print("\n15. Saving trained model...")

# Save model and tokenizer
model_save_path = os.path.join(OUTPUT_DIR, "model")
tokenizer_save_path = os.path.join(OUTPUT_DIR, "tokenizer")

trainer.save_model(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved to: {model_save_path}")
print(f"Tokenizer saved to: {tokenizer_save_path}")

# =============================================================================
# 16. SAVE TEST SET WITH PREDICTIONS
# =============================================================================

print("\n16. Saving test set with predictions for future evaluation...")

# Create test set with predictions
test_df = pd.DataFrame({
    'complaint': X_test,
    'true_category': y_true,
    'predicted_category': y_pred,
    'true_category_thai': [NUMERIC_TO_CATEGORY[cat] for cat in y_true],
    'predicted_category_thai': [NUMERIC_TO_CATEGORY[cat] for cat in y_pred],
    'true_category_english': [ENGLISH_NAMES[cat] for cat in y_true],
    'predicted_category_english': [ENGLISH_NAMES[cat] for cat in y_pred],
    'correct_prediction': y_true == y_pred
})

test_csv_path = os.path.join(OUTPUT_DIR, "test_set_with_predictions.csv")
test_df.to_csv(test_csv_path, index=False, encoding='utf-8')

print(f"Test set with predictions saved to: {test_csv_path}")

# =============================================================================
# 17. FINAL SUMMARY
# =============================================================================

print("\n" + "=" * 80)
print("TRAINING COMPLETED SUCCESSFULLY!")
print("=" * 80)

print(f"\nKey Results:")
print(f"  Accuracy: {test_results['eval_accuracy']:.3f}")
print(f"  Macro F1: {test_results['eval_f1_macro']:.3f}")
print(f"  Weighted F1: {test_results['eval_f1_weighted']:.3f}")

print(f"\nFiles saved to: {OUTPUT_DIR}")
print(f"  - Model: {model_save_path}")
print(f"  - Tokenizer: {tokenizer_save_path}")
print(f"  - Results: {results_file}")
print(f"  - Test set: {test_csv_path}")
print(f"  - Confusion matrix: confusion_matrix.png")
print(f"  - Class distribution: class_distribution.png")

print(f"\nModel ready for deployment and evaluation!")

# =============================================================================
# 18. INFERENCE FUNCTION (FOR TESTING)
# =============================================================================

def predict_complaint(text, model=model, tokenizer=tokenizer):
    """
    Predict the category of a complaint text

    Args:
        text: Input complaint text
        model: Trained model
        tokenizer: Tokenizer

    Returns:
        Dictionary with prediction results
    """
    # Tokenize input
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    # Move to device
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=-1)

    # Get top prediction
    pred_idx = torch.argmax(probs, dim=-1).item()
    confidence = probs[0, pred_idx].item()

    # Get top 3 predictions
    top3_probs, top3_indices = torch.topk(probs[0], k=min(3, len(probs[0])))

    top3_predictions = []
    for prob, idx in zip(top3_probs, top3_indices):
        top3_predictions.append({
            "category_id": idx.item(),
            "category_thai": NUMERIC_TO_CATEGORY[idx.item()],
            "category_english": ENGLISH_NAMES[idx.item()],
            "confidence": prob.item()
        })

    return {
        "predicted_category_id": pred_idx,
        "predicted_category_thai": NUMERIC_TO_CATEGORY[pred_idx],
        "predicted_category_english": ENGLISH_NAMES[pred_idx],
        "confidence": confidence,
        "top3_predictions": top3_predictions
    }

# Test the inference function
print("\n" + "=" * 80)
print("TESTING INFERENCE FUNCTION")
print("=" * 80)

# Example test
test_text = "มีการเรียกรับเงินสินบนในการอนุมัติโครงการก่อสร้าง"
print(f"\nTest text: {test_text}")

result = predict_complaint(test_text)
print(f"\nPrediction: {result['predicted_category_thai']}")
print(f"English: {result['predicted_category_english']}")
print(f"Confidence: {result['confidence']:.2%}")

print("\nTop 3 predictions:")
for i, pred in enumerate(result['top3_predictions'], 1):
    print(f"  {i}. {pred['category_english']} ({pred['confidence']:.2%})")

print("\n" + "=" * 80)
print("ALL PROCESSES COMPLETED SUCCESSFULLY!")
print("=" * 80)

Installing required packages...


KeyboardInterrupt: 

In [1]:
# Improved Fine-tuning Code for NACC Complaint Classification

# Install required packages
import subprocess
import sys

def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install required packages
print("Installing required packages...")
try:
    install_package("bitsandbytes")
    install_package("peft")
    install_package("accelerate")
    install_package("transformers>=4.35.0")
    install_package("datasets")
    install_package("scikit-learn")
    install_package("matplotlib")
    install_package("seaborn")
    print("All packages installed successfully!")
except Exception as e:
    print(f"Error installing packages: {e}")

import os
import json
import pandas as pd
import numpy as np
import torch
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Paths
DATASET_PATH = "/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/data/Trainset.csv"
OUTPUT_DIR = "/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/finetune_new"

# Model configuration
MODEL_ID = "scb10x/llama3.2-typhoon2-3b-instruct"
MAX_LENGTH = 512  # Shorter sequences for efficiency
BATCH_SIZE = 8    # Larger batch size
GRADIENT_ACCUMULATION = 2
NUM_EPOCHS = 5    # More epochs for gradual learning
LEARNING_RATE = 1e-5  # Even lower learning rate
LORA_R = 8        # Smaller rank to reduce overfitting
LORA_ALPHA = 16   # Adjusted alpha
LORA_DROPOUT = 0.2  # Higher dropout
WEIGHT_DECAY = 0.1  # Strong regularization

# Data augmentation parameters
AUG_RATIO = 0.3   # 30% of data will be augmented
USE_AUGMENTATION = True
USE_CROSS_VALIDATION = True
CV_FOLDS = 5

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("=" * 80)
print("IMPROVED NACC COMPLAINT CLASSIFICATION FINE-TUNING")
print("=" * 80)

# =============================================================================
# 1. CATEGORY MAPPING
# =============================================================================

print("\n1. Setting up category mapping...")

CATEGORY_MAPPING = {
    "ปฏิบัติหรือละเว้นการปฏิบัติหน้าที่โดยมิชอบ": 0,
    "ทุจริตในการจัดทำงบประมาณ/โครงการ/เบิกจ่ายเงินในโครงการเป็นเท็จ": 1,
    "จัดซื้อจัดจ้าง": 2,
    "ออกเอกสารสิทธิที่ดิน": 3,
    "ยักยอก/เบียดบังเงินหรือทรัพย์สินของราชการ": 4,
    "การบริหารงานบุคคล (การบรรจุ/แต่งตั้ง/เลื่อนตำแหน่ง/โยกย้าย/ลงโทษวินัย)": 5,
    "ร่ำรวยผิดปกติ": 6,
    "เรียกรับสินบน": 7,
    "การขัดกันระหว่างประโยชน์ส่วนบุคคลกับประโยชน์ส่วนรวม": 8,
    "ก่าเกื้นจริยธรรม": 9
}

NUMERIC_TO_CATEGORY = {v: k for k, v in CATEGORY_MAPPING.items()}

ENGLISH_NAMES = {
    0: "Abuse of Power",
    1: "Budget/Project Fraud",
    2: "Procurement Fraud",
    3: "Fraudulent Land Title",
    4: "Embezzlement",
    5: "Personnel Misconduct",
    6: "Unusual Wealth",
    7: "Bribery",
    8: "Conflict of Interest",
    9: "Ethical Misconduct"
}

# Save mapping
mapping_file = os.path.join(OUTPUT_DIR, "category_mapping.json")
with open(mapping_file, 'w', encoding='utf-8') as f:
    json.dump({
        "thai_to_numeric": CATEGORY_MAPPING,
        "numeric_to_thai": NUMERIC_TO_CATEGORY,
        "english_names": ENGLISH_NAMES
    }, f, ensure_ascii=False, indent=2)

print(f"Category mapping saved to: {mapping_file}")

# =============================================================================
# 2. DATA LOADING AND PREPROCESSING
# =============================================================================

print("\n2. Loading and preprocessing data...")

# Load data
try:
    df = pd.read_csv(DATASET_PATH, encoding='utf-8')
    print(f"Loaded dataset with {len(df)} samples")
    print(f"Columns: {list(df.columns)}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Check data structure
print("\nData structure:")
print(df.head())
print(f"\nDataset shape: {df.shape}")

# Map categories to numeric
df['category_numeric'] = df['category'].map(CATEGORY_MAPPING)

# Check for unmapped categories
unmapped = df[df['category_numeric'].isna()]
if len(unmapped) > 0:
    print(f"Warning: {len(unmapped)} unmapped categories found:")
    print(unmapped['category'].unique())
    df = df.dropna(subset=['category_numeric'])

# Convert to int
df['category_numeric'] = df['category_numeric'].astype(int)

print(f"Final dataset size: {len(df)}")

# Extract features and labels
X = df['complaint'].values
y = df['category_numeric'].values

# Check class distribution
class_dist = df['category_numeric'].value_counts().sort_index()
print("\nClass distribution:")
for cat_id, count in class_dist.items():
    english_name = ENGLISH_NAMES[cat_id]
    percentage = (count / len(df)) * 100
    print(f"{cat_id}: {english_name} - {count} samples ({percentage:.1f}%)")

# =============================================================================
# 3. DATA AUGMENTATION FOR SMALL DATASET
# =============================================================================

print("\n3. Implementing data augmentation...")

def simple_back_translation_augment(text):
    """Simple augmentation through word shuffling and synonym replacement"""
    import random

    # Simple word shuffling (keeping sentence structure)
    sentences = text.split('.')
    augmented_sentences = []

    for sentence in sentences:
        words = sentence.strip().split()
        if len(words) > 3:
            # Shuffle middle words occasionally
            if random.random() < 0.3:
                middle = words[1:-1]
                random.shuffle(middle)
                words = [words[0]] + middle + [words[-1]]
        augmented_sentences.append(' '.join(words))

    return '.'.join(augmented_sentences)

def character_level_augment(text):
    """Character-level augmentation for Thai text"""
    import random

    chars = list(text)
    if len(chars) > 10:
        # Random character swapping (very conservative)
        if random.random() < 0.1:
            idx1, idx2 = random.sample(range(1, len(chars)-1), 2)
            chars[idx1], chars[idx2] = chars[idx2], chars[idx1]

    return ''.join(chars)

def augment_text(text, method='shuffle'):
    """Apply text augmentation"""
    if method == 'shuffle':
        return simple_back_translation_augment(text)
    elif method == 'char':
        return character_level_augment(text)
    else:
        return text

def create_augmented_data(X, y, augmentation_ratio=0.3):
    """Create augmented dataset"""
    print(f"Creating augmented data with ratio: {augmentation_ratio}")

    # Calculate samples to augment per class
    unique_classes, class_counts = np.unique(y, return_counts=True)

    X_aug = []
    y_aug = []

    for class_id in unique_classes:
        class_indices = np.where(y == class_id)[0]
        class_texts = X[class_indices]

        # Calculate how many samples to augment for this class
        current_count = len(class_indices)
        target_augment = int(current_count * augmentation_ratio)

        # Randomly select samples to augment
        aug_indices = np.random.choice(len(class_texts), size=target_augment, replace=True)

        for idx in aug_indices:
            original_text = class_texts[idx]

            # Apply different augmentation methods
            aug_method = np.random.choice(['shuffle', 'char'], p=[0.7, 0.3])
            augmented_text = augment_text(original_text, method=aug_method)

            # Only add if augmentation actually changed the text
            if augmented_text != original_text:
                X_aug.append(augmented_text)
                y_aug.append(class_id)

    print(f"Generated {len(X_aug)} augmented samples")
    return np.array(X_aug), np.array(y_aug)

# Apply data augmentation if enabled
if USE_AUGMENTATION and len(df) < 1000:  # Only for small datasets
    print("Dataset is small, applying data augmentation...")
    X_aug, y_aug = create_augmented_data(X, y, AUG_RATIO)

    # Combine original and augmented data
    X_combined = np.concatenate([X, X_aug])
    y_combined = np.concatenate([y, y_aug])

    print(f"Original dataset: {len(X)} samples")
    print(f"Augmented dataset: {len(X_combined)} samples")

    # Use combined data for training
    X, y = X_combined, y_combined
else:
    print("Skipping augmentation (dataset size sufficient or disabled)")

# =============================================================================
# 4. STRATIFIED TRAIN-TEST SPLIT WITH CROSS-VALIDATION
# =============================================================================

print("\n4. Creating stratified splits...")

# First, create a hold-out test set (20%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Then split remaining data for train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2,  # 20% of remaining 80% = 16% of total
    stratify=y_temp,
    random_state=42
)

print(f"Train set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Validation set: {len(X_val)} samples ({len(X_val)/len(X)*100:.1f}%)")
print(f"Test set: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")

# Check distributions
train_dist = pd.Series(y_train).value_counts().sort_index()
val_dist = pd.Series(y_val).value_counts().sort_index()
test_dist = pd.Series(y_test).value_counts().sort_index()

print("\nClass distributions:")
for cat_id in range(10):
    train_count = train_dist.get(cat_id, 0)
    val_count = val_dist.get(cat_id, 0)
    test_count = test_dist.get(cat_id, 0)
    print(f"  Class {cat_id}: Train={train_count}, Val={val_count}, Test={test_count}")

# =============================================================================
# 5. COMPUTE CLASS WEIGHTS FOR IMBALANCED DATA
# =============================================================================

print("\n5. Computing class weights...")

class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
print("Class weights:")
for cat_id, weight in class_weight_dict.items():
    if cat_id < len(ENGLISH_NAMES):
        print(f"  {cat_id} ({ENGLISH_NAMES[cat_id]}): {weight:.3f}")

# =============================================================================
# 6. CREATE DATASETS WITH VALIDATION
# =============================================================================

print("\n6. Creating datasets...")

# Create datasets
train_dataset = Dataset.from_dict({
    'text': X_train,
    'labels': y_train
})

val_dataset = Dataset.from_dict({
    'text': X_val,
    'labels': y_val
})

test_dataset = Dataset.from_dict({
    'text': X_test,
    'labels': y_test
})

# Create DatasetDict
datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print("Datasets created successfully")

# =============================================================================
# 7. MODEL AND TOKENIZER SETUP
# =============================================================================

print("\n7. Setting up model and tokenizer...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Fix padding token properly for LLaMA-based models
if tokenizer.pad_token is None:
    # For LLaMA models, we need to add a new pad token, not use eos_token
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Added [PAD] token to tokenizer")

print(f"Tokenizer loaded - pad_token: {tokenizer.pad_token}, pad_token_id: {tokenizer.pad_token_id}")

# Setup quantization config with fallback
try:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )
    use_quantization = True
    print("4-bit quantization enabled")
except Exception as e:
    print(f"Quantization not available: {e}")
    print("Loading model without quantization...")
    quantization_config = None
    use_quantization = False

# Load model for classification
model_kwargs = {
    "num_labels": 10,  # Number of categories
    "device_map": "auto",
    "trust_remote_code": True
}

if use_quantization and quantization_config is not None:
    model_kwargs["quantization_config"] = quantization_config

try:
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        **model_kwargs
    )
except Exception as e:
    print(f"Error loading model with quantization: {e}")
    print("Retrying without quantization...")
    # Fallback without quantization
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID,
        num_labels=10,
        trust_remote_code=True
    )

print("Base model loaded")

# CRITICAL: Resize token embeddings if we added new tokens
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))
    print(f"Resized model embeddings to {len(tokenizer)} tokens")

# Set pad_token_id in model config
model.config.pad_token_id = tokenizer.pad_token_id
print(f"Set model pad_token_id to: {model.config.pad_token_id}")

# Apply LoRA with stronger regularization for small datasets
peft_config = LoraConfig(
    r=LORA_R,           # Smaller rank reduces parameters
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,  # Higher dropout
    bias="none",
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

model = get_peft_model(model, peft_config)

# Freeze more layers for small dataset
if len(X_train) < 1000:
    print("Small dataset detected. Applying additional regularization...")

    # Freeze embedding layers
    for name, param in model.named_parameters():
        if 'embed' in name.lower():
            param.requires_grad = False

    # Freeze some transformer layers (freeze bottom layers, train top layers)
    total_layers = len([n for n, p in model.named_parameters() if 'layers.' in n and 'weight' in n])
    layers_to_freeze = total_layers // 3  # Freeze bottom 1/3 of layers

    for name, param in model.named_parameters():
        if 'layers.' in name:
            layer_num = int(name.split('layers.')[1].split('.')[0])
            if layer_num < layers_to_freeze:
                param.requires_grad = False

model.print_trainable_parameters()

# =============================================================================
# 8. TOKENIZATION
# =============================================================================

print("\n8. Tokenizing data...")

def tokenize_function(examples):
    """Tokenize complaints for classification"""
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',  # Use max_length padding instead of True
        max_length=MAX_LENGTH,
        return_tensors=None
    )

# Tokenize datasets
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']
)

print("Tokenization completed")

# =============================================================================
# 9. EVALUATION METRICS
# =============================================================================

def compute_metrics(eval_pred):
    """Compute comprehensive evaluation metrics"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Basic metrics
    accuracy = accuracy_score(labels, predictions)

    # Per-class metrics
    precision, recall, f1, support = precision_recall_fscore_support(
        labels, predictions, average=None, zero_division=0
    )

    # Macro and weighted averages
    f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(labels, predictions, average='weighted', zero_division=0)

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'precision_macro': np.mean(precision),
        'recall_macro': np.mean(recall)
    }

# =============================================================================
# 10. TRAINING SETUP
# =============================================================================

print("\n10. Setting up training...")

# Custom trainer with advanced regularization for small datasets
class AdvancedRegularizedTrainer(Trainer):
    def __init__(self, class_weights=None, mixup_alpha=0.2, label_smoothing=0.1, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights
        self.mixup_alpha = mixup_alpha
        self.label_smoothing = label_smoothing

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Apply class weights
        if self.class_weights is not None:
            weight_tensor = torch.tensor(list(self.class_weights.values()),
                                       dtype=torch.float32, device=labels.device)
        else:
            weight_tensor = None

        # Label smoothing for regularization
        if self.label_smoothing > 0:
            loss_fct = torch.nn.CrossEntropyLoss(
                weight=weight_tensor,
                label_smoothing=self.label_smoothing
            )
        else:
            loss_fct = torch.nn.CrossEntropyLoss(weight=weight_tensor)

        loss = loss_fct(logits, labels)

        # Add L2 regularization to LoRA parameters
        l2_reg = 0
        for name, param in model.named_parameters():
            if 'lora_' in name and param.requires_grad:
                l2_reg += torch.norm(param, p=2)

        loss = loss + 0.01 * l2_reg  # L2 regularization coefficient

        return (loss, outputs) if return_outputs else loss

# Enhanced training arguments for small datasets
# Using eval_strategy instead of evaluation_strategy for compatibility
training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "checkpoints"),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=10,
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
    report_to="none",
    dataloader_pin_memory=False,
    bf16=torch.cuda.is_available(),
    fp16=not torch.cuda.is_available(),
    optim="adamw_torch",
    max_grad_norm=1.0,
    dataloader_drop_last=True,
    run_name=f"nacc-classification-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
)

# Create trainer with advanced regularization
trainer = AdvancedRegularizedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # Use validation set
    compute_metrics=compute_metrics,
    class_weights=class_weight_dict,
    mixup_alpha=0.2 if len(X_train) < 1000 else 0.0,  # Enable mixup for small datasets
    label_smoothing=0.1,  # Label smoothing for regularization
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.001)
    ]
)

print("Trainer created successfully")

# =============================================================================
# 11. TRAINING
# =============================================================================

print("\n11. Starting training...")
print("=" * 50)

# Train the model
training_result = trainer.train()

print("Training completed!")
print(f"Final training loss: {training_result.training_loss:.4f}")

# =============================================================================
# 12. EVALUATION WITH FINAL TEST SET
# =============================================================================

print("\n12. Evaluating model on validation and test sets...")

# Evaluate on validation set (used during training)
val_results = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print("Validation Results:")
for key, value in val_results.items():
    if key.startswith('eval_'):
        print(f"  {key}: {value:.4f}")

# Final evaluation on hold-out test set
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("\nFinal Test Results:")
for key, value in test_results.items():
    if key.startswith('eval_'):
        print(f"  {key}: {value:.4f}")

# Detailed predictions on test set
test_predictions = trainer.predict(tokenized_datasets["test"])
y_pred = np.argmax(test_predictions.predictions, axis=1)
y_true = test_predictions.label_ids

# Classification report
report = classification_report(
    y_true, y_pred,
    target_names=[ENGLISH_NAMES[i] for i in range(10)],
    output_dict=True,
    zero_division=0
)

print("\nDetailed Test Set Classification Report:")
print(classification_report(
    y_true, y_pred,
    target_names=[ENGLISH_NAMES[i] for i in range(10)],
    zero_division=0
))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)

# =============================================================================
# 13. CROSS-VALIDATION FOR ROBUST EVALUATION (OPTIONAL)
# =============================================================================

if USE_CROSS_VALIDATION and len(X_train) < 1000:
    print("\n13. Performing cross-validation for robust evaluation...")

    # Use original training + validation data for CV
    X_cv = np.concatenate([X_train, X_val])
    y_cv = np.concatenate([y_train, y_val])

    skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=42)
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_cv, y_cv)):
        print(f"Training fold {fold+1}/{CV_FOLDS}...")

        # Create fold datasets
        X_fold_train, X_fold_val = X_cv[train_idx], X_cv[val_idx]
        y_fold_train, y_fold_val = y_cv[train_idx], y_cv[val_idx]

        # Create datasets for this fold
        fold_train_dataset = Dataset.from_dict({
            'text': X_fold_train,
            'labels': y_fold_train
        })

        fold_val_dataset = Dataset.from_dict({
            'text': X_fold_val,
            'labels': y_fold_val
        })

        # Tokenize
        fold_train_tokenized = fold_train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
        fold_val_tokenized = fold_val_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

        # Create new model for this fold (reset weights)
        fold_model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_ID,
            num_labels=10,
            trust_remote_code=True
        )
        # Resize token embeddings if needed
        if len(tokenizer) > fold_model.config.vocab_size:
            fold_model.resize_token_embeddings(len(tokenizer))
        # Set pad_token_id
        fold_model.config.pad_token_id = tokenizer.pad_token_id
        fold_model = get_peft_model(fold_model, peft_config)

        # Create trainer for this fold
        fold_training_args = TrainingArguments(
            output_dir=os.path.join(OUTPUT_DIR, f"cv_fold_{fold}"),
            num_train_epochs=NUM_EPOCHS,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            learning_rate=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY,
            eval_strategy="no",  # Changed from evaluation_strategy
            save_strategy="no",
            logging_steps=1000,
            report_to="none",
            bf16=torch.cuda.is_available(),
        )

        fold_trainer = AdvancedRegularizedTrainer(
            model=fold_model,
            args=fold_training_args,
            train_dataset=fold_train_tokenized,
            eval_dataset=fold_val_tokenized,
            compute_metrics=compute_metrics,
            class_weights=class_weight_dict
        )

        # Train fold
        fold_trainer.train()

        # Evaluate fold
        fold_results = fold_trainer.evaluate()
        cv_scores.append(fold_results['eval_f1_macro'])

        print(f"Fold {fold+1} F1-Macro: {fold_results['eval_f1_macro']:.4f}")

        # Clean up memory
        del fold_model, fold_trainer
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

    print(f"\nCross-validation results:")
    print(f"Mean F1-Macro: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
    print(f"Individual folds: {cv_scores}")

    # Add CV results to final results
    cv_results = {
        "cv_f1_macro_mean": np.mean(cv_scores),
        "cv_f1_macro_std": np.std(cv_scores),
        "cv_scores": cv_scores
    }
else:
    print("\n13. Skipping cross-validation")
    cv_results = {}

# =============================================================================
# 14. VISUALIZATION AND SAVING RESULTS
# =============================================================================

print("\n14. Saving results and visualizations...")

# Save comprehensive results
results = {
    "training_args": training_args.to_dict(),
    "model_config": {
        "base_model": MODEL_ID,
        "max_length": MAX_LENGTH,
        "lora_config": {
            "r": LORA_R,
            "alpha": LORA_ALPHA,
            "dropout": LORA_DROPOUT
        },
        "regularization": {
            "weight_decay": WEIGHT_DECAY,
            "label_smoothing": 0.1,
            "gradient_clipping": 1.0,
            "layer_freezing": len(X_train) < 1000
        }
    },
    "dataset_info": {
        "total_samples": len(X),
        "train_samples": len(X_train),
        "val_samples": len(X_val),
        "test_samples": len(X_test),
        "augmentation_applied": USE_AUGMENTATION,
        "augmentation_ratio": AUG_RATIO if USE_AUGMENTATION else 0
    },
    "training_results": {
        "final_loss": training_result.training_loss,
        "train_steps": training_result.global_step
    },
    "validation_results": val_results,
    "test_results": test_results,
    "classification_report": report,
    "confusion_matrix": cm.tolist(),
    "class_distribution": {
        "train": train_dist.to_dict(),
        "validation": val_dist.to_dict(),
        "test": test_dist.to_dict()
    },
    "class_weights": class_weight_dict,
    "cross_validation": cv_results,
    "timestamp": datetime.now().isoformat()
}

# Save results
results_file = os.path.join(OUTPUT_DIR, "training_results.json")
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Results saved to: {results_file}")

# Create confusion matrix plot
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=[ENGLISH_NAMES[i] for i in range(10)],
    yticklabels=[ENGLISH_NAMES[i] for i in range(10)]
)
plt.title('Confusion Matrix - NACC Complaint Classification')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=300, bbox_inches='tight')
plt.close()

# Create class distribution plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Train distribution
train_dist.plot(kind='bar', ax=ax1, color='skyblue')
ax1.set_title('Training Set Class Distribution')
ax1.set_xlabel('Category ID')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=0)

# Test distribution
test_dist.plot(kind='bar', ax=ax2, color='lightcoral')
ax2.set_title('Test Set Class Distribution')
ax2.set_xlabel('Category ID')
ax2.set_ylabel('Count')
ax2.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "class_distribution.png"), dpi=300, bbox_inches='tight')
plt.close()

print(f"Visualizations saved to: {OUTPUT_DIR}")

# =============================================================================
# 15. SAVE MODEL
# =============================================================================

print("\n15. Saving trained model...")

# Save model and tokenizer
model_save_path = os.path.join(OUTPUT_DIR, "model")
tokenizer_save_path = os.path.join(OUTPUT_DIR, "tokenizer")

trainer.save_model(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved to: {model_save_path}")
print(f"Tokenizer saved to: {tokenizer_save_path}")

# =============================================================================
# 16. SAVE TEST SET WITH PREDICTIONS
# =============================================================================

print("\n16. Saving test set with predictions for future evaluation...")

# Create test set with predictions
test_df = pd.DataFrame({
    'complaint': X_test,
    'true_category': y_true,
    'predicted_category': y_pred,
    'true_category_thai': [NUMERIC_TO_CATEGORY[cat] for cat in y_true],
    'predicted_category_thai': [NUMERIC_TO_CATEGORY[cat] for cat in y_pred],
    'true_category_english': [ENGLISH_NAMES[cat] for cat in y_true],
    'predicted_category_english': [ENGLISH_NAMES[cat] for cat in y_pred],
    'correct_prediction': y_true == y_pred
})

test_csv_path = os.path.join(OUTPUT_DIR, "test_set_with_predictions.csv")
test_df.to_csv(test_csv_path, index=False, encoding='utf-8')

print(f"Test set with predictions saved to: {test_csv_path}")

# =============================================================================
# 17. FINAL SUMMARY
# =============================================================================

print("\n" + "=" * 80)
print("TRAINING COMPLETED SUCCESSFULLY!")
print("=" * 80)

print(f"\nKey Results:")
print(f"  Accuracy: {test_results['eval_accuracy']:.3f}")
print(f"  Macro F1: {test_results['eval_f1_macro']:.3f}")
print(f"  Weighted F1: {test_results['eval_f1_weighted']:.3f}")

print(f"\nFiles saved to: {OUTPUT_DIR}")
print(f"  - Model: {model_save_path}")
print(f"  - Tokenizer: {tokenizer_save_path}")
print(f"  - Results: {results_file}")
print(f"  - Test set: {test_csv_path}")
print(f"  - Confusion matrix: confusion_matrix.png")
print(f"  - Class distribution: class_distribution.png")

print(f"\nModel ready for deployment and evaluation!")

# =============================================================================
# 18. INFERENCE FUNCTION (FOR TESTING)
# =============================================================================

def predict_complaint(text, model=model, tokenizer=tokenizer):
    """
    Predict the category of a complaint text

    Args:
        text: Input complaint text
        model: Trained model
        tokenizer: Tokenizer

    Returns:
        Dictionary with prediction results
    """
    # Tokenize input
    inputs = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    # Move to device
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=-1)

    # Get top prediction
    pred_idx = torch.argmax(probs, dim=-1).item()
    confidence = probs[0, pred_idx].item()

    # Get top 3 predictions
    top3_probs, top3_indices = torch.topk(probs[0], k=min(3, len(probs[0])))

    top3_predictions = []
    for prob, idx in zip(top3_probs, top3_indices):
        top3_predictions.append({
            "category_id": idx.item(),
            "category_thai": NUMERIC_TO_CATEGORY[idx.item()],
            "category_english": ENGLISH_NAMES[idx.item()],
            "confidence": prob.item()
        })

    return {
        "predicted_category_id": pred_idx,
        "predicted_category_thai": NUMERIC_TO_CATEGORY[pred_idx],
        "predicted_category_english": ENGLISH_NAMES[pred_idx],
        "confidence": confidence,
        "top3_predictions": top3_predictions
    }

# Test the inference function
print("\n" + "=" * 80)
print("TESTING INFERENCE FUNCTION")
print("=" * 80)

# Example test
test_text = "มีการเรียกรับเงินสินบนในการอนุมัติโครงการก่อสร้าง"
print(f"\nTest text: {test_text}")

result = predict_complaint(test_text)
print(f"\nPrediction: {result['predicted_category_thai']}")
print(f"English: {result['predicted_category_english']}")
print(f"Confidence: {result['confidence']:.2%}")

print("\nTop 3 predictions:")
for i, pred in enumerate(result['top3_predictions'], 1):
    print(f"  {i}. {pred['category_english']} ({pred['confidence']:.2%})")

print("\n" + "=" * 80)
print("ALL PROCESSES COMPLETED SUCCESSFULLY!")
print("=" * 80)

Installing required packages...
All packages installed successfully!
IMPROVED NACC COMPLAINT CLASSIFICATION FINE-TUNING

1. Setting up category mapping...
Category mapping saved to: /content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/finetune_new/category_mapping.json

2. Loading and preprocessing data...
Error loading dataset: [Errno 2] No such file or directory: '/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/data/Trainset.csv'


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Government/KPI/NACC AI Project/[00] NACC LLM/Paper Revision/data/Trainset.csv'