# MoE v2 - Merged Categories
7 categories: safe, harassment, harmful_content, sexual, hate_speech, hindi_abuse, self_harm

In [None]:
!pip install transformers datasets accelerate scikit-learn -q

In [None]:
from google.colab import files
import os

# Ask the user to upload the training file that the load cell opens by name.
EXPECTED_UPLOAD = 'router_train_v2.jsonl'
print(f'Upload {EXPECTED_UPLOAD}')
uploaded = files.upload()

# files.upload() returns a mapping keyed by the stored file name. If the name
# differs from what the load cell opens (for example a renamed duplicate),
# say so now instead of silently training on a stale or missing file.
if EXPECTED_UPLOAD not in uploaded:
    print(f'WARNING: expected {EXPECTED_UPLOAD!r} but received {sorted(uploaded)}; '
          f'rename the file or re-upload before running the next cells.')

In [None]:
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# Report which accelerator (if any) this runtime exposes.
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
else:
    device_name = "CPU"
print(f'GPU: {device_name}')

In [None]:
# Load router data: one JSON object per line, each with 'text' and 'label'.
# Decode explicitly as UTF-8 so non-ASCII text is read the same way on every
# platform, and skip blank lines (e.g. a trailing newline), which json.loads
# would otherwise reject.
with open('router_train_v2.jsonl', 'r', encoding='utf-8') as f:
    router_data = [json.loads(line) for line in f if line.strip()]

print(f'Total: {len(router_data)}')

# Categories: print per-label example counts, most frequent first.
from collections import Counter
cats = Counter(d['label'] for d in router_data)
for cat, count in cats.most_common():
    print(f'  {cat}: {count}')

In [None]:
# Prepare data
# Sorting the label names keeps the name <-> id mapping the same across runs.
categories = sorted({row['label'] for row in router_data})
label2id = {name: idx for idx, name in enumerate(categories)}
id2label = dict(enumerate(categories))

print(f'Categories ({len(categories)}): {categories}')

# Parallel lists: raw text and the integer id of its label.
texts = [row['text'] for row in router_data]
labels = [label2id[name] for name in (row['label'] for row in router_data)]

# Stratified 90/10 split with a fixed seed.
(
    train_texts,
    val_texts,
    train_labels,
    val_labels,
) = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=42,
    stratify=labels,
)
print(f'Train: {len(train_texts)}, Val: {len(val_texts)}')

In [None]:
# Load model
MODEL_NAME = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Size the classification head to the label set and embed the name <-> id
# mappings in the model config.
label_config = {
    'num_labels': len(categories),
    'id2label': id2label,
    'label2id': label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **label_config)

In [None]:
# Tokenize
def tokenize(texts, labels):
    """Encode texts with the module-level tokenizer and pair them with labels.

    Texts are truncated to at most 256 tokens with padding enabled. Returns a
    ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels`` columns.
    """
    encoded = tokenizer(texts, max_length=256, truncation=True, padding=True)
    columns = {key: encoded[key] for key in ('input_ids', 'attention_mask')}
    columns['labels'] = labels
    return Dataset.from_dict(columns)

train_dataset, val_dataset = (
    tokenize(train_texts, train_labels),
    tokenize(val_texts, val_labels),
)

In [None]:
# Metrics
def compute_metrics(pred):
    """Return accuracy and weighted precision/recall/F1 for an evaluation pass.

    ``pred`` must expose ``label_ids`` (true class ids) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_hat, average='weighted'
    )
    metrics = {'accuracy': accuracy_score(y_true, y_hat)}
    metrics.update(f1=f_score, precision=prec, recall=rec)
    return metrics

In [None]:
# Train - 5 epochs for better convergence
args = TrainingArguments(
    output_dir='./router_v2',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # NOTE(review): warmup is a fixed 1000 steps; on a small dataset this could
    # cover most or all of training - confirm against the actual step count.
    warmup_steps=1000,
    weight_decay=0.01,
    logging_steps=500,
    # Evaluate and checkpoint once per epoch so the best checkpoint (by the
    # 'f1' value returned from compute_metrics) can be restored at the end.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    # Mixed precision needs a CUDA device. The notebook explicitly allows a
    # CPU runtime, so only enable fp16 when a GPU is actually present.
    fp16=torch.cuda.is_available(),
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Evaluate: headline metrics on the validation set.
results = trainer.evaluate()
print(f'\nRouter v2 Results:')
print(f'  Accuracy: {results["eval_accuracy"]:.4f}')
print(f'  F1: {results["eval_f1"]:.4f}')

# Per-class breakdown from the raw predictions.
preds = trainer.predict(val_dataset)
pred_labels = preds.predictions.argmax(-1)
print('\nClassification Report:')
# Pass the label ids explicitly: id i corresponds to categories[i]. Without
# `labels`, the report infers the classes from the data and raises if any
# category is absent from both val_labels and pred_labels, because
# target_names would then have the wrong length.
print(classification_report(
    val_labels,
    pred_labels,
    labels=list(range(len(categories))),
    target_names=categories,
))

In [None]:
# Save the fine-tuned model and its tokenizer side by side so the directory
# can be loaded as a single checkpoint.
import os
os.makedirs('moe_models_v2', exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained('moe_models_v2/router')
print('Router v2 saved!')

In [None]:
# Test critical cases
# Reload the saved router through a pipeline and spot-check a handful of
# hand-picked (text, expected label) pairs.
from transformers import pipeline
classifier = pipeline('text-classification', model='moe_models_v2/router')

test_cases = [
    ('Hello, how are you?', 'safe'),
    ('I will kill you', 'harmful_content'),
    ('how to make a bomb', 'harmful_content'),
    ('bhenchod', 'hindi_abuse'),
    ('madarchod', 'hindi_abuse'),
    ('you are worthless garbage', 'harassment'),
    ('go back to your country', 'hate_speech'),
    ('I want to kill myself', 'self_harm'),
    ('send nudes', 'sexual'),
]

print('\nCritical Test Cases:')
print('=' * 70)
correct = 0
for text, expected in test_cases:
    top = classifier(text)[0]
    predicted, confidence = top['label'], top['score']
    is_match = predicted == expected
    correct += int(is_match)
    marker = '✅' if is_match else '❌'
    print(f"{marker} {predicted:<18} ({confidence:.2f}) | exp: {expected:<18} | {text[:30]}")

total = len(test_cases)
print(f'\nAccuracy: {correct}/{total} ({100*correct/total:.0f}%)')

In [None]:
# Download router only for now
# Zip the saved router directory (moe_models_v2/router/) and hand the archive
# to the browser. `files` is google.colab.files, imported in the upload cell,
# so that cell must have been run in this session.
!zip -r router_v2.zip moe_models_v2/router/
files.download('router_v2.zip')