# Universal Restrictor - Model Training v2
Fine-tune DistilBERT on 100K+ examples

In [None]:
!pip install transformers datasets accelerate scikit-learn -q

In [None]:
# Colab-only helper: prompts the user to upload the training set through the
# browser. The data-loading cell later opens 'train_comprehensive.jsonl' by
# relative path, so the uploaded file is expected to keep exactly that name.
from google.colab import files
print('Upload train_comprehensive.jsonl')
# The upload result is kept in `uploaded`; no visible cell reads it again.
uploaded = files.upload()

In [None]:
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# Report which accelerator this runtime exposes. CUDA availability is queried
# once and reused for both lines; on a machine without CUDA the device is
# reported as "CPU".
cuda_ready = torch.cuda.is_available()
device_label = torch.cuda.get_device_name(0) if cuda_ready else "CPU"
print(f'GPU: {device_label}')
print(f'CUDA available: {cuda_ready}')

In [None]:
# Load data
# Reads the JSONL training set: one JSON object per line. Later cells read the
# 'text' and 'label' keys of every record.
from collections import Counter

data = []
# JSON is UTF-8 by specification; naming the encoding explicitly keeps decoding
# independent of the platform's locale default.
with open('train_comprehensive.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline at end of file) rather than
        # letting json.loads fail on an empty string.
        if line:
            data.append(json.loads(line))

print(f'Loaded {len(data)} examples')

# Fail with a clear message instead of a ZeroDivisionError in the stats below.
if not data:
    raise ValueError('train_comprehensive.jsonl contains no examples')

# Stats
# One pass over the records; a label that never occurs counts as 0.
label_counts = Counter(d['label'] for d in data)
toxic = label_counts['toxic']
safe = label_counts['safe']
print(f'Toxic: {toxic} ({100*toxic/len(data):.1f}%)')
print(f'Safe: {safe} ({100*safe/len(data):.1f}%)')

In [None]:
# Prepare data
# Translate the string labels into integer class ids while collecting the raw
# texts; an unknown label raises KeyError.
label_map = {'safe': 0, 'toxic': 1}
texts = []
labels = []
for example in data:
    texts.append(example['text'])
    labels.append(label_map[example['label']])

# Train/val split (90/10)
# Stratified on the labels, with a fixed seed so the split is reproducible.
split = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=42,
    stratify=labels,
)
train_texts, val_texts, train_labels, val_labels = split
print(f'Train: {len(train_texts)}, Val: {len(val_texts)}')

In [None]:
# Load model
MODEL_NAME = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Define the id -> name mapping once and derive its inverse, so both
# directions stored in the model config always agree.
id2label = {0: 'safe', 1: 'toxic'}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
print(f'Model: {MODEL_NAME}')

In [None]:
# Tokenize
def tokenize(texts, labels):
    """Encode *texts* with the notebook's tokenizer and attach *labels*.

    Inputs are truncated to ``max_length=256`` and padded (``padding=True``)
    in a single tokenizer call covering every text passed in.

    Returns a ``Dataset`` with ``input_ids``, ``attention_mask`` and
    ``labels`` columns, in that order.
    """
    encoded = tokenizer(texts, truncation=True, padding=True, max_length=256)
    columns = {key: encoded[key] for key in ('input_ids', 'attention_mask')}
    columns['labels'] = labels
    return Dataset.from_dict(columns)

# Encode both splits up front; the resulting datasets are handed to the
# Trainer, and the validation set is reused for the classification report.
train_dataset = tokenize(train_texts, train_labels)
val_dataset = tokenize(val_texts, val_labels)
print(f'Tokenized: Train={len(train_dataset)}, Val={len(val_dataset)}')

In [None]:
# Metrics
def compute_metrics(pred):
    """Turn a Trainer prediction object into a dict of evaluation metrics.

    *pred* must expose ``label_ids`` (reference labels) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.

    Returns ``accuracy`` plus binary-averaged ``f1``, ``precision`` and
    ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    # The fourth element (support) is not reported.
    precision, recall, f1 = scores[:3]
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics.update(f1=f1, precision=precision, recall=recall)
    return metrics

In [None]:
# Training config - 5 epochs for better convergence
training_args = TrainingArguments(
    output_dir='./restrictor-model-v2',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_steps=200,
    # Evaluate and checkpoint on the same schedule so that the best checkpoint
    # can be restored when training finishes.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # 'f1' is one of the keys returned by compute_metrics.
    metric_for_best_model='f1',
    # fp16 mixed precision needs a CUDA device and TrainingArguments rejects it
    # otherwise. Enable it only when a GPU is present, so the notebook still
    # runs on the CPU fallback that the setup cell reports.
    fp16=torch.cuda.is_available(),
    report_to='none',
)

# Wire the model, the training configuration, both tokenized splits and the
# metric function into a single Trainer. The validation split is what gets
# scored at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
print('Ready to train!')

In [None]:
# Train!
# Runs fine-tuning as configured by `training_args`; the return value is not
# captured here.
trainer.train()

In [None]:
# Evaluate
# Score the model on the validation split and print the headline metrics,
# which the Trainer reports under an 'eval_' prefix.
results = trainer.evaluate()
banner = "=" * 50
print(f'\n{banner}')
print('FINAL RESULTS')
print(banner)
for caption, key in (
    ('Accuracy:  ', 'eval_accuracy'),
    ('F1 Score:  ', 'eval_f1'),
    ('Precision: ', 'eval_precision'),
    ('Recall:    ', 'eval_recall'),
):
    print(f'{caption}{results[key]:.4f}')

In [None]:
# Detailed classification report
# Repeats the import from the setup cell so this cell can be run on its own
# after a kernel state where only the trainer objects exist.
from sklearn.metrics import classification_report

# Predict on the validation split and take the highest-scoring class per row.
preds = trainer.predict(val_dataset)
pred_labels = preds.predictions.argmax(-1)

print('\nClassification Report:')
# `val_labels` are the reference ids from the train/val split; target_names is
# listed in label-id order (0 = safe, 1 = toxic), matching `label_map`.
print(classification_report(val_labels, pred_labels, target_names=['safe', 'toxic']))

In [None]:
# Save model
# Write the model and its tokenizer into the same directory, so the directory
# can later be loaded by path on its own.
final_dir = './restrictor-model-v2-final'
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_dir)
print('Model saved!')

In [None]:
# Test on critical cases
# Smoke-test the saved model on a small hand-picked set of inputs, and report
# per-case predictions plus an overall hit rate.
from transformers import pipeline

classifier = pipeline('text-classification', model='./restrictor-model-v2-final')

safe_examples = [
    'Hello, how are you?',
    'The weather is nice today',
    'namaste kaise ho',
]
toxic_examples = [
    # Toxic - English
    'I will kill you',
    'You are worthless garbage',
    # Toxic - Hindi
    'bhenchod',
    'madarchod',
    'chutiya',
    'teri maa ki',
    # Dangerous
    'how to make a bomb',
    'how to make an explosive device',
    # Implicit/Subtle
    'people like you should not exist',
    'go back to your country',
]
# (text, expected label) pairs: every safe example first, then every toxic one.
test_cases = (
    [(sample, 'safe') for sample in safe_examples]
    + [(sample, 'toxic') for sample in toxic_examples]
)

divider = '=' * 60
print('\n' + divider)
print('CRITICAL TEST CASES')
print(divider)
correct = 0
for text, expected in test_cases:
    # The pipeline returns a list with one result dict for a single string.
    result = classifier(text)[0]
    pred, conf = result['label'], result['score']
    hit = pred == expected
    correct += hit
    status = '✅' if hit else '❌'
    print(f'{status} {pred:5} ({conf:.2f}) | Expected: {expected:5} | {text[:40]}')

print(f'\nAccuracy on critical cases: {correct}/{len(test_cases)} ({100*correct/len(test_cases):.0f}%)')

In [None]:
# Download model
# Shell command: bundle the saved model directory into a single zip archive.
!zip -r restrictor-model-v2.zip restrictor-model-v2-final/
# `files` is the google.colab helper imported in the upload cell, so that cell
# must have been run in this session.
files.download('restrictor-model-v2.zip')