# In [None]:
# Jigsaw - Agile Community Rules Classification
# Advanced BERT Fine-tuning Solution
# Target: 0.925+ AUC

import numpy as np
import pandas as pd
import torch
import warnings
import sys
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Silence all Python warnings for the rest of the run, then report the
# PyTorch build and whether a CUDA device is visible.
warnings.filterwarnings('ignore')
print(
    "Starting Advanced BERT Solution...",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
    flush=True,
)

# ============================================================================
# 1. LOAD DATA
# ============================================================================
print("\n" + "="*80, "1. LOADING DATA", "="*80, sep="\n", flush=True)

# Read the train and test CSVs from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/train.csv')
test_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')

# Overview: frame shapes, number of distinct rules, and label counts.
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    f"Unique rules in train: {train_df['rule'].nunique()}",
    f"Target distribution: {train_df['rule_violation'].value_counts().to_dict()}",
    sep="\n",
    flush=True,
)

# ============================================================================
# 2. CREATE FORMATTED TEXT INPUT
# ============================================================================
# Section banner: blank line, rule, title, rule.
print("\n" + "="*80, "2. PREPARING STRUCTURED INPUT", "="*80, sep="\n", flush=True)

def create_input_text(row):
    """Render one row as a multi-line prompt.

    The prompt lists the rule, the two violating examples, the two
    non-violating examples, and finally the comment to evaluate. Sections are
    separated by a blank line and there is no trailing newline.

    Args:
        row: Mapping-like object indexable by 'rule', 'positive_example_1',
            'positive_example_2', 'negative_example_1', 'negative_example_2'
            and 'body'.

    Returns:
        The formatted prompt string.
    """
    sections = [
        f"Rule: {row['rule']}",
        "\n".join([
            "Examples that VIOLATE this rule:",
            f"- {row['positive_example_1']}",
            f"- {row['positive_example_2']}",
        ]),
        "\n".join([
            "Examples that DO NOT violate this rule:",
            f"- {row['negative_example_1']}",
            f"- {row['negative_example_2']}",
        ]),
        f"Comment to evaluate: {row['body']}",
    ]
    return "\n\n".join(sections)

def create_simple_input(row):
    """Render one row as a single-line, pipe-separated prompt.

    Args:
        row: Mapping-like object indexable by 'rule', 'positive_example_1',
            'positive_example_2', 'negative_example_1', 'negative_example_2'
            and 'body'.

    Returns:
        "Rule: ... | Positive: ... | Negative: ... | Comment: ..." with the
        two examples of each kind joined by a single space.
    """
    segments = (
        f"Rule: {row['rule']}",
        f"Positive: {row['positive_example_1']} {row['positive_example_2']}",
        f"Negative: {row['negative_example_1']} {row['negative_example_2']}",
        f"Comment: {row['body']}",
    )
    return " | ".join(segments)

# Add the model input column to both frames. Only the compact single-line
# format (create_simple_input) is used here.
print("Creating structured inputs...", flush=True)
for frame in (train_df, test_df):
    frame['input_text'] = frame.apply(create_simple_input, axis=1)

# Show the first 200 characters of the first training input as a sanity check.
print(f"Sample input:\n{train_df['input_text'].iloc[0][:200]}...", flush=True)

# ============================================================================
# 3. TRANSFORMER SETUP
# ============================================================================
# Section banner: blank line, rule, title, rule.
print("\n" + "="*80, "3. SETTING UP TRANSFORMER MODEL", "="*80, sep="\n", flush=True)

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset

# Choose model - using DeBERTa-v3 for best performance
MODEL_NAME = 'microsoft/deberta-v3-small'  # Fast and effective
# Alternative: 'microsoft/deberta-v3-base' for better performance but slower
# Alternative: 'bert-base-uncased' for baseline

print(f"Loading model: {MODEL_NAME}", flush=True)

# Only the tokenizer load sits inside the try body. The success message lives
# in the else branch so that a failure while printing it (for example, an
# encoding error on the check mark) cannot be mistaken for a load failure and
# silently switch the whole run to the fallback model.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
except Exception as e:
    # Deliberate best-effort boundary: any load failure falls back to BERT.
    # MODEL_NAME is updated too, so the model loaded later matches the
    # tokenizer. A failure of the fallback load itself propagates.
    print(f"Error loading tokenizer: {e}", flush=True)
    print("Falling back to bert-base-uncased", flush=True)
    MODEL_NAME = 'bert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
else:
    print("✓ Tokenizer loaded", flush=True)

# ============================================================================
# 4. CREATE DATASET
# ============================================================================

class RuleViolationDataset(Dataset):
    """Torch dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of input texts (list, NumPy array, pandas Series, ...).
            Each element is converted with ``str()`` before tokenization.
        labels: Optional sequence of integer class labels aligned
            position-by-position with ``texts``. When ``None``, items carry no
            ``'labels'`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=True,
            max_length=..., padding='max_length', truncation=True,
            return_tensors='pt')``; its result must provide ``'input_ids'``
            and ``'attention_mask'`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``labels`` is given and its length differs from
            ``texts``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=512):
        # Materialize as plain lists so __getitem__ always indexes by
        # position. A pandas Series with a non-default index (such as a fold
        # slice) would otherwise be looked up by label, which raises KeyError
        # or returns the wrong row.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            Dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors, plus a ``torch.long`` ``'labels'`` tensor when labels
            were supplied.
        """
        text = str(self.texts[idx])

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' is requested for a single text; flatten() drops
        # the leading batch dimension so items are 1-D.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

print("Creating datasets...", flush=True)

# Measure the untruncated token count of the first 100 training inputs.
sample_lengths = [
    len(tokenizer(sample, truncation=False)['input_ids'])
    for sample in train_df['input_text'].head(100)
]

print(f"Token length stats - Mean: {np.mean(sample_lengths):.0f}, Max: {np.max(sample_lengths):.0f}", flush=True)

# Sequence length: 95th percentile of the sampled lengths plus 50 tokens of
# headroom, capped at 512.
headroom_length = int(np.percentile(sample_lengths, 95)) + 50
MAX_LENGTH = min(512, headroom_length)
print(f"Using max_length: {MAX_LENGTH}", flush=True)

# ============================================================================
# 5. TRAINING WITH CROSS-VALIDATION
# ============================================================================
# Section banner: blank line, rule, title, rule.
print("\n" + "="*80, "4. TRAINING WITH CROSS-VALIDATION", "="*80, sep="\n", flush=True)

def compute_metrics(eval_pred):
    """Compute ROC AUC from evaluation outputs.

    Args:
        eval_pred: Object that unpacks into ``(predictions, labels)``.
            Predictions are softmaxed over the last axis and column 1 is used
            as the score (presumably two-class logits; confirm against the
            model head).

    Returns:
        ``{'auc': <roc_auc_score of labels vs. column-1 probabilities>}``.
    """
    logits, y_true = eval_pred
    class_probs = torch.tensor(logits).softmax(dim=-1)
    positive_scores = class_probs[:, 1].numpy()
    return {'auc': roc_auc_score(y_true, positive_scores)}

# Local import, in line with the file's other mid-script imports; used to
# remove each fold's checkpoint directory once its predictions are taken.
import shutil

# Training configuration
EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
N_FOLDS = 5

# Stratified folds keep the label ratio similar across splits.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
oof_predictions = np.zeros(len(train_df))   # out-of-fold probability per training row
test_predictions = np.zeros(len(test_df))   # running mean of per-fold test probabilities
fold_scores = []

# The test inputs are the same for every fold and the dataset tokenizes on
# item access, so build it once instead of once per fold.
test_dataset = RuleViolationDataset(test_df['input_text'].values, None, tokenizer, MAX_LENGTH)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['rule_violation'])):
    print(f"\n{'='*80}", flush=True)
    print(f"FOLD {fold + 1}/{N_FOLDS}", flush=True)
    print(f"{'='*80}", flush=True)

    # Prepare data (.values gives positionally indexed NumPy arrays)
    train_texts = train_df.iloc[train_idx]['input_text'].values
    train_labels = train_df.iloc[train_idx]['rule_violation'].values
    val_texts = train_df.iloc[val_idx]['input_text'].values
    val_labels = train_df.iloc[val_idx]['rule_violation'].values

    train_dataset = RuleViolationDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
    val_dataset = RuleViolationDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)

    # Initialize a fresh model for every fold
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2,
        problem_type="single_label_classification"
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE * 2,
        learning_rate=LEARNING_RATE,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f'./logs_fold_{fold}',
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        # A checkpoint is written every epoch; cap how many are kept so disk
        # usage does not grow with EPOCHS. The best checkpoint is still
        # retained for load_best_model_at_end.
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="auc",
        greater_is_better=True,
        report_to="none",
        fp16=torch.cuda.is_available(),
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train
    print(f"Training fold {fold + 1}...", flush=True)
    trainer.train()

    # Validation predictions: probability of class 1 for each held-out row
    val_pred = trainer.predict(val_dataset)
    val_probs = torch.softmax(torch.tensor(val_pred.predictions), dim=-1)[:, 1].numpy()
    val_auc = roc_auc_score(val_labels, val_probs)

    print(f"Fold {fold + 1} Validation AUC: {val_auc:.4f}", flush=True)
    fold_scores.append(val_auc)
    oof_predictions[val_idx] = val_probs

    # Test predictions: each fold contributes 1/N_FOLDS of the final average
    test_pred = trainer.predict(test_dataset)
    test_probs = torch.softmax(torch.tensor(test_pred.predictions), dim=-1)[:, 1].numpy()
    test_predictions += test_probs / N_FOLDS

    # Clean up: the fold's checkpoints are no longer needed once predictions
    # are taken, so free the disk space along with host and GPU memory.
    shutil.rmtree(training_args.output_dir, ignore_errors=True)
    del model, trainer, train_dataset, val_dataset
    gc.collect()
    torch.cuda.empty_cache()

# ============================================================================
# 6. RESULTS AND SUBMISSION
# ============================================================================
print("\n" + "="*80, "5. CROSS-VALIDATION RESULTS", "="*80, sep="\n", flush=True)

# AUC over all out-of-fold predictions at once, reported next to the per-fold
# scores and their mean / standard deviation.
overall_auc = roc_auc_score(train_df['rule_violation'], oof_predictions)
print(
    f"\nFold AUC scores: {[f'{s:.4f}' for s in fold_scores]}",
    f"Mean Fold AUC: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})",
    f"Overall OOF AUC: {overall_auc:.4f}",
    sep="\n",
    flush=True,
)

print("\n" + "="*80, "6. CREATING SUBMISSION", "="*80, sep="\n", flush=True)

# Clip the averaged test probabilities into [0.001, 0.999] and write them out
# next to their row ids.
clipped_scores = np.clip(test_predictions, 0.001, 0.999)
submission = pd.DataFrame({
    'row_id': test_df['row_id'].values,
    'rule_violation': clipped_scores,
})
submission.to_csv('submission.csv', index=False)

print(f"\n✓ Submission created successfully!", flush=True)

# Summary statistics of the submitted scores.
print(
    f"\nPrediction statistics:",
    f"  Min: {submission['rule_violation'].min():.4f}",
    f"  Max: {submission['rule_violation'].max():.4f}",
    f"  Mean: {submission['rule_violation'].mean():.4f}",
    f"  Median: {submission['rule_violation'].median():.4f}",
    sep="\n",
    flush=True,
)

print("\nFirst 10 predictions:", submission.head(10), sep="\n", flush=True)

# Closing banner; the overall out-of-fold AUC is reported as the expected
# leaderboard score.
print(
    "\n" + "="*80,
    "TRAINING COMPLETE!",
    f"Expected Public LB Score: ~{overall_auc:.4f}",
    "="*80,
    sep="\n",
    flush=True,
)