# Imports and setup

In [None]:
import os
import json
import torch
import numpy as np
import pandas as pd

# Disable the Weights & Biases integration through its environment switch.
# This runs before `transformers` is imported further down in this cell.
os.environ["WANDB_DISABLED"] = "true"

import warnings
# Hide the tokenizer deprecation notice. `message` is a regular expression
# matched against the text of the warning.
warnings.filterwarnings("ignore", message=".*tokenizer.*deprecated.*")
# Suppress FutureWarning by category. The previous filter used
# message=".*FutureWarning.*", which only matches warnings whose *text*
# contains the word "FutureWarning" and therefore let almost all of them through.
warnings.filterwarnings("ignore", category=FutureWarning)
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
import logging

from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorForTokenClassification,
    TrainerCallback
)
from datasets import Dataset
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

import sys
# Route all log records to stdout with a timestamped format.
# force=True removes any handlers already attached to the root logger, so
# re-running this cell does not stack duplicate handlers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
    ],
    force=True
)
# Module-level logger used by the helper functions below.
logger = logging.getLogger(__name__)

import transformers
# Make the transformers library log at INFO level, through its own default
# handler, using its explicit (prefixed) record format.
transformers.logging.set_verbosity_info()
transformers.logging.enable_default_handler()
transformers.logging.enable_explicit_format()


from tqdm.auto import tqdm
# Register tqdm's pandas integration (adds the progress_* methods to pandas objects).
tqdm.pandas()


# Push out anything buffered so far so later output appears in order.
sys.stdout.flush()
sys.stderr.flush()

# Directory holding processed_examples.json and label_mappings.json.
DATA_PATH = "/kaggle/input/bert-token-classif"
# Root directory for checkpoints, logs and the final saved model.
OUTPUT_PATH = "/kaggle/working"


# Echo both locations so the run log records where data came from and went.
print(f"Data path: {DATA_PATH}\nOutput path: {OUTPUT_PATH}")

# Config

In [None]:
@dataclass
class TrainingConfig:
    """Hyperparameters and run options for the token-classification fine-tuning run.

    Field order is part of the interface: the generated ``__init__`` accepts
    these values positionally in declaration order.
    """
    
    # Hugging Face checkpoint name, used to load both the tokenizer and the model.
    model_name: str = "distilbert-base-uncased"
    # Upper bound on tokens per example; longer inputs are truncated at tokenization.
    max_length: int = 512
    
    # Optimisation settings, forwarded to TrainingArguments.
    num_epochs: int = 3
    # Used for both the per-device train and per-device eval batch size.
    batch_size: int = 16
    learning_rate: float = 2e-5
    weight_decay: float = 0.01
    warmup_steps: int = 500
    
    # Fractions for the train/validation/test partition; split_data requires
    # them to sum to 1.0.
    train_split: float = 0.8
    val_split: float = 0.1
    test_split: float = 0.1
    # When set (and non-zero), only the first max_samples examples are used.
    max_samples: Optional[int] = None
    
    # Save the final model and tokenizer after training.
    save_model: bool = True
    # Run a final evaluation on the held-out test split.
    evaluate_on_test: bool = True

# Single shared configuration instance, built from the dataclass defaults.
config = TrainingConfig()

# Dump every field so the run log records the exact settings used.
print("Training Configuration:")
for field_name, field_value in vars(config).items():
    print(f"  {field_name}: {field_value}")

# Data loading

In [None]:
def load_processed_data(data_path: str) -> Tuple[List[Dict], Dict[str, int], Dict[int, str]]:
    """
    Read the preprocessed examples and the label vocabulary from disk.

    Args:
        data_path: Directory containing ``processed_examples.json`` and
            ``label_mappings.json``.

    Returns:
        Tuple of (examples, label2id, id2label). The keys of ``id2label`` are
        converted to ``int`` because JSON object keys are always strings.
    """
    def _read_json(file_name: str):
        # Both files are read as UTF-8 JSON from the same directory.
        with open(f"{data_path}/{file_name}", 'r', encoding='utf-8') as handle:
            return json.load(handle)

    examples = _read_json("processed_examples.json")
    mappings = _read_json("label_mappings.json")

    label2id = mappings['label2id']
    id2label = {int(raw_id): name for raw_id, name in mappings['id2label'].items()}

    logger.info(f"Loaded {len(examples)} examples")
    logger.info(f"Number of labels: {len(label2id)}")
    logger.info(f"Labels: {sorted(label2id.keys())}")

    return examples, label2id, id2label

examples, label2id, id2label = load_processed_data(DATA_PATH)

# Optional cap for quick experiments: keep only the first max_samples examples.
if config.max_samples:
    examples = examples[:config.max_samples]
    logger.info("Limited to %d examples for testing", len(examples))

print(
    "\nDataset Statistics:\n"
    f"Total examples: {len(examples)}\n"
    f"Number of unique labels: {len(label2id)}"
)


# Create dataset

In [None]:
def split_data(examples: List[Dict], train_split: float, val_split: float, test_split: float):
    """Split examples sequentially into train/validation/test lists.

    The examples are partitioned in their existing order (no shuffling). The
    first ``train_split`` fraction becomes the training set, the next
    ``val_split`` fraction the validation set, and everything that remains
    becomes the test set, so rounding remainders land in the test split.

    Args:
        examples: Examples to partition.
        train_split: Fraction of examples for training.
        val_split: Fraction of examples for validation.
        test_split: Fraction of examples for testing. It is only used for
            validation; the test set is whatever is left after the other two.

    Returns:
        Tuple of (train_examples, val_examples, test_examples).

    Raises:
        ValueError: If any fraction lies outside [0, 1] or the three fractions
            do not sum to 1.0 (within 1e-6).
    """
    # Explicit exceptions instead of `assert`, which is removed under `python -O`.
    fractions = {
        "train_split": train_split,
        "val_split": val_split,
        "test_split": test_split,
    }
    for name, fraction in fractions.items():
        # Negative or >1 fractions can still sum to 1.0 and would silently
        # produce nonsensical slices below.
        if not 0.0 <= fraction <= 1.0:
            raise ValueError(f"{name} must be between 0 and 1, got {fraction}")
    if not abs(train_split + val_split + test_split - 1.0) < 1e-6:
        raise ValueError("Splits must sum to 1.0")

    n_total = len(examples)
    n_train = int(n_total * train_split)
    n_val = int(n_total * val_split)

    train_examples = examples[:n_train]
    val_examples = examples[n_train:n_train + n_val]
    test_examples = examples[n_train + n_val:]

    logger.info(f"Data split: Train={len(train_examples)}, Val={len(val_examples)}, Test={len(test_examples)}")

    return train_examples, val_examples, test_examples

def create_dataset(examples: List[Dict], tokenizer, label2id: Dict[str, int], max_length: int):
    """
    Create a HuggingFace Dataset from processed examples.

    Each example is read through its ``'tokens'`` (list of words) and
    ``'labels'`` (one label string per word) entries. Words are tokenized into
    sub-word pieces; only the first piece of every word receives the word's
    label id, while special tokens and continuation pieces get ``-100``.

    Sequences are truncated to ``max_length`` but deliberately NOT padded
    here. Padding is left to the data collator used at training time, so each
    training batch is padded only to its own longest sequence instead of to
    the longest sequence of an arbitrary 100-example preprocessing chunk.

    Args:
        examples: Processed examples with ``'tokens'`` and ``'labels'`` keys.
        tokenizer: Tokenizer that accepts pre-split words and exposes
            ``word_ids(batch_index=...)`` on its output.
        label2id: Mapping from label string to integer id. It must contain
            ``'O'``, which is the fallback for unknown or missing labels.
        max_length: Maximum number of tokens per example after truncation.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``
        columns, one row per example.
    """

    def label_for_word(word_idx, word_labels):
        """Return the label id for a word, falling back to 'O'."""
        if word_idx < len(word_labels):
            return label2id.get(word_labels[word_idx], label2id['O'])
        # More words than labels in this example: treat the extras as 'O'.
        return label2id['O']

    def tokenize_and_align_labels(examples_batch):
        """Tokenize a chunk of examples and build the aligned label rows."""
        tokenized_inputs = tokenizer(
            [ex['tokens'] for ex in examples_batch],
            truncation=True,
            max_length=max_length,
            is_split_into_words=True,
        )

        labels = []
        for i, example in enumerate(examples_batch):
            label_ids = []
            previous_word_idx = None
            for word_idx in tokenized_inputs.word_ids(batch_index=i):
                if word_idx is None or word_idx == previous_word_idx:
                    # Special token, or a continuation piece of the same
                    # word: excluded from the loss and from the metrics.
                    label_ids.append(-100)
                else:
                    label_ids.append(label_for_word(word_idx, example['labels']))
                previous_word_idx = word_idx
            labels.append(label_ids)

        return tokenized_inputs, labels

    dataset_dict = {
        'input_ids': [],
        'attention_mask': [],
        'labels': []
    }

    # Tokenize in chunks so a single tokenizer call stays a manageable size.
    batch_size = 100
    for start in range(0, len(examples), batch_size):
        tokenized, labels = tokenize_and_align_labels(examples[start:start + batch_size])

        dataset_dict['input_ids'].extend(tokenized['input_ids'])
        dataset_dict['attention_mask'].extend(tokenized['attention_mask'])
        dataset_dict['labels'].extend(labels)

    return Dataset.from_dict(dataset_dict)

train_examples, val_examples, test_examples = split_data(
    examples,
    train_split=config.train_split,
    val_split=config.val_split,
    test_split=config.test_split,
)

# The tokenizer must come from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

print("Creating datasets...")
# Tokenize the three partitions in train, validation, test order.
train_dataset, val_dataset, test_dataset = (
    create_dataset(partition, tokenizer, label2id, config.max_length)
    for partition in (train_examples, val_examples, test_examples)
)

print(
    f"Train dataset size: {len(train_dataset)}\n"
    f"Validation dataset size: {len(val_dataset)}\n"
    f"Test dataset size: {len(test_dataset)}"
)

# Initialize model

In [None]:
def initialize_model(model_name: str, num_labels: int, label2id: Dict[str, int], id2label: Dict[int, str]):
    """Load a pretrained checkpoint with a token-classification head.

    Args:
        model_name: Hugging Face checkpoint name or path.
        num_labels: Number of output classes for the classification head.
        label2id: Mapping from label string to class id, stored in the model config.
        id2label: Mapping from class id to label string, stored in the model config.

    Returns:
        The loaded ``AutoModelForTokenClassification`` instance.
    """
    # ignore_mismatched_sizes is passed so that a checkpoint whose head has a
    # different shape than num_labels can still be loaded.
    classifier = AutoModelForTokenClassification.from_pretrained(
        model_name,
        ignore_mismatched_sizes=True,
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
    )

    logger.info(f"Initialized model: {model_name}")
    logger.info(f"Number of parameters: {classifier.num_parameters():,}")

    return classifier

# One output class per entry in the label vocabulary.
model = initialize_model(
    model_name=config.model_name,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

print(
    f"Model initialized with {len(label2id)} labels\n"
    f"Model parameters: {model.num_parameters():,}"
)


# Training setup

In [None]:
def compute_metrics(eval_pred):
    """
    Turn raw evaluation output into macro-averaged token-level metrics.

    Args:
        eval_pred: Pair of (predictions, labels). Predictions are per-token
            class scores (argmax is taken over axis 2); labels are class ids
            with -100 marking positions to ignore.

    Returns:
        Dict with macro-averaged 'precision', 'recall' and 'f1', plus overall
        'accuracy', computed only over positions whose label is not -100.
        Class ids are mapped to names through the module-level ``id2label``.
    """
    scores, gold_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=2)

    flat_true_labels = []
    flat_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                # Special tokens, sub-word continuations and padding.
                continue
            flat_true_labels.append(id2label[gold_id])
            flat_predictions.append(id2label[predicted_id])

    report = classification_report(
        flat_true_labels,
        flat_predictions,
        zero_division=0,
        output_dict=True,
    )
    macro = report['macro avg']

    return {
        'precision': macro['precision'],
        'recall': macro['recall'],
        'f1': macro['f1-score'],
        'accuracy': report['accuracy'],
    }

# Collator that assembles token-classification batches for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=f"{OUTPUT_PATH}/bert_pii_model",
    logging_dir=f"{OUTPUT_PATH}/logs",
    # Optimisation settings taken from the shared config.
    num_train_epochs=config.num_epochs,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    learning_rate=config.learning_rate,
    weight_decay=config.weight_decay,
    warmup_steps=config.warmup_steps,
    # Evaluate and checkpoint on the same step schedule so the best
    # checkpoint (by eval F1) can be restored when training ends.
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Logging and progress reporting.
    logging_steps=50,
    logging_first_step=True,
    logging_nan_inf_filter=False,
    log_level="info",
    log_on_each_node=False,
    disable_tqdm=False,
    report_to=[],
    # Data loading.
    dataloader_pin_memory=False,
)

print(
    "Training arguments configured:\n"
    f"  Epochs: {config.num_epochs}\n"
    f"  Batch size: {config.batch_size}\n"
    f"  Learning rate: {config.learning_rate}\n"
    f"  Output directory: {training_args.output_dir}"
)


In [10]:
class ProgressCallback(TrainerCallback):
    """Trainer callback that prints human-readable training progress.

    It prints a banner when training starts and ends, a header per epoch, and
    on every log event a text progress bar followed by whichever of the known
    metrics are present in the log record.

    Attributes:
        training_start_time: ``time.time()`` at training start, or ``None``
            before training has begun.
        last_log_time: Reserved; never updated by this class.
        in_kaggle: ``True`` when the ``KAGGLE_WORKING_DIR`` environment
            variable is set. Enables periodic output clearing and a wall-clock
            line on each log. NOTE(review): confirm this variable is actually
            defined in the target Kaggle environment.
    """

    # (log key, printed label, format spec) in display order.
    _METRIC_LINES = (
        ("loss", "Loss", ".4f"),
        ("learning_rate", "LR", ".2e"),
        ("eval_loss", "Eval Loss", ".4f"),
        ("eval_f1", "F1 Score", ".4f"),
        ("eval_precision", "Precision", ".4f"),
        ("eval_recall", "Recall", ".4f"),
    )

    def __init__(self):
        self.training_start_time = None
        self.last_log_time = None
        self.in_kaggle = 'KAGGLE_WORKING_DIR' in os.environ

    def on_train_begin(self, args, state, control, **kwargs):
        """Record the start time and print the run summary banner."""
        import time
        self.training_start_time = time.time()
        print("=" * 80)
        print("TRAINING STARTED!")
        print("=" * 80)
        logger.info(f"Total steps: {state.max_steps}")
        # The message previously read "📈otal epochs" (a garbled "Total").
        logger.info(f"Total epochs: {args.num_train_epochs}")
        logger.info(f"Batch size: {args.per_device_train_batch_size}")
        print("=" * 80)

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Print a header with the 1-based number of the epoch that is starting."""
        # state.epoch may be unset; treat that as the first epoch rather than
        # failing inside a purely cosmetic callback.
        epoch_num = int(state.epoch or 0) + 1
        print(f"\nEPOCH {epoch_num}/{args.num_train_epochs} STARTED")
        print("-" * 60)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print a progress bar and the metrics contained in ``logs``."""
        import sys
        import time

        if not logs:
            return

        step = state.global_step
        max_steps = state.max_steps
        progress = (step / max_steps) * 100 if max_steps > 0 else 0

        bar_length = 40
        filled_length = int(bar_length * step // max_steps) if max_steps > 0 else 0
        bar = '█' * filled_length + '░' * (bar_length - filled_length)

        # In Kaggle, clear the cell output on every second logging interval
        # so the notebook does not accumulate unbounded output.
        if self.in_kaggle and step % (args.logging_steps * 2) == 0:
            try:
                from IPython.display import clear_output
            except ImportError:
                # Not running under IPython: keep appending to the output.
                pass
            else:
                clear_output(wait=True)
                print("BERT PII Training Progress")
                print("=" * 50)

        print(f"\nStep {step:4d}/{max_steps} [{bar}] {progress:.1f}%")

        for key, label, spec in self._METRIC_LINES:
            if key in logs:
                print(f"   {label}: {logs[key]:{spec}}")

        if self.in_kaggle:
            current_time = time.strftime("%H:%M:%S")
            print(f"   ⏰ Time: {current_time}")

        # Flush so the progress shows up immediately in notebook output.
        sys.stdout.flush()

    def on_evaluate(self, args, state, control, **kwargs):
        """Announce that an evaluation pass has finished."""
        print(f"\n📊 Evaluation completed at step {state.global_step}")

    def on_save(self, args, state, control, **kwargs):
        """Announce that a checkpoint was written."""
        print(f"💾 Model checkpoint saved at step {state.global_step}")

    def on_train_end(self, args, state, control, **kwargs):
        """Print the total wall-clock training time as HH:MM:SS."""
        import time
        if self.training_start_time is None:
            # on_train_begin never ran, so there is nothing to report.
            return
        total_time = time.time() - self.training_start_time
        hours = int(total_time // 3600)
        minutes = int((total_time % 3600) // 60)
        seconds = int(total_time % 60)
        print("\n" + "=" * 80)
        print("TRAINING COMPLETED!")
        print(f"Total training time: {hours:02d}:{minutes:02d}:{seconds:02d}")
        print("=" * 80)


In [None]:

# Wire the model, data, metrics and the progress printer into one Trainer.
progress_callback = ProgressCallback()
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[progress_callback],
)

print("Trainer initialized. Starting training...")

# Smoke-test each output channel before the long-running training call.
print("\nTesting display capabilities...")
print("  Basic print works")
logger.info("   Logger works")

# Probe for IPython's display helpers; the check runs unconditionally.
try:
    from IPython.display import clear_output, display
except ImportError:
    print("   IPython display not available")
else:
    print("   IPython display available")

print("   Progress bar test:")
test_bar = '█' * 10 + '░' * 30
print(f"      [{test_bar}] 25.0%")

# Recap of the run about to start. The environment line is a fixed label;
# it is not detected at runtime.
print("\n".join([
    "\nTraining configuration:",
    f"   Dataset size: {len(train_dataset)} train, {len(val_dataset)} validation",
    f"   Batch size: {config.batch_size}",
    f"   Learning rate: {config.learning_rate}",
    f"   Epochs: {config.num_epochs}",
    f"   Logging every {training_args.logging_steps} steps",
    f"   Evaluation every {training_args.eval_steps} steps",
    "   Environment: Kaggle Notebook",
    f"   Native tqdm: {'Disabled' if training_args.disable_tqdm else 'Enabled'}",
]))

# Run training. Failures are logged for the notebook output and then
# re-raised so the cell still stops with the original traceback.
try:
    training_result = trainer.train()
except Exception as exc:
    logger.error("Training failed: %s", exc)
    raise
else:
    logger.info("Training completed successfully!")

print("Training completed!")
print(f"Training loss: {training_result.training_loss:.4f}")

# Save the model and its tokenizer side by side in one directory.
if config.save_model:
    final_model_dir = f"{OUTPUT_PATH}/final_model"
    trainer.save_model(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)
    print(f"Model saved to {final_model_dir}")

In [None]:
# Final evaluation on the held-out test split.
if config.evaluate_on_test:
    print("Evaluating on test set...")

    test_results = trainer.evaluate(test_dataset)

    print("Test Results:")
    metric_prefix = 'eval_'
    for metric_name, metric_value in test_results.items():
        # Only report the evaluation metrics, with the prefix stripped.
        if not metric_name.startswith(metric_prefix):
            continue
        print(f"  {metric_name[len(metric_prefix):]}: {metric_value:.4f}")