In [None]:
import gc
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import torch
from transformers import Trainer

# Evict previously imported copies of the project modules so that the
# imports that follow load them again instead of reusing the cached objects.
for _stale_module in ('models', 'data_loader'):
    sys.modules.pop(_stale_module, None)

# Fresh imports
from data_loader import ANLIDataLoader
from models import TransformerNLI
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

# Report the PyTorch build and which accelerators it can see.
# `torch.backends.mps` is missing on older PyTorch builds, so look it up
# defensively instead of letting the attribute access raise AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)
mps_available = bool(_mps_backend is not None and _mps_backend.is_available())

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"MPS available: {mps_available}")

In [None]:
# Read the train / dev / test splits through the project data loader
loader = ANLIDataLoader()
train_df, dev_df, test_df = loader.load_data()

# Build (premises, hypotheses) for the train and dev splits, in that order
(train_premises, train_hypotheses), (dev_premises, dev_hypotheses) = (
    loader.prepare_text_pairs(split_df) for split_df in (train_df, dev_df)
)

# Gold labels for the same two splits, taken from the 'label' column
train_labels, dev_labels = (
    split_df['label'].values for split_df in (train_df, dev_df)
)

print(f"\nTrain samples: {len(train_labels)}")
print(f"Dev samples: {len(dev_labels)}")

## Transformer Models (BERT, RoBERTa, DeBERTa-v3)

In [None]:
import os
# Set WANDB_DISABLED before any Trainer is created; intended to keep
# Weights & Biases logging off (the training cell also clears `report_to`).
os.environ['WANDB_DISABLED'] = 'true'
# Trainer drives fine-tuning in the training cell; EarlyStoppingCallback is
# passed to it as a callback there.
from transformers import Trainer, EarlyStoppingCallback

# Accumulates one metrics dict per trained model (filled by the training cell)
all_results = list()

# One (model identifier, display name, output directory) tuple per model to
# fine-tune, assembled column-wise from three parallel tuples.
models_config = list(zip(
    ('bert-base-uncased', 'roberta-base', 'microsoft/deberta-v3-base'),
    ('BERT', 'RoBERTa', 'DeBERTa-v3'),
    ('../models/bert_anli', '../models/roberta_anli', '../models/deberta_anli'),
))

In [None]:
# Fine-tune every model listed in `models_config` and record its dev metrics.
#
# For each (model_name, display_name, output_dir) entry this cell:
#   1. builds a TransformerNLI wrapper and tokenizes the train/dev text pairs,
#   2. trains with a Hugging Face Trainer plus an early-stopping callback,
#   3. evaluates on the dev set and saves model + tokenizer to `<output_dir>/final`,
#   4. stores the dev accuracy / F1 / loss in `all_results`.
#
# The cell is safe to re-run: a model's previous entry in `all_results` is
# replaced instead of duplicated.

# Hyperparameters shared by all models in the comparison
MAX_SEQ_LENGTH = 128
NUM_EPOCHS = 5  # upper bound; early stopping may end training sooner
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EARLY_STOPPING_PATIENCE = 2

for model_name, display_name, output_dir in models_config:
    # Drop the references to the previous iteration's model and trainer BEFORE
    # loading the next checkpoint, so two models are never held in memory at
    # once. Doing this at the top of the loop (rather than `del` at the bottom)
    # leaves `model` / `trainer` bound to the last model after the loop ends.
    model = trainer = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    _mps_backend = getattr(torch.backends, "mps", None)
    if _mps_backend is not None and _mps_backend.is_available() and hasattr(torch, "mps"):
        torch.mps.empty_cache()

    print("\n" + "="*60)
    print(f"TRAINING {display_name.upper()} MODEL")
    print("="*60)

    # Load model
    model = TransformerNLI(model_name=model_name, num_labels=3)
    print(f"\nModel loaded on: {model.device}")

    # Tokenize data
    print("\nTokenizing data...")
    train_encodings = model.tokenize_data(
        train_premises,
        train_hypotheses,
        train_labels,
        max_length=MAX_SEQ_LENGTH
    )
    dev_encodings = model.tokenize_data(
        dev_premises,
        dev_hypotheses,
        dev_labels,
        max_length=MAX_SEQ_LENGTH
    )
    print(" Tokenization complete")

    # Create datasets
    train_dataset = model.create_dataset(train_encodings)
    dev_dataset = model.create_dataset(dev_encodings)
    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Dev dataset size: {len(dev_dataset)}")

    # Training arguments
    training_args = model.get_training_args(
        output_dir=output_dir,
        num_epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE
    )
    training_args.report_to = []  # Disable wandb

    print("Training configuration:")
    print(f"  Epochs: {training_args.num_train_epochs}")
    print(f"  Batch size: {training_args.per_device_train_batch_size}")
    print(f"  Learning rate: {training_args.learning_rate}")

    # Initialize trainer with early stopping.
    # NOTE(review): EarlyStoppingCallback relies on the training arguments
    # enabling periodic evaluation, `load_best_model_at_end=True` and a
    # `metric_for_best_model` -- confirm that `get_training_args` sets these.
    trainer = Trainer(
        model=model.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=model.compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)]
    )

    print("✓ Trainer initialized")
    print(f"Model device: {model.device}")
    print(f"Trainer device: {trainer.model.device}")

    # Train model
    print(f"\nStarting {display_name} training...")
    print(f"Training with early stopping (patience={EARLY_STOPPING_PATIENCE})...\n")

    trainer.train()

    print(f"\n✓ {display_name} training complete!")

    # Evaluate on dev set
    print(f"\nEvaluating {display_name} on dev set...")
    eval_results = trainer.evaluate()

    print(f"\n{display_name} Dev Results:")
    for key, value in eval_results.items():
        print(f"  {key}: {value:.4f}")

    # Save model and tokenizer side by side so the directory can be reloaded as a unit
    model.model.save_pretrained(f'{output_dir}/final')
    model.tokenizer.save_pretrained(f'{output_dir}/final')
    print(f"\n✓ Model saved to {output_dir}/final")

    # Store results. Remove any entry left for this model by an earlier run of
    # the cell first (in place, so other references to the list stay valid).
    all_results[:] = [row for row in all_results if row['Model Name'] != model_name]
    all_results.append({
        'Model': display_name,
        'Model Name': model_name,
        'Dev Accuracy': eval_results['eval_accuracy'],
        'Dev F1': eval_results['eval_f1'],
        'Dev Loss': eval_results['eval_loss']
    })

    print(f"\n{'='*60}")
    print(f"✅ {display_name} COMPLETED!")
    print(f"{'='*60}\n")

In [None]:
# Compare the dev-set metrics of all trained models: print and save a table
# ranked by accuracy, draw side-by-side accuracy / F1 bar charts, and report
# the best-performing model.
print("\n" + "="*60)
print(" MODEL COMPARISON")
print("="*60)

# Fail with a clear message instead of a KeyError from sort_values below
if not all_results:
    raise RuntimeError("all_results is empty - run the training cell before comparing models")

comparison_csv_path = '../results/all_models_comparison.csv'
comparison_plot_path = '../results/all_models_comparison.png'
# to_csv / savefig do not create missing directories
os.makedirs(os.path.dirname(comparison_csv_path), exist_ok=True)

# Best model first
results_df = pd.DataFrame(all_results).sort_values('Dev Accuracy', ascending=False)

# Concatenate rather than pass two print arguments: the separator space
# would otherwise shift the header row out of alignment with the data rows.
print("\n" + results_df.to_string(index=False))

# Save comparison
results_df.to_csv(comparison_csv_path, index=False)
print("\n Results saved to ../results/all_models_comparison.csv")

# One colour per bar, cycling through the palette if there are more models than colours
bar_palette = ['#3498db', '#2ecc71', '#e74c3c', '#f39c12']
bar_colors = [bar_palette[rank % len(bar_palette)] for rank in range(len(results_df))]

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (metric column, panel title, y-axis label) for each subplot
panels = [
    ('Dev Accuracy', 'Model Accuracy Comparison', 'Accuracy'),
    ('Dev F1', 'Model F1 Score Comparison', 'F1 Score'),
]
for ax, (metric_column, panel_title, y_label) in zip(axes, panels):
    ax.bar(results_df['Model'], results_df[metric_column], color=bar_colors)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_ylabel(y_label)
    ax.set_xlabel('Model')
    ax.grid(axis='y', alpha=0.3)
    # Print the metric value just above each bar
    for i, v in enumerate(results_df[metric_column]):
        ax.text(i, v + 0.005, f'{v:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig(comparison_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("\n Comparison plot saved to ../results/all_models_comparison.png")

# Find best model (first row, because the frame is sorted by accuracy descending)
best_model = results_df.iloc[0]
print(f"\n BEST MODEL: {best_model['Model']}")
print(f"   Accuracy: {best_model['Dev Accuracy']:.4f}")
print(f"   F1 Score: {best_model['Dev F1']:.4f}")

print("\n" + "="*60)
print("ALL TRAINING COMPLETE!")
print("="*60)
print(f"\nTotal models trained: {len(all_results)}")
print(f"Best performing model: {best_model['Model']}")
print("\nNext step: Run evaluation notebook with the best model!")

## Model Comparison

In [None]:
# Visualization: single-panel bar chart of dev accuracy per model on a fixed
# 0-1 axis, saved under ../results/plots.
accuracy_plot_path = '../results/plots/model_comparison.png'
# savefig does not create missing directories
os.makedirs(os.path.dirname(accuracy_plot_path), exist_ok=True)

# One colour per bar; cycling keeps this correct for any number of models
# (a two-colour list made the third bar repeat the first colour).
accuracy_palette = ['#3498db', '#2ecc71', '#e74c3c', '#f39c12']
accuracy_colors = [
    accuracy_palette[rank % len(accuracy_palette)] for rank in range(len(results_df))
]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(results_df['Model'], results_df['Dev Accuracy'], color=accuracy_colors)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Model Performance Comparison on ANLI R2', fontsize=14, fontweight='bold')
ax.set_ylim([0, 1])
ax.grid(axis='y', alpha=0.3)

# Print the accuracy value just above each bar
for i, v in enumerate(results_df['Dev Accuracy']):
    ax.text(i, v + 0.02, f'{v:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig(accuracy_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("✓ Comparison plot saved")