In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from pathlib import Path
import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    AutoTokenizer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import gc

# --- Configuration ---
ROBERTA_MODEL_NAME = "roberta-base"

# --- Kaggle Specific Paths ---
OUTPUT_DIR = "/kaggle/working/results"
LOGGING_DIR = "/kaggle/working/logs"
KAGGLE_INPUT_PATH = "/kaggle/input/northernth-sentiment-analysis-dataset"

# Create directories if they don't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGGING_DIR, exist_ok=True)

# --- Check GPU availability ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

def load_data_safely():
    """Load datasets and tokenizer with error handling"""
    try:
        print("Loading prepared data for RoBERTa training...")
        
        # Load datasets
        train_dataset = load_from_disk(f"{KAGGLE_INPUT_PATH}/train_tokenized_dataset")
        val_dataset = load_from_disk(f"{KAGGLE_INPUT_PATH}/val_tokenized_dataset")
        test_dataset = load_from_disk(f"{KAGGLE_INPUT_PATH}/test_tokenized_dataset")
        
        print(f"Train dataset size: {len(train_dataset)}")
        print(f"Validation dataset size: {len(val_dataset)}")
        print(f"Test dataset size: {len(test_dataset)}")
        
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(f"{KAGGLE_INPUT_PATH}/tokenizer")
        
        # Load label encoder
        with open(f"{KAGGLE_INPUT_PATH}/label_encoder.pkl", "rb") as f:
            label_encoder = pickle.load(f)
        
        num_labels = len(label_encoder.classes_)
        print(f"Number of sentiment labels: {num_labels}")
        print(f"Label classes: {label_encoder.classes_}")
        
        return train_dataset, val_dataset, test_dataset, tokenizer, label_encoder, num_labels
        
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        print("Please check if all required files exist in the input directory:")
        print("- train_tokenized_dataset/")
        print("- val_tokenized_dataset/")  
        print("- test_tokenized_dataset/")
        print("- tokenizer/")
        print("- label_encoder.pkl")
        raise

def compute_metrics(p):
    """Compute evaluation metrics during training"""
    predictions = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(p.label_ids, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def get_optimal_batch_size():
    """Determine optimal batch size based on available GPU memory"""
    if not torch.cuda.is_available():
        return 8, 16  # train_batch_size, eval_batch_size
    
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    
    if gpu_memory_gb >= 15:  # P100 or better
        return 16, 32
    elif gpu_memory_gb >= 12:  # T4
        return 12, 24
    else:  # Smaller GPU
        return 8, 16

def main():
    # Load data
    train_dataset, val_dataset, test_dataset, tokenizer, label_encoder, num_labels = load_data_safely()
    
    # Get optimal batch sizes
    train_batch_size, eval_batch_size = get_optimal_batch_size()
    print(f"Using batch sizes - Train: {train_batch_size}, Eval: {eval_batch_size}")
    
    # Calculate warmup steps (10% of total training steps)
    total_steps = len(train_dataset) // train_batch_size * 3  # 3 epochs
    warmup_steps = int(0.1 * total_steps)
    
    print(f"Total training steps: {total_steps}")
    print(f"Warmup steps: {warmup_steps}")
    
    # --- Initialize Model ---
    print(f"Loading pre-trained RoBERTa model: {ROBERTA_MODEL_NAME}...")
    
    try:
        model = AutoModelForSequenceClassification.from_pretrained(
            ROBERTA_MODEL_NAME, 
            num_labels=num_labels,
            problem_type="single_label_classification"
        )
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise
    
    # --- Set Training Arguments --- (FIXED)
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        learning_rate=2e-5,  # Standard learning rate for RoBERTa
        
        # Logging and evaluation (FIXED: evaluation_strategy -> eval_strategy)
        logging_dir=LOGGING_DIR,
        logging_steps=50,
        eval_strategy="steps",  # Changed from evaluation_strategy
        eval_steps=200,  # Evaluate more frequently
        
        # Saving strategy
        save_strategy="steps",
        save_steps=200,
        save_total_limit=2,  # Keep only 2 best models to save space
        
        # Best model selection
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        
        # Performance optimizations
        dataloader_num_workers=2,
        fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
        gradient_accumulation_steps=1,
        
        # Reproducibility
        seed=42,
        
        # Reporting
        report_to=["tensorboard"],
        
        # Memory optimization
        dataloader_pin_memory=torch.cuda.is_available(),
        remove_unused_columns=False,
    )
    
    # --- Create Trainer with Early Stopping ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    
    # --- Train the Model ---
    print("\n" + "="*50)
    print("Starting model training...")
    print("="*50)
    
    try:
        # Clear cache before training
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        
        # Train the model
        trainer.train()
        print("\nModel training completed successfully!")
        
    except Exception as e:
        print(f"Error during training: {str(e)}")
        # Save current state in case of error
        trainer.save_model(f"{OUTPUT_DIR}/checkpoint_error")
        raise
    
    # --- Evaluate on Test Set ---
    print("\n" + "="*50)
    print("Evaluating model on the test set...")
    print("="*50)
    
    try:
        test_results = trainer.evaluate(test_dataset)
        print("\nTest set evaluation results:")
        for key, value in test_results.items():
            if isinstance(value, float):
                print(f"{key}: {value:.4f}")
            else:
                print(f"{key}: {value}")
        
        # Get detailed classification report
        test_predictions = trainer.predict(test_dataset)
        test_pred_labels = np.argmax(test_predictions.predictions, axis=1)
        test_true_labels = test_predictions.label_ids
        
        # Convert back to original labels
        test_pred_names = label_encoder.inverse_transform(test_pred_labels)
        test_true_names = label_encoder.inverse_transform(test_true_labels)
        
        print("\nDetailed Classification Report:")
        print(classification_report(test_true_names, test_pred_names))
        
    except Exception as e:
        print(f"Error during evaluation: {str(e)}")
    
    # --- Save Final Model ---
    try:
        final_model_path = f"{OUTPUT_DIR}/final_model"
        trainer.save_model(final_model_path)
        tokenizer.save_pretrained(final_model_path)
        
        # Save label encoder with the model
        with open(f"{final_model_path}/label_encoder.pkl", "wb") as f:
            pickle.dump(label_encoder, f)
        
        print(f"\nFinal model saved to: {final_model_path}")
        print("Model files saved:")
        print("- pytorch_model.bin")
        print("- config.json")
        print("- tokenizer files")
        print("- label_encoder.pkl")
        
    except Exception as e:
        print(f"Error saving final model: {str(e)}")
    
    # --- Memory cleanup ---
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    
    print("\n" + "="*50)
    print("Training pipeline completed!")
    print("="*50)

if __name__ == "__main__":
    main()

2025-06-20 03:55:04.507206: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750391704.707396      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750391704.769219      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda
GPU: Tesla T4
GPU Memory: 14.7 GB
Loading prepared data for RoBERTa training...
Train dataset size: 37039
Validation dataset size: 5291
Test dataset size: 10583
Number of sentiment labels: 3
Label classes: ['negative' 'neutral' 'positive']
Using batch sizes - Train: 12, Eval: 24
Total training steps: 9258
Warmup steps: 925
Loading pre-trained RoBERTa model: roberta-base...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Model loaded successfully!

Starting model training...




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
200,0.744,0.634617,0.691363,0.629306,0.664278,0.691363
400,0.5096,0.486199,0.803818,0.803866,0.804021,0.803818
600,0.4643,0.490482,0.805519,0.802626,0.806317,0.805519
800,0.4929,0.492716,0.802684,0.792919,0.796779,0.802684





Model training completed successfully!

Evaluating model on the test set...





Test set evaluation results:
eval_loss: 0.4750
eval_accuracy: 0.8032
eval_f1: 0.8032
eval_precision: 0.8031
eval_recall: 0.8032
eval_runtime: 153.3904
eval_samples_per_second: 68.9940
eval_steps_per_second: 1.4410
epoch: 0.5181





Detailed Classification Report:
              precision    recall  f1-score   support

    negative       0.74      0.74      0.74      1835
     neutral       0.69      0.69      0.69      3242
    positive       0.89      0.89      0.89      5506

    accuracy                           0.80     10583
   macro avg       0.77      0.77      0.77     10583
weighted avg       0.80      0.80      0.80     10583


Final model saved to: /kaggle/working/results/final_model
Model files saved:
- pytorch_model.bin
- config.json
- tokenizer files
- label_encoder.pkl

Training pipeline completed!
