In [1]:
import pandas as pd
import numpy as np
import os
import json
from pathlib import Path
import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.utils.class_weight import compute_class_weight
import gc

2025-07-06 09:31:56.660585: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751794316.867501      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751794316.928563      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# --- Configuration ---
ROBERTA_MODEL_NAME = "roberta-base"

# --- Paths ---
OUTPUT_DIR = "./results"
LOGGING_DIR = "./logs"
INPUT_DATA_PATH = "/kaggle/input/northernth-sentiment-analysis-dataset"

# Create directories if they don't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGGING_DIR, exist_ok=True)

# --- Check GPU availability ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

def load_prepared_data():
    """Loads datasets, tokenizer, and label mappings from JSON files."""
    try:
        print("Loading prepared data...")
        train_dataset = load_from_disk(f"{INPUT_DATA_PATH}/train_tokenized_dataset")
        val_dataset = load_from_disk(f"{INPUT_DATA_PATH}/val_tokenized_dataset")
        test_dataset = load_from_disk(f"{INPUT_DATA_PATH}/test_tokenized_dataset")
        
        tokenizer = AutoTokenizer.from_pretrained(f"{INPUT_DATA_PATH}/tokenizer")
        
        with open(f"{INPUT_DATA_PATH}/label2id.json", "r") as f:
            label2id = json.load(f)
        with open(f"{INPUT_DATA_PATH}/id2label.json", "r") as f:
            id2label_str_keys = json.load(f)
            id2label = {int(k): v for k, v in id2label_str_keys.items()}
        
        num_labels = len(label2id)
        print(f"Number of sentiment labels: {num_labels}")
        
        return train_dataset, val_dataset, test_dataset, tokenizer, label2id, id2label, num_labels
        
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        print(f"Please check if all required files exist in '{INPUT_DATA_PATH}'")
        raise

def calculate_class_weights(train_dataset, device):
    """Calculates class weights from the training dataset to handle imbalance."""
    y_train = train_dataset['label']
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train),
        y=y_train
    )
    weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    print("\nCalculated class weights to handle imbalance:")
    print(weights_tensor)
    return weights_tensor

class CustomTrainer(Trainer):
    """Custom Trainer to apply class weights during loss computation."""
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(p):
    """Compute evaluation metrics."""
    predictions = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(p.label_ids, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

def get_optimal_batch_size():
    """Determine optimal batch size based on GPU memory."""
    if not torch.cuda.is_available():
        return 8, 16
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    if gpu_memory_gb >= 15:
        return 16, 32
    elif gpu_memory_gb >= 12:
        return 12, 24
    else:
        return 8, 16

def main():
    # Load data
    train_dataset, val_dataset, test_dataset, tokenizer, label2id, id2label, num_labels = load_prepared_data()
    
    # --- Data Integrity Check and Cleanup ---
    print("\n--- Running Data Integrity Check ---")
    original_train_size = len(train_dataset)
    train_dataset = train_dataset.filter(lambda example: example['label'] is not None, keep_in_memory=True)
    if len(train_dataset) < original_train_size:
        print(f"Cleaned training data: Removed {original_train_size - len(train_dataset)} rows with invalid labels.")

    original_val_size = len(val_dataset)
    val_dataset = val_dataset.filter(lambda example: example['label'] is not None, keep_in_memory=True)
    if len(val_dataset) < original_val_size:
        print(f"Cleaned validation data: Removed {original_val_size - len(val_dataset)} rows with invalid labels.")
    print("--- Data Integrity Check Complete ---")

    # --- Post-cleanup Validation ---
    if len(train_dataset) == 0:
        raise ValueError("The training dataset is empty after filtering for valid labels.")

    # Get batch sizes and calculate class weights
    train_batch_size, eval_batch_size = get_optimal_batch_size()
    class_weights = calculate_class_weights(train_dataset, device)
    
    # Calculate warmup steps
    total_steps = len(train_dataset) // train_batch_size * 3
    warmup_steps = int(0.1 * total_steps)
    
    # Initialize Model
    model = AutoModelForSequenceClassification.from_pretrained(
        ROBERTA_MODEL_NAME, 
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )
    
    # Set Training Arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        seed=42,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_steps=warmup_steps,
        eval_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        logging_dir=LOGGING_DIR,
        logging_steps=100,
        report_to=["tensorboard"],
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,
    )
    
    # Create Trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        class_weights=class_weights
    )
    
    # Train
    print("\n" + "="*50 + "\nStarting model training...\n" + "="*50)
    trainer.train()
    print("\nModel training completed successfully!")
    
    # --- Evaluate on Test Set ---
    print("\n" + "="*50 + "\nEvaluating model on the test set...\n" + "="*50)
    
    # --- HOTFIX: Tokenize the test set on the fly ---
    # This is necessary because the test set was not correctly processed in the previous script.
    print("Applying on-the-fly tokenization to the test set...")
    def tokenize_function(examples):
        # The test set from the previous script still has 'cleaned_text'
        return tokenizer(examples["cleaned_text"], truncation=True, padding="max_length", max_length=512)

    # Make sure the test_dataset has the 'label' column.
    if 'labels_id' in test_dataset.column_names:
        test_dataset = test_dataset.rename_column('labels_id', 'label')

    tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, keep_in_memory=True)
    
    # Remove all columns that are not expected by the model
    model_input_columns = tokenizer.model_input_names + ["label"]
    columns_to_remove = [col for col in tokenized_test_dataset.column_names if col not in model_input_columns]
    tokenized_test_dataset = tokenized_test_dataset.remove_columns(columns_to_remove)
    print("On-the-fly tokenization complete.")
    # --- END HOTFIX ---
    
    # Use the newly tokenized dataset for evaluation and prediction
    test_results = trainer.evaluate(tokenized_test_dataset)
    print("\nTest set evaluation results:")
    for key, value in test_results.items():
        print(f"  {key}: {value:.4f}")
    
    # Detailed report
    test_predictions = trainer.predict(tokenized_test_dataset)
    test_pred_labels_ids = np.argmax(test_predictions.predictions, axis=1)
    target_names = [id2label[i] for i in range(num_labels)]
    print("\nDetailed Classification Report (Test Set):")
    print(classification_report(test_predictions.label_ids, test_pred_labels_ids, target_names=target_names, zero_division=0))
    
    # Save final model
    final_model_path = f"{OUTPUT_DIR}/final_model"
    trainer.save_model(final_model_path)
    
    print(f"\nFinal model and configurations saved to: {final_model_path}")
    
    # Cleanup
    del model, trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    print("\n" + "="*50 + "\nTraining pipeline completed!\n" + "="*50)


Using device: cuda
GPU: Tesla T4


In [3]:
if __name__ == "__main__":
    main()

Loading prepared data...
Number of sentiment labels: 5

--- Running Data Integrity Check ---


Filter:   0%|          | 0/15501 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2214 [00:00<?, ? examples/s]

--- Data Integrity Check Complete ---

Calculated class weights to handle imbalance:
tensor([1.0001, 1.0001, 0.9997, 1.0001, 1.0001], device='cuda:0')


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting model training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.9433,0.928648,0.594851,0.597557,0.625518,0.594851
1000,0.8021,0.934724,0.610208,0.597529,0.603083,0.610208
1500,0.7124,0.913779,0.633695,0.636261,0.642827,0.633695



Model training completed successfully!

Evaluating model on the test set...
Applying on-the-fly tokenization to the test set...


Map:   0%|          | 0/4430 [00:00<?, ? examples/s]

On-the-fly tokenization complete.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Test set evaluation results:
  eval_loss: 0.8573
  eval_accuracy: 0.6391
  eval_f1: 0.6409
  eval_precision: 0.6457
  eval_recall: 0.6391
  eval_runtime: 72.9581
  eval_samples_per_second: 60.7200
  eval_steps_per_second: 1.2750
  epoch: 3.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Detailed Classification Report (Test Set):
               precision    recall  f1-score   support

Very Negative       0.76      0.68      0.72       886
     Negative       0.55      0.55      0.55       886
      Neutral       0.60      0.60      0.60       886
     Positive       0.57      0.67      0.62       886
Very Positive       0.75      0.69      0.72       886

     accuracy                           0.64      4430
    macro avg       0.65      0.64      0.64      4430
 weighted avg       0.65      0.64      0.64      4430


Final model and configurations saved to: ./results/final_model

Training pipeline completed!
