In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string
import numpy as np

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
# Directory holding the image files; each row's 'image_path' value from the
# CSV is joined onto this directory to build the full image path.
image_dir = "/kaggle/input/basem/images"
# Annotation table that is read with pandas in the next section.
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
# Read the annotation table. The columns consumed below are 'image_path',
# 'extracted_text' and 'label 2'.
df = pd.read_csv(input_csv)

# Resolve every file name against image_dir in one pass over the column.
# DataFrame.iterrows() is avoided on purpose: it materialises a Series per row
# (slow) and may upcast values to a common dtype while doing so.
full_image_paths = [os.path.join(image_dir, name) for name in df['image_path']]

# Keep only the rows whose image is present on disk. 'label 2' is shifted down
# by one before being stored as 'Label_Sentiment'.
existing_data = [
    {
        'Image_path': full_image_path,
        'Captions': caption,
        'Label_Sentiment': raw_label - 1,
    }
    for full_image_path, caption, raw_label in zip(
        full_image_paths, df['extracted_text'], df['label 2']
    )
    if os.path.exists(full_image_path)
]

# Naming the columns explicitly keeps the schema stable even when the filter
# above removes every row.
processed_df = pd.DataFrame(
    existing_data, columns=['Image_path', 'Captions', 'Label_Sentiment']
)

# Fail here with an actionable message instead of letting the stratified split
# further down fail on an empty table.
if processed_df.empty:
    raise FileNotFoundError(
        f"None of the {len(df)} rows in {input_csv} reference an image that "
        f"exists under {image_dir}"
    )

# ================================================
# ✅ 4️⃣ TEXT CLEANING
# ================================================
# Built once at import time; the function below is applied to every caption,
# so rebuilding these per call would be wasted work.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_PATTERN = re.compile(r'<.*?>')
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Return *text* with URLs, HTML-like tags and ASCII punctuation removed.

    Args:
        text: The raw caption. Missing values (``None`` / NaN) are accepted,
            and any other non-string scalar is converted with ``str()`` first
            so numeric cells do not raise inside the regex calls.

    Returns:
        The cleaned string with runs of whitespace collapsed to single spaces
        and no leading/trailing whitespace; ``""`` for missing input.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)
    text = _URL_PATTERN.sub('', text)
    text = _HTML_TAG_PATTERN.sub('', text)
    # string.punctuation only covers ASCII punctuation characters.
    text = text.translate(_STRIP_PUNCTUATION)
    return " ".join(text.split())

# 70% of the rows go to training; the remaining 30% are held out. Both splits
# are stratified on the label and seeded for reproducibility.
train_df, temp_df = train_test_split(processed_df, test_size=0.3, stratify=processed_df['Label_Sentiment'], random_state=42)
# train_test_split returns (remainder, test_size part): two thirds of the
# held-out rows become the test set and one third the validation set.
test_df, val_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['Label_Sentiment'], random_state=42)

# Explicit copies, so the column assignments below always act on independent
# frames rather than on something derived from processed_df.
train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()

for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    # Missing captions must be blanked BEFORE the str cast: astype(str) turns
    # NaN into the literal text "nan", which would otherwise be kept as if it
    # were a real caption.
    df_['Captions'] = df_['Captions'].fillna('').astype(str).apply(clean_text)
    df_['label'] = df_['Label_Sentiment']
    df_.to_csv(f'/kaggle/working/{df_name}_cleaned.csv', index=False)

# ================================================
# ✅ 5️⃣ LOAD XLM-RoBERTa MODEL
# ================================================
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# The tokenizer and the encoder are both loaded from the same pretrained
# checkpoint name; the encoder is then moved to the selected device.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
xlm_roberta_model = AutoModel.from_pretrained(model_name)
xlm_roberta_model = xlm_roberta_model.to(device)

# ================================================
# ✅ 6️⃣ TEXT-ONLY DATASET
# ================================================
class TextOnlyDataset(Dataset):
    """Dataset that serves one tokenized caption and its label per index.

    Args:
        df: Table read positionally through its 'Captions' and 'label'
            columns.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.df, self.tokenizer, self.max_length = df, tokenizer, max_length

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.df.index)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label'."""
        record = self.df.iloc[idx]
        caption = str(record['Captions'])
        target = int(record['label'])

        tokenized = self.tokenizer(
            caption,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output is collapsed to one dimension per field so the
        # DataLoader can stack the samples along a new batch axis.
        sample = {
            field: tokenized[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# ================================================
# ✅ 7️⃣ DATALOADERS
# ================================================
# Mini-batch size shared by all three loaders.
batch_size = 16

# One dataset per split, all encoded with the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    TextOnlyDataset(split_frame, tokenizer)
    for split_frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test use the DataLoader's
# default ordering.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = (
    DataLoader(held_out, batch_size=batch_size)
    for held_out in (val_dataset, test_dataset)
)

# ================================================
# ✅ 8️⃣ SENTIMENT CLASSIFIER MODEL
# ================================================
class XLMRobertaSentimentClassifier(torch.nn.Module):
    """Encoder followed by dropout and a single linear classification layer.

    Args:
        xlm_roberta_model: Encoder called with ``input_ids`` and
            ``attention_mask`` keywords; it must expose
            ``config.hidden_size`` and return an object that has a
            ``last_hidden_state`` attribute.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability applied before the linear layer.
    """

    def __init__(self, xlm_roberta_model, num_classes=3, dropout_rate=0.3):
        super().__init__()
        hidden_dim = xlm_roberta_model.config.hidden_size
        self.xlm_roberta = xlm_roberta_model
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.classifier = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids."""
        encoded = self.xlm_roberta(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 serves as the summary of
        # the whole input.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token_state))

# ================================================
# ✅ 9️⃣ INITIALIZE MODEL
# ================================================
# Wrap the pretrained encoder with the classification head (default
# num_classes and dropout_rate), then place the whole module on the device.
model = XLMRobertaSentimentClassifier(xlm_roberta_model)
model = model.to(device)

# ================================================
# ✅ 🔟 LOSS & OPTIMIZER
# ================================================
# Calculate class weights for balanced training
# Inverse-frequency class weights: every class gets total / count, so the
# less frequent a class is in the training split, the larger its weight.
total_samples = len(train_df)
class_counts = train_df['label'].value_counts().sort_index()
class_weights = [total_samples / occurrences for occurrences in class_counts.tolist()]
print(f"Class distribution: {class_counts.to_dict()}")
print(f"Class weights: {class_weights}")

# The weights are handed to the loss as a float32 tensor on the model's
# device; AdamW then optimises every parameter of the model.
weight_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)
criterion = torch.nn.CrossEntropyLoss(weight=weight_tensor)
optimizer = AdamW(model.parameters(), weight_decay=0.01, lr=2e-5)

# ================================================
# ✅ 1️⃣1️⃣ TRAINING LOOP
# ================================================
# Upper bound on the number of passes over the training loader.
num_epochs = 10
# Early stopping: training ends after `patience` consecutive epochs in which
# the validation loss did not drop below the best value seen so far.
patience = 3
patience_counter = 0
# Starts at +inf so the first epoch always counts as an improvement.
best_val_loss = float('inf')

print(f"Starting training for {num_epochs} epochs...")
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")

for epoch in range(num_epochs):
    # ============================================================
    # TRAINING PHASE
    # ============================================================
    # Training mode: the classifier's dropout layer is active.
    model.train()
    total_train_loss = 0
    train_predictions = []
    train_labels = []

    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        logits = model(input_ids, attention_mask)
        # criterion is the class-weighted cross-entropy defined above.
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Store predictions for metrics
        # (taken from the training-mode forward pass, i.e. with dropout on).
        predictions = torch.argmax(logits, dim=1)
        train_predictions.extend(predictions.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    # len(train_loader) is the number of batches, so this is the mean of the
    # per-batch losses rather than a per-sample mean.
    avg_train_loss = total_train_loss / len(train_loader)
    train_accuracy = accuracy_score(train_labels, train_predictions)

    # ============================================================
    # VALIDATION PHASE
    # ============================================================
    # Evaluation mode: dropout is disabled for the validation pass.
    model.eval()
    total_val_loss = 0
    val_predictions = []
    val_labels = []

    # No gradients are needed while scoring the validation split.
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            total_val_loss += loss.item()

            # Store predictions for metrics
            predictions = torch.argmax(logits, dim=1)
            val_predictions.extend(predictions.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    # Mean of the per-batch validation losses; this value drives both the
    # checkpointing and the early-stopping decision below.
    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_predictions)

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"  Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f}")
    print(f"  Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    # ============================================================
    # EARLY STOPPING CHECK
    # ============================================================
    # Only a strict improvement resets the counter and overwrites the
    # checkpoint, so the file always holds the lowest-validation-loss weights.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Relative path: the checkpoint is written to the current working
        # directory.
        torch.save(model.state_dict(), "best_xlm_roberta_model.pt")
        print("✅ Validation loss improved — model saved.")
    else:
        patience_counter += 1
        print(f"⏰ No improvement — patience {patience_counter}/{patience}")

        # Leave the epoch loop once the patience budget is used up.
        if patience_counter >= patience:
            print(f"🛑 Early stopping triggered at epoch {epoch+1}")
            break

# ================================================
# ✅ 1️⃣2️⃣ FINAL TEST EVALUATION
# ================================================
print("\n🔍 Loading best model for final evaluation...")
# Restore the weights written by the training loop at its lowest validation
# loss. map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU still loads when the session only has a CPU.
model.load_state_dict(torch.load("best_xlm_roberta_model.pt", map_location=device))
# Evaluation mode: dropout is disabled for the test pass.
model.eval()

test_predictions = []
test_labels = []
total_test_loss = 0

# Gradients are not needed for scoring the test split.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        # Sum of per-batch losses; divided by the batch count when reported.
        total_test_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Per-class scores: with average=None each returned array is indexed by class
# in the report printed further down.
per_class_scores = precision_recall_fscore_support(
    test_labels, test_predictions, average=None
)
precision_per_class, recall_per_class, f1_per_class, support_per_class = per_class_scores

# Support-weighted aggregates; the fourth returned element is not used.
weighted_scores = precision_recall_fscore_support(test_labels, test_predictions, average='weighted')
precision, recall, f1 = weighted_scores[:3]

# Overall accuracy and the confusion matrix of true vs. predicted labels.
test_accuracy = accuracy_score(test_labels, test_predictions)
cm = confusion_matrix(test_labels, test_predictions)

# Headline numbers, one per line. The test loss is the summed per-batch loss
# divided by the number of test batches.
print(
    "\n📊 FINAL TEST RESULTS:",
    "="*50,
    f"Test Accuracy: {test_accuracy:.4f}",
    f"Test Precision (Weighted): {precision:.4f}",
    f"Test Recall (Weighted): {recall:.4f}",
    f"Test F1-Score (Weighted): {f1:.4f}",
    f"Test Loss: {total_test_loss/len(test_loader):.4f}",
    sep="\n",
)

print("\nConfusion Matrix:")
print(cm)

# One four-line entry per class, walking the per-class arrays in lockstep.
print("\nPer-Class Metrics:")
print("-" * 30)
per_class_rows = zip(precision_per_class, recall_per_class, f1_per_class, support_per_class)
for i, (class_precision, class_recall, class_f1, class_support) in enumerate(per_class_rows):
    print(f"Class {i}:")
    print(f"  Precision: {class_precision:.4f}")
    print(f"  Recall: {class_recall:.4f}")
    print(f"  F1-Score: {class_f1:.4f}")
    print(f"  Support: {class_support}")

import json

# Gather everything that was reported into one mapping. The NumPy arrays are
# converted to plain lists with .tolist() before serialisation.
mean_test_loss = total_test_loss/len(test_loader)
results = {
    'model': 'XLM-RoBERTa (Text Only)',
    'test_accuracy': test_accuracy,
    'test_precision_weighted': precision,
    'test_recall_weighted': recall,
    'test_f1_weighted': f1,
    'test_loss': mean_test_loss,
    'confusion_matrix': cm.tolist(),
    'per_class_precision': precision_per_class.tolist(),
    'per_class_recall': recall_per_class.tolist(),
    'per_class_f1': f1_per_class.tolist(),
    'per_class_support': support_per_class.tolist()
}

# Serialise first, then write the text in a single call.
serialized_results = json.dumps(results, indent=2)
with open('/kaggle/working/xlm_roberta_results.json', 'w') as results_file:
    results_file.write(serialized_results)

print("\n✅ Results saved to 'xlm_roberta_results.json'")
print("🎯 Text-only sentiment analysis with XLM-RoBERTa completed!")

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2025-07-07 07:27:09.923652: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751873230.156720      18 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751873230.225278      18 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Class distribution: {0: 1404, 1: 1237, 2: 515}
Class weights: [2.247863247863248, 2.551333872271625, 6.128155339805825]
Starting training for 10 epochs...
Training samples: 3156
Validation samples: 451
Test samples: 902


Train Epoch 1: 100%|██████████| 198/198 [00:47<00:00,  4.19it/s]
Validation Epoch 1: 100%|██████████| 29/29 [00:01<00:00, 17.11it/s]


Epoch [1/10]
  Train Loss: 1.1471 | Train Acc: 0.3710
  Val Loss: 1.0114 | Val Acc: 0.4789
✅ Validation loss improved — model saved.


Train Epoch 2: 100%|██████████| 198/198 [00:46<00:00,  4.27it/s]
Validation Epoch 2: 100%|██████████| 29/29 [00:01<00:00, 17.24it/s]


Epoch [2/10]
  Train Loss: 1.0785 | Train Acc: 0.4335
  Val Loss: 0.9522 | Val Acc: 0.5543
✅ Validation loss improved — model saved.


Train Epoch 3: 100%|██████████| 198/198 [00:46<00:00,  4.28it/s]
Validation Epoch 3: 100%|██████████| 29/29 [00:01<00:00, 17.27it/s]


Epoch [3/10]
  Train Loss: 0.9397 | Train Acc: 0.5691
  Val Loss: 0.8743 | Val Acc: 0.6208
✅ Validation loss improved — model saved.


Train Epoch 4: 100%|██████████| 198/198 [00:46<00:00,  4.28it/s]
Validation Epoch 4: 100%|██████████| 29/29 [00:01<00:00, 17.23it/s]


Epoch [4/10]
  Train Loss: 0.8257 | Train Acc: 0.6572
  Val Loss: 0.7314 | Val Acc: 0.6741
✅ Validation loss improved — model saved.


Train Epoch 5: 100%|██████████| 198/198 [00:46<00:00,  4.28it/s]
Validation Epoch 5: 100%|██████████| 29/29 [00:01<00:00, 17.31it/s]


Epoch [5/10]
  Train Loss: 0.7181 | Train Acc: 0.7148
  Val Loss: 0.7528 | Val Acc: 0.7073
⏰ No improvement — patience 1/3


Train Epoch 6: 100%|██████████| 198/198 [00:46<00:00,  4.28it/s]
Validation Epoch 6: 100%|██████████| 29/29 [00:01<00:00, 17.30it/s]


Epoch [6/10]
  Train Loss: 0.6271 | Train Acc: 0.7548
  Val Loss: 0.9693 | Val Acc: 0.6674
⏰ No improvement — patience 2/3


Train Epoch 7: 100%|██████████| 198/198 [00:46<00:00,  4.28it/s]
Validation Epoch 7: 100%|██████████| 29/29 [00:01<00:00, 17.23it/s]


Epoch [7/10]
  Train Loss: 0.5449 | Train Acc: 0.7864
  Val Loss: 0.7613 | Val Acc: 0.6984
⏰ No improvement — patience 3/3
🛑 Early stopping triggered at epoch 7

🔍 Loading best model for final evaluation...


Final Test Evaluation: 100%|██████████| 57/57 [00:03<00:00, 16.90it/s]


📊 FINAL TEST RESULTS:
Test Accuracy: 0.7073
Test Precision (Weighted): 0.7185
Test Recall (Weighted): 0.7073
Test F1-Score (Weighted): 0.7110
Test Loss: 0.7377

Confusion Matrix:
[[310  61  31]
 [ 56 234  63]
 [ 22  31  94]]

Per-Class Metrics:
------------------------------
Class 0:
  Precision: 0.7990
  Recall: 0.7711
  F1-Score: 0.7848
  Support: 402
Class 1:
  Precision: 0.7178
  Recall: 0.6629
  F1-Score: 0.6892
  Support: 353
Class 2:
  Precision: 0.5000
  Recall: 0.6395
  F1-Score: 0.5612
  Support: 147

✅ Results saved to 'xlm_roberta_results.json'
🎯 Text-only sentiment analysis with XLM-RoBERTa completed!



