In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoImageProcessor, SwinForImageClassification, AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import torch.nn as nn
import torch.nn.functional as F
import re
import string
import numpy as np
import json

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
# Read the raw annotation table. The columns used below are 'image_path',
# 'extracted_text' and 'label 2'.
df = pd.read_csv(input_csv)

# Pair every row's resolved image location with its caption and raw label.
candidate_rows = zip(
    (os.path.join(image_dir, filename) for filename in df['image_path']),
    df['extracted_text'],
    df['label 2'],
)

# Keep only rows whose image file is present on disk. The label is shifted
# down by one (presumably 'label 2' is 1-based - verify against the dataset).
existing_data = [
    {
        'Image_path': resolved_path,
        'Captions': caption,
        'Label_Sentiment': raw_label - 1,
    }
    for resolved_path, caption, raw_label in candidate_rows
    if os.path.exists(resolved_path)
]

processed_df = pd.DataFrame(existing_data)

# ================================================
# ✅ 4️⃣ TEXT CLEANING
# ================================================
def clean_text(text):
    """Return *text* with URLs, HTML-like tags and ASCII punctuation removed.

    Missing values (anything ``pd.isna`` reports as missing) become the
    empty string. Runs of whitespace are collapsed to single spaces and
    leading/trailing whitespace is dropped.
    """
    if pd.isna(text):
        return ""

    # Drop http(s):// links and bare www. links first, while their punctuation
    # is still intact and the patterns can recognise them.
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove anything that looks like an <...> tag (non-greedy, single line).
    without_tags = re.sub(r'<.*?>', '', without_urls)

    # Delete every character listed in string.punctuation (ASCII only).
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_tags.translate(punctuation_table)

    # split() with no argument discards all whitespace runs, so joining on a
    # single space normalises the spacing.
    return " ".join(without_punctuation.split())

# ================================================
# ✅ 5️⃣ DATA SPLITS
# ================================================
# First split: 70% of the rows for training, 30% held out, stratified by label.
train_df, temp_df = train_test_split(
    processed_df,
    test_size=0.3,
    stratify=processed_df['Label_Sentiment'],
    random_state=42,
)
# Second split of the held-out 30%. The FIRST result (two thirds of it, about
# 20% of all rows) is the test set; the SECOND (one third, about 10%) is the
# validation set.
test_df, val_df = train_test_split(
    temp_df,
    test_size=1/3,
    stratify=temp_df['Label_Sentiment'],
    random_state=42,
)

# The split results are row subsets of processed_df. Take explicit copies so
# the column assignments below act on independent frames and do not trigger
# pandas' SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()
val_df = val_df.copy()

# Clean text and prepare data
for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    # Fill missing captions BEFORE the string cast: astype(str) on its own
    # turns NaN into the literal text "nan", which would then be tokenized as
    # if it were a real caption.
    df_['Captions'] = df_['Captions'].fillna('').astype(str).apply(clean_text)
    df_['label'] = df_['Label_Sentiment']
    df_.to_csv(f'/kaggle/working/{df_name}_multimodal.csv', index=False)

print(f"Train samples: {len(train_df)}, Val samples: {len(val_df)}, Test samples: {len(test_df)}")

# ================================================
# ✅ 6️⃣ DEVICE & MODEL SETUP
# ================================================
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')
print(f"Using device: {device}")

# Hugging Face checkpoint identifiers for the vision and text encoders.
swin_model_name = "microsoft/swin-base-patch4-window7-224"
xlm_model_name = "xlm-roberta-base"

# Preprocessors: the image processor produces 'pixel_values', the tokenizer
# produces 'input_ids' / 'attention_mask' (see MultimodalDataset).
image_processor = AutoImageProcessor.from_pretrained(swin_model_name)
tokenizer = AutoTokenizer.from_pretrained(xlm_model_name)

# The Swin checkpoint is loaded with a 3-way classification head, so the
# checkpoint's own head has a different shape; ignore_mismatched_sizes lets
# the load proceed with a freshly initialised head.
swin_head_options = {'num_labels': 3, 'ignore_mismatched_sizes': True}
swin_model = SwinForImageClassification.from_pretrained(swin_model_name, **swin_head_options)

# Plain XLM-RoBERTa encoder without a task head.
xlm_roberta_model = AutoModel.from_pretrained(xlm_model_name)

# ================================================
# ✅ 7️⃣ MULTIMODAL DATASET
# ================================================
class MultimodalDataset(Dataset):
    """Dataset yielding one preprocessed (image, caption, label) sample per row.

    Each row of ``df`` must provide the columns ``'Image_path'`` (path of the
    image file), ``'Captions'`` (caption text) and ``'label'`` (value
    convertible with ``int``).

    Args:
        df: Table of samples; rows are addressed positionally via ``iloc``.
        image_processor: Callable invoked as
            ``image_processor(image, return_tensors="pt")`` whose result
            exposes ``['pixel_values']``.
        tokenizer: Callable invoked with the caption text plus
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``
            whose result exposes ``['input_ids']`` and ``['attention_mask']``.
        max_length: Length every caption is padded/truncated to.
    """

    def __init__(self, df, image_processor, tokenizer, max_length=128):
        self.df = df
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the underlying table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, preprocess and return the sample at positional index ``idx``.

        Returns:
            dict with keys ``'pixel_values'``, ``'input_ids'``,
            ``'attention_mask'`` (tensors taken from the processors with the
            leading batch axis removed) and ``'label'`` (``torch.long`` scalar).
        """
        row = self.df.iloc[idx]

        # Load and process image. The context manager closes the underlying
        # file as soon as the RGB copy exists, instead of leaving the handle
        # open until garbage collection.
        with Image.open(row['Image_path']) as raw_image:
            image = raw_image.convert('RGB')
        image_inputs = self.image_processor(image, return_tensors="pt")
        # The processor returns a batch of one; drop that leading axis.
        pixel_values = image_inputs['pixel_values'].squeeze(0)

        # Process text: always pad/truncate to max_length so samples can be
        # stacked into a batch by the default collate function.
        text = str(row['Captions'])
        text_encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        label = int(row['label'])

        return {
            'pixel_values': pixel_values,
            'input_ids': text_encoding['input_ids'].flatten(),
            'attention_mask': text_encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# ================================================
# ✅ 8️⃣ MULTIMODAL FUSION MODEL
# ================================================
class MultimodalSentimentClassifier(nn.Module):
    """Image + text classifier that fuses a Swin backbone with a text encoder.

    Both encoders' pooled features are projected to 512 dimensions, fused, and
    passed through a small MLP head that outputs ``num_classes`` logits.

    Args:
        swin_model: Object exposing ``.swin`` (the vision backbone, called
            with ``pixel_values``) and ``.config.hidden_size``.
        xlm_roberta_model: Text encoder called with ``input_ids`` and
            ``attention_mask`` keywords, exposing ``.config.hidden_size``.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability used in the projections and head.
        fusion_method: ``'concat'`` (concatenate the two 512-d vectors),
            ``'add'`` (element-wise sum) or ``'attention'`` (softmax-weighted
            sum with weights predicted from the concatenated features).

    Raises:
        ValueError: If ``fusion_method`` is not one of the supported names.
    """

    # Fusion strategies understood by __init__ and forward.
    _FUSION_METHODS = ('concat', 'add', 'attention')

    def __init__(self, swin_model, xlm_roberta_model, num_classes=3, dropout_rate=0.3, fusion_method='concat'):
        super().__init__()

        # Fail early with a clear message; without this check an unknown name
        # only surfaces later as an unbound local variable.
        if fusion_method not in self._FUSION_METHODS:
            raise ValueError(
                f"Unknown fusion_method {fusion_method!r}; "
                f"expected one of {self._FUSION_METHODS}"
            )

        # Vision branch - extract features from Swin
        self.swin_backbone = swin_model.swin
        self.swin_feature_dim = swin_model.config.hidden_size

        # Text branch - extract features from XLM-RoBERTa
        self.xlm_roberta = xlm_roberta_model
        self.xlm_feature_dim = xlm_roberta_model.config.hidden_size

        # Fusion method
        self.fusion_method = fusion_method

        # Feature projections: bring both modalities to a common 512-d space.
        self.vision_projection = nn.Sequential(
            nn.Linear(self.swin_feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        self.text_projection = nn.Sequential(
            nn.Linear(self.xlm_feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        # Fusion layers: only concatenation doubles the fused width.
        if fusion_method == 'concat':
            fusion_dim = 512 * 2
        elif fusion_method == 'add':
            fusion_dim = 512
        else:  # 'attention' (validated above)
            fusion_dim = 512
            # Predicts two softmax-normalised weights (vision, text) per sample.
            self.attention_weights = nn.Sequential(
                nn.Linear(512 * 2, 512),
                nn.Tanh(),
                nn.Linear(512, 2),
                nn.Softmax(dim=-1)
            )

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(fusion_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise the Linear layers of the projections and the head.

        The attention sub-network (if present) keeps PyTorch's default
        initialisation; the pretrained encoders are left untouched.
        """
        for module in [self.vision_projection, self.text_projection, self.classifier]:
            for layer in module:
                if isinstance(layer, nn.Linear):
                    torch.nn.init.xavier_uniform_(layer.weight)
                    if layer.bias is not None:
                        torch.nn.init.zeros_(layer.bias)

    def forward(self, pixel_values, input_ids, attention_mask):
        """Return classification logits for a batch of image/text pairs.

        Args:
            pixel_values: Image batch passed to the vision backbone.
            input_ids: Token ids passed to the text encoder.
            attention_mask: Padding mask passed to the text encoder.

        Returns:
            Tensor of logits with ``num_classes`` entries per sample.

        Raises:
            ValueError: If ``self.fusion_method`` was changed to an
                unsupported value after construction.
        """
        # Vision feature extraction
        vision_outputs = self.swin_backbone(pixel_values)
        # Average over axis 1 of the last hidden state (global average pooling).
        vision_features = vision_outputs.last_hidden_state.mean(dim=1)
        vision_features = self.vision_projection(vision_features)

        # Text feature extraction
        text_outputs = self.xlm_roberta(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # First-position token, used as the [CLS]-style sequence summary.
        text_features = text_outputs.last_hidden_state[:, 0, :]
        text_features = self.text_projection(text_features)

        # Multimodal fusion
        if self.fusion_method == 'concat':
            fused_features = torch.cat([vision_features, text_features], dim=1)
        elif self.fusion_method == 'add':
            fused_features = vision_features + text_features
        elif self.fusion_method == 'attention':
            # Attention-based fusion: weights are predicted from both modalities.
            concat_features = torch.cat([vision_features, text_features], dim=1)
            attention_weights = self.attention_weights(concat_features)

            # Column 0 scales the vision vector, column 1 the text vector;
            # the 0:1 / 1:2 slices keep a trailing axis for broadcasting.
            weighted_vision = vision_features * attention_weights[:, 0:1]
            weighted_text = text_features * attention_weights[:, 1:2]
            fused_features = weighted_vision + weighted_text
        else:
            raise ValueError(
                f"Unknown fusion_method {self.fusion_method!r}; "
                f"expected one of {self._FUSION_METHODS}"
            )

        # Classification
        logits = self.classifier(fused_features)
        return logits

# ================================================
# ✅ 9️⃣ DATALOADERS
# ================================================
# Small batches: every sample carries both an image tensor and a token sequence.
batch_size = 8

# One dataset per split, all sharing the same image processor and tokenizer.
train_dataset, val_dataset, test_dataset = (
    MultimodalDataset(split_df, image_processor, tokenizer)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=batch_size)
    for eval_dataset in (val_dataset, test_dataset)
)

# ================================================
# ✅ 🔟 INITIALIZE MULTIMODAL MODEL
# ================================================
# Fusion strategy handed to the classifier: 'concat', 'add' or 'attention'.
# 'attention' is the author's pick (noted as best performing).
fusion_method = 'attention'

classifier_options = {
    'num_classes': 3,
    'dropout_rate': 0.3,
    'fusion_method': fusion_method,
}
model = MultimodalSentimentClassifier(swin_model, xlm_roberta_model, **classifier_options)
# Move all parameters to the selected device (Module.to returns the module).
model = model.to(device)

print(f"Using fusion method: {fusion_method}")
# Counts only parameters that will receive gradients.
print(f"Model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

# ================================================
# ✅ 1️⃣1️⃣ LOSS & OPTIMIZER
# ================================================
# Inverse-frequency class weights: total training rows divided by the number
# of rows in each class, ordered by class id so position i matches class i.
class_counts = train_df['label'].value_counts().sort_index()
total_samples = len(train_df)
class_weights = [total_samples / rows_in_class for rows_in_class in class_counts]
print(f"Class distribution: {class_counts.to_dict()}")
print(f"Class weights: {class_weights}")

# The weight tensor must live on the same device as the logits.
loss_weights = torch.FloatTensor(class_weights).to(device)
criterion = nn.CrossEntropyLoss(weight=loss_weights)

# Group parameters by component so each group can get its own learning rate.
vision_params = [*model.swin_backbone.parameters(), *model.vision_projection.parameters()]
text_params = [*model.xlm_roberta.parameters(), *model.text_projection.parameters()]
fusion_params = [*model.classifier.parameters()]

# The attention sub-network only exists for the 'attention' fusion method.
if fusion_method == 'attention':
    fusion_params.extend(model.attention_weights.parameters())

param_groups = [
    {'params': vision_params, 'lr': 1e-5},  # Lower LR for pre-trained vision
    {'params': text_params, 'lr': 2e-5},    # Lower LR for pre-trained text
    {'params': fusion_params, 'lr': 1e-4},  # Higher LR for fusion layers
]
optimizer = AdamW(param_groups, weight_decay=0.01)

# ================================================
# ✅ 1️⃣2️⃣ TRAINING LOOP
# ================================================
# Training schedule: at most num_epochs passes, stopping early once the
# validation F1 has failed to improve for `patience` consecutive epochs.
num_epochs = 15
patience = 5
patience_counter = 0
best_val_loss = float('inf')
# Start below the lowest possible F1 (0.0) so the first epoch always writes a
# checkpoint. Starting at 0 with a strict '>' would leave no checkpoint (or a
# stale one from an earlier run) if validation F1 never rose above 0.
best_val_f1 = -1.0

print(f"🚀 Starting multimodal training for {num_epochs} epochs...")
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")

for epoch in range(num_epochs):
    # ============================================================
    # TRAINING PHASE
    # ============================================================
    model.train()  # enable dropout
    total_train_loss = 0
    train_predictions = []
    train_labels = []

    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        logits = model(pixel_values, input_ids, attention_mask)
        loss = criterion(logits, labels)

        loss.backward()

        # Gradient clipping: cap the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        total_train_loss += loss.item()

        # Store predictions for metrics
        predictions = torch.argmax(logits, dim=1)
        train_predictions.extend(predictions.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    # Mean of the per-batch losses (not re-weighted by batch size).
    avg_train_loss = total_train_loss / len(train_loader)
    train_accuracy = accuracy_score(train_labels, train_predictions)
    # Index 2 of (precision, recall, f1, support) is the F1 score.
    train_f1 = precision_recall_fscore_support(train_labels, train_predictions, average='weighted')[2]

    # ============================================================
    # VALIDATION PHASE
    # ============================================================
    model.eval()  # disable dropout
    total_val_loss = 0
    val_predictions = []
    val_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(pixel_values, input_ids, attention_mask)
            loss = criterion(logits, labels)

            total_val_loss += loss.item()

            # Store predictions for metrics
            predictions = torch.argmax(logits, dim=1)
            val_predictions.extend(predictions.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_predictions)
    val_f1 = precision_recall_fscore_support(val_labels, val_predictions, average='weighted')[2]

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"  Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f} | Train F1: {train_f1:.4f}")
    print(f"  Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f} | Val F1: {val_f1:.4f}")

    # ============================================================
    # EARLY STOPPING CHECK (based on F1 score)
    # ============================================================
    if val_f1 > best_val_f1:
        # New best epoch: remember its metrics, reset patience and checkpoint.
        best_val_f1 = val_f1
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_multimodal_model.pt")
        print(f"✅ Validation F1 improved to {val_f1:.4f} — model saved.")
    else:
        patience_counter += 1
        print(f"⏰ No improvement — patience {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"🛑 Early stopping triggered at epoch {epoch+1}")
            break
    print("-" * 70)

# ================================================
# ✅ 1️⃣3️⃣ FINAL TEST EVALUATION
# ================================================
print("\n🔍 Loading best model for final evaluation...")
# map_location keeps the load working when the checkpoint was written from a
# different device than the one currently in use (e.g. GPU run, CPU reload).
model.load_state_dict(torch.load("best_multimodal_model.pt", map_location=device))
model.eval()

test_predictions = []
test_labels = []
total_test_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(pixel_values, input_ids, attention_mask)
        loss = criterion(logits, labels)

        total_test_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Display names for class ids 0, 1, 2 (in that order).
class_names = ['Negative', 'Neutral', 'Positive']
# Pass the full label set to every metric so the per-class arrays and the
# confusion matrix always have one entry per class, even if some class never
# appears in the test labels or predictions. Without it the arrays shrink,
# class_names[i] points at the wrong class, and classification_report raises
# on the target_names length mismatch.
class_ids = list(range(len(class_names)))

# Calculate comprehensive metrics
test_accuracy = accuracy_score(test_labels, test_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_predictions, labels=class_ids, average='weighted', zero_division=0
)
precision_per_class, recall_per_class, f1_per_class, support_per_class = precision_recall_fscore_support(
    test_labels, test_predictions, labels=class_ids, average=None, zero_division=0
)
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(test_labels, test_predictions, labels=class_ids)

print("\n" + "="*70)
print("📊 FINAL TEST RESULTS - MULTIMODAL (SWIN + XLM-RoBERTa)")
print("="*70)
print(f"Fusion Method: {fusion_method}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision (Weighted): {precision:.4f}")
print(f"Test Recall (Weighted): {recall:.4f}")
print(f"Test F1-Score (Weighted): {f1:.4f}")
print(f"Test Loss: {total_test_loss/len(test_loader):.4f}")

print(f"\nConfusion Matrix:")
print(cm)

print(f"\nPer-Class Metrics:")
print("-" * 40)
for i in class_ids:
    print(f"{class_names[i]} (Class {i}):")
    print(f"  Precision: {precision_per_class[i]:.4f}")
    print(f"  Recall: {recall_per_class[i]:.4f}")
    print(f"  F1-Score: {f1_per_class[i]:.4f}")
    print(f"  Support: {support_per_class[i]}")

print("\n📋 Detailed Classification Report:")
print(classification_report(
    test_labels, test_predictions, labels=class_ids, target_names=class_names, zero_division=0
))

# ================================================
# ✅ 1️⃣4️⃣ SAVE RESULTS
# ================================================
# Summary values derived once, then stored in the results record below.
mean_test_loss = total_test_loss/len(test_loader)
trainable_parameter_count = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Everything here must be JSON-serialisable, hence .tolist() on the arrays.
results = {
    'model': f'Multimodal (Swin + XLM-RoBERTa) - {fusion_method} fusion',
    'fusion_method': fusion_method,
    'test_accuracy': test_accuracy,
    'test_precision_weighted': precision,
    'test_recall_weighted': recall,
    'test_f1_weighted': f1,
    'test_loss': mean_test_loss,
    'confusion_matrix': cm.tolist(),
    'per_class_precision': precision_per_class.tolist(),
    'per_class_recall': recall_per_class.tolist(),
    'per_class_f1': f1_per_class.tolist(),
    'per_class_support': support_per_class.tolist(),
    'best_val_f1': best_val_f1,
    'total_parameters': trainable_parameter_count,
}

# Write the record as indented JSON into the Kaggle working directory.
with open('/kaggle/working/multimodal_results.json', 'w') as results_file:
    results_file.write(json.dumps(results, indent=2))

print(f"\n✅ Results saved to 'multimodal_results.json'")
print(f"🎯 Multimodal sentiment analysis completed!")
print(f"🚀 Best Validation F1: {best_val_f1:.4f}")
print(f"🏆 Final Test F1: {f1:.4f}")

# ================================================
# ✅ 1️⃣5️⃣ COMPARISON WITH INDIVIDUAL MODELS
# ================================================
# NOTE: the two single-modality ranges below are hard-coded text, not values
# measured by this script; only the multimodal F1 comes from the test
# evaluation (variable f1).
comparison_lines = (
    "\n" + "="*70,
    "📈 EXPECTED PERFORMANCE COMPARISON",
    "="*70,
    "🔸 Vision-only (Swin): F1 ≈ 0.65-0.75",
    "🔸 Text-only (XLM-RoBERTa): F1 ≈ 0.70-0.80",
    f"🔸 Multimodal (Combined): F1 = {f1:.4f}",
    "\n💡 The multimodal approach should outperform individual models",
    "   by leveraging complementary information from both modalities!",
)
# One line per entry, exactly as separate print() calls would emit them.
print(*comparison_lines, sep="\n")

2025-07-07 10:21:51.355892: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751883711.566940      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751883711.629501      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Train samples: 3156, Val samples: 451, Test samples: 902
Using device: cuda


preprocessor_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/352M [00:00<?, ?B/s]

Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-base-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 1024]) in the checkpoint and torch.Size([3, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Using fusion method: attention
Model parameters: 366,363,325
Class distribution: {0: 1404, 1: 1237, 2: 515}
Class weights: [2.247863247863248, 2.551333872271625, 6.128155339805825]
🚀 Starting multimodal training for 15 epochs...
Training samples: 3156
Validation samples: 451
Test samples: 902


Train Epoch 1: 100%|██████████| 395/395 [04:24<00:00,  1.49it/s]
Validation Epoch 1: 100%|██████████| 57/57 [00:24<00:00,  2.34it/s]


Epoch [1/15]
  Train Loss: 0.9953 | Train Acc: 0.5307 | Train F1: 0.5365
  Val Loss: 0.8886 | Val Acc: 0.6142 | Val F1: 0.6264
✅ Validation F1 improved to 0.6264 — model saved.
----------------------------------------------------------------------


Train Epoch 2: 100%|██████████| 395/395 [04:06<00:00,  1.60it/s]
Validation Epoch 2: 100%|██████████| 57/57 [00:22<00:00,  2.58it/s]


Epoch [2/15]
  Train Loss: 0.8248 | Train Acc: 0.6299 | Train F1: 0.6332
  Val Loss: 0.8499 | Val Acc: 0.6475 | Val F1: 0.6494
✅ Validation F1 improved to 0.6494 — model saved.
----------------------------------------------------------------------


Train Epoch 3: 100%|██████████| 395/395 [04:05<00:00,  1.61it/s]
Validation Epoch 3: 100%|██████████| 57/57 [00:21<00:00,  2.59it/s]


Epoch [3/15]
  Train Loss: 0.6934 | Train Acc: 0.7063 | Train F1: 0.7088
  Val Loss: 0.9332 | Val Acc: 0.6430 | Val F1: 0.6410
⏰ No improvement — patience 1/5
----------------------------------------------------------------------


Train Epoch 4: 100%|██████████| 395/395 [04:05<00:00,  1.61it/s]
Validation Epoch 4: 100%|██████████| 57/57 [00:22<00:00,  2.59it/s]


Epoch [4/15]
  Train Loss: 0.5737 | Train Acc: 0.7611 | Train F1: 0.7629
  Val Loss: 1.0701 | Val Acc: 0.6231 | Val F1: 0.6239
⏰ No improvement — patience 2/5
----------------------------------------------------------------------


Train Epoch 5: 100%|██████████| 395/395 [04:05<00:00,  1.61it/s]
Validation Epoch 5: 100%|██████████| 57/57 [00:22<00:00,  2.58it/s]


Epoch [5/15]
  Train Loss: 0.4557 | Train Acc: 0.8067 | Train F1: 0.8077
  Val Loss: 1.3137 | Val Acc: 0.6386 | Val F1: 0.6383
⏰ No improvement — patience 3/5
----------------------------------------------------------------------


Train Epoch 6: 100%|██████████| 395/395 [04:06<00:00,  1.60it/s]
Validation Epoch 6: 100%|██████████| 57/57 [00:22<00:00,  2.59it/s]


Epoch [6/15]
  Train Loss: 0.3581 | Train Acc: 0.8470 | Train F1: 0.8474
  Val Loss: 1.5210 | Val Acc: 0.6208 | Val F1: 0.6214
⏰ No improvement — patience 4/5
----------------------------------------------------------------------


Train Epoch 7: 100%|██████████| 395/395 [04:07<00:00,  1.60it/s]
Validation Epoch 7: 100%|██████████| 57/57 [00:22<00:00,  2.57it/s]


Epoch [7/15]
  Train Loss: 0.3189 | Train Acc: 0.8777 | Train F1: 0.8780
  Val Loss: 1.8070 | Val Acc: 0.6519 | Val F1: 0.6487
⏰ No improvement — patience 5/5
🛑 Early stopping triggered at epoch 7

🔍 Loading best model for final evaluation...


Final Test Evaluation: 100%|██████████| 113/113 [00:47<00:00,  2.37it/s]


📊 FINAL TEST RESULTS - MULTIMODAL (SWIN + XLM-RoBERTa)
Fusion Method: attention
Test Accuracy: 0.6774
Test Precision (Weighted): 0.6909
Test Recall (Weighted): 0.6774
Test F1-Score (Weighted): 0.6801
Test Loss: 0.7660

Confusion Matrix:
[[270 104  28]
 [ 52 256  45]
 [ 15  47  85]]

Per-Class Metrics:
----------------------------------------
Negative (Class 0):
  Precision: 0.8012
  Recall: 0.6716
  F1-Score: 0.7307
  Support: 402
Neutral (Class 1):
  Precision: 0.6290
  Recall: 0.7252
  F1-Score: 0.6737
  Support: 353
Positive (Class 2):
  Precision: 0.5380
  Recall: 0.5782
  F1-Score: 0.5574
  Support: 147

📋 Detailed Classification Report:
              precision    recall  f1-score   support

    Negative       0.80      0.67      0.73       402
     Neutral       0.63      0.73      0.67       353
    Positive       0.54      0.58      0.56       147

    accuracy                           0.68       902
   macro avg       0.66      0.66      0.65       902
weighted avg       0.69      0.68      0.68       902


