In [1]:
# ================================================
# ✅ MULTIMODAL BENGALI SENTIMENT ANALYSIS
# CLIP (Vision) + XLM-RoBERTa (Text) 
# ================================================

import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string
import numpy as np
import json

# ================================================
# ✅ 1️⃣ PATHS & SETUP
# ================================================
# Dataset locations on the Kaggle input volume.
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f"Using device: {device}")

# ================================================
# ✅ 2️⃣ LOAD & PREPROCESS DATA
# ================================================
# Read the annotation table (columns used below: 'image_path',
# 'extracted_text' and 'label 2').
df = pd.read_csv(input_csv)

# Keep only the rows whose image file is present under image_dir, and shift
# the 'label 2' value down by one so that class ids start at 0.
existing_data = [
    {
        'Image_path': candidate_path,
        'Captions': caption,
        'Label_Sentiment': raw_label - 1,
    }
    for candidate_path, caption, raw_label in zip(
        (os.path.join(image_dir, name) for name in df['image_path']),
        df['extracted_text'],
        df['label 2'],
    )
    if os.path.exists(candidate_path)
]

processed_df = pd.DataFrame(existing_data)
print(f"Total samples with existing images: {len(processed_df)}")

# ================================================
# ✅ 3️⃣ TEXT CLEANING FUNCTION
# ================================================
# Patterns and the translation table are built once, not on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_PATTERN = re.compile(r'<.*?>')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a caption for tokenisation.

    Missing values (anything ``pd.isna`` reports as NA) become ``""``.
    Otherwise the value is converted to ``str`` and, in this order, URLs are
    removed, ``<...>`` tags are removed, every character listed in
    ``string.punctuation`` is dropped, and runs of whitespace are collapsed
    to single spaces with leading/trailing whitespace trimmed.

    Note that ``string.punctuation`` contains ASCII punctuation only, so
    non-ASCII punctuation marks are left in place.
    """
    if pd.isna(text):
        return ""

    without_urls = _URL_PATTERN.sub('', str(text))
    without_tags = _HTML_TAG_PATTERN.sub('', without_urls)
    without_punct = without_tags.translate(_PUNCTUATION_TABLE)
    return " ".join(without_punct.split())

# ================================================
# ✅ 4️⃣ DATA SPLIT
# ================================================
# First split: 70 % training data, 30 % held out, stratified on the label.
train_df, temp_df = train_test_split(
    processed_df,
    test_size=0.3,
    stratify=processed_df['Label_Sentiment'],
    random_state=42
)
# Second split of the held-out part. train_test_split returns the
# (1 - test_size) portion first, so `test_df` receives 2/3 of the held-out
# rows and `val_df` the remaining 1/3.
test_df, val_df = train_test_split(
    temp_df,
    test_size=1/3,
    stratify=temp_df['Label_Sentiment'],
    random_state=42
)

# The split results are selections taken from processed_df; take explicit
# copies so the column assignments below modify independent frames and do not
# trigger pandas' SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()
val_df = val_df.copy()

# Clean text and add label column
for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    # clean_text maps missing values to "" itself. Casting to str first would
    # turn NaN into the literal text "nan" before clean_text could see it.
    df_['Captions'] = df_['Captions'].apply(clean_text)
    df_['label'] = df_['Label_Sentiment']
    df_.to_csv(f'/kaggle/working/{df_name}_multimodal.csv', index=False)
    print(f"{df_name.capitalize()} set: {len(df_)} samples")

# ================================================
# ✅ 5️⃣ LOAD MODELS
# ================================================
print("Loading CLIP model...")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)

print("Loading XLM-RoBERTa model...")
xlm_model_name = "xlm-roberta-base"
xlm_tokenizer = AutoTokenizer.from_pretrained(xlm_model_name)
xlm_model = AutoModel.from_pretrained(xlm_model_name).to(device)

# Both encoders are only ever called under torch.no_grad() in this script and
# are never handed to the optimizer, so put them in eval mode and switch off
# gradient tracking for their parameters explicitly.
for _encoder in (clip_model, xlm_model):
    _encoder.eval()
    _encoder.requires_grad_(False)

# Get feature dimensions.
# The image width is probed with one blank 224x224 image; the probe runs
# without autograd because only the output shape is needed.
dummy_image = Image.new('RGB', (224, 224))
dummy_img_input = clip_processor(images=dummy_image, return_tensors="pt").to(device)
with torch.no_grad():
    img_dim = clip_model.get_image_features(**dummy_img_input).shape[1]
# The text width is read straight from the XLM-RoBERTa configuration.
text_dim = xlm_model.config.hidden_size

print(f"Image feature dimension: {img_dim}")
print(f"Text feature dimension: {text_dim}")

# ================================================
# ✅ 6️⃣ MULTIMODAL DATASET
# ================================================
class MultimodalDataset(Dataset):
    """Dataset yielding one (image, tokenised caption, label) sample per row.

    Args:
        df: DataFrame providing the columns 'Image_path', 'Captions' and
            'label'.
        clip_processor: stored on the instance; this class does not call it
            (images are returned unprocessed).
        xlm_tokenizer: tokenizer applied to the caption text.
        max_length: fixed token length captions are padded/truncated to.
    """

    def __init__(self, df, clip_processor, xlm_tokenizer, max_length=128):
        self.df = df
        self.clip_processor = clip_processor
        self.xlm_tokenizer = xlm_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx`` as a dict.

        Keys: 'image' (RGB PIL image), 'input_ids' and 'attention_mask'
        (flattened tokenizer outputs), 'label' (scalar long tensor).
        """
        record = self.df.iloc[idx]

        # Image is returned as a PIL object; preprocessing happens elsewhere.
        rgb_image = Image.open(record['Image_path']).convert('RGB')

        # Tokenise the caption to a fixed length so samples can be stacked.
        encoded = self.xlm_tokenizer(
            str(record['Captions']),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        sample = {'image': rgb_image}
        # Flatten the tokenizer tensors to 1-D, as the collate step stacks them.
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].reshape(-1)
        sample['label'] = torch.tensor(int(record['label']), dtype=torch.long)
        return sample

def collate_fn(batch):
    """Merge a list of dataset samples into one batch dict.

    Images are kept as a plain Python list under 'images'; the per-sample
    'input_ids', 'attention_mask' and 'label' tensors are stacked along a new
    leading dimension (the stacked labels are returned under 'labels').
    """
    def _stack(key):
        return torch.stack([sample[key] for sample in batch])

    return {
        'images': [sample['image'] for sample in batch],
        'input_ids': _stack('input_ids'),
        'attention_mask': _stack('attention_mask'),
        'labels': _stack('label'),
    }

# ================================================
# ✅ 7️⃣ MULTIMODAL CLASSIFIER
# ================================================
class MultimodalSentimentClassifier(torch.nn.Module):
    """Late-fusion classification head over image and text feature vectors.

    Each modality is linearly projected to ``proj_dim``, the two projections
    are concatenated, passed through a stack of Linear -> ReLU -> Dropout
    layers, and mapped to ``num_classes`` logits.

    Args:
        img_dim: width of the incoming image feature vectors.
        text_dim: width of the incoming text feature vectors.
        num_classes: number of output logits.
        dropout_rate: dropout probability used after every fusion layer.
        proj_dim: width each modality is projected to before concatenation.
        fusion_dims: output widths of the successive fusion layers.

    With the default ``proj_dim`` and ``fusion_dims`` the layer sizes and
    state_dict keys are the same as the previously hard-coded 512 -> 256 ->
    128 head.
    """

    def __init__(self, img_dim, text_dim, num_classes=3, dropout_rate=0.3,
                 proj_dim=256, fusion_dims=(256, 128)):
        super().__init__()

        # Feature projections
        self.img_projection = torch.nn.Linear(img_dim, proj_dim)
        self.text_projection = torch.nn.Linear(text_dim, proj_dim)

        # Fusion layers: the first one consumes the concatenated projections,
        # so its input width is derived from proj_dim instead of hard-coded.
        fusion_layers = []
        in_features = 2 * proj_dim
        for out_features in fusion_dims:
            fusion_layers.extend([
                torch.nn.Linear(in_features, out_features),
                torch.nn.ReLU(),
                torch.nn.Dropout(dropout_rate),
            ])
            in_features = out_features
        self.fusion = torch.nn.Sequential(*fusion_layers)

        # Final classifier
        self.classifier = torch.nn.Linear(in_features, num_classes)

        # Not applied in forward(); kept so the module's attributes stay the
        # same as before. It holds no parameters.
        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, img_features, text_features):
        """Return logits for a batch of image and text feature vectors.

        The two inputs are concatenated along dim=1 after projection, so they
        must agree on the batch dimension.
        """
        # Project features to same dimension
        img_proj = self.img_projection(img_features)
        text_proj = self.text_projection(text_features)

        # Concatenate features
        fused_features = torch.cat([img_proj, text_proj], dim=1)

        # Pass through fusion layers
        fused_features = self.fusion(fused_features)

        # Final classification
        return self.classifier(fused_features)

# ================================================
# ✅ 8️⃣ INITIALIZE MODEL & TRAINING SETUP
# ================================================
# Trainable fusion head; the encoders are not part of this module.
model = MultimodalSentimentClassifier(img_dim, text_dim).to(device)

# Calculate class weights: each class is weighted by
# (number of training samples) / (samples of that class), ordered by class id.
class_counts = train_df['label'].value_counts().sort_index()
total_samples = len(train_df)
class_weights = [total_samples / n_in_class for n_in_class in class_counts.tolist()]
print(f"Class distribution: {class_counts.to_dict()}")
print(f"Class weights: {class_weights}")

# Weighted cross-entropy; only the fusion head's parameters are optimised.
weight_tensor = torch.tensor(class_weights, dtype=torch.float32, device=device)
criterion = torch.nn.CrossEntropyLoss(weight=weight_tensor)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

# ================================================
# ✅ 9️⃣ DATALOADERS
# ================================================
batch_size = 8

# One dataset per split, all sharing the same processor and tokenizer.
train_dataset, val_dataset, test_dataset = (
    MultimodalDataset(split_frame, clip_processor, xlm_tokenizer)
    for split_frame in (train_df, val_df, test_df)
)

# Every loader uses the custom collate function because samples carry PIL
# images; only the training loader shuffles.
_loader_options = dict(batch_size=batch_size, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, **_loader_options)
test_loader = DataLoader(test_dataset, **_loader_options)

# ================================================
# ✅ 🔟 TRAINING LOOP
# ================================================
num_epochs = 15
patience = 3  # As requested
patience_counter = 0
best_val_loss = float('inf')

print(f"Starting multimodal training for {num_epochs} epochs...")
print(f"Patience: {patience}")


def _encode_batch(batch):
    """Turn a collated batch into (image features, text features, labels).

    Both encoders run under torch.no_grad(), so no gradients flow into them.
    The text feature is the hidden state at position 0 of the sequence.
    """
    token_ids = batch['input_ids'].to(device)
    token_mask = batch['attention_mask'].to(device)
    targets = batch['labels'].to(device)

    with torch.no_grad():
        pixel_inputs = clip_processor(images=batch['images'], return_tensors="pt").to(device)
        image_feats = clip_model.get_image_features(**pixel_inputs)

        encoder_out = xlm_model(input_ids=token_ids, attention_mask=token_mask)
        text_feats = encoder_out.last_hidden_state[:, 0, :]  # CLS token

    return image_feats, text_feats, targets


for epoch in range(num_epochs):
    # ------------------------------------------------------------
    # Training phase: only the fusion head is in train mode.
    # ------------------------------------------------------------
    model.train()
    clip_model.eval()  # Keep CLIP frozen
    xlm_model.eval()   # Keep XLM-RoBERTa frozen

    total_train_loss = 0
    train_predictions, train_labels = [], []

    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        img_features, text_features, labels = _encode_batch(batch)

        optimizer.zero_grad()
        logits = model(img_features, text_features)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Collect predictions and targets for the epoch-level accuracy.
        train_predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    avg_train_loss = total_train_loss / len(train_loader)
    train_accuracy = accuracy_score(train_labels, train_predictions)

    # ------------------------------------------------------------
    # Validation phase: no parameter updates, no gradient tracking.
    # ------------------------------------------------------------
    model.eval()
    total_val_loss = 0
    val_predictions, val_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            img_features, text_features, labels = _encode_batch(batch)

            logits = model(img_features, text_features)
            loss = criterion(logits, labels)
            total_val_loss += loss.item()

            val_predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_predictions)

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"  Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f}")
    print(f"  Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    # ------------------------------------------------------------
    # Early stopping: checkpoint on a strictly lower validation loss,
    # otherwise count the epoch against the patience budget.
    # ------------------------------------------------------------
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "/kaggle/working/best_multimodal_model.pt")
        print("✅ Validation loss improved — model saved.")
        continue

    patience_counter += 1
    print(f"⏰ No improvement — patience {patience_counter}/{patience}")

    if patience_counter >= patience:
        print(f"🛑 Early stopping triggered at epoch {epoch+1}")
        break

# ================================================
# ✅ 1️⃣1️⃣ FINAL TEST EVALUATION
# ================================================
print("\n🔍 Loading best model for final evaluation...")
# map_location makes the checkpoint loadable on whatever device is active
# now, even if it was written from a different one (e.g. saved on GPU,
# evaluated on CPU).
best_state = torch.load("/kaggle/working/best_multimodal_model.pt", map_location=device)
model.load_state_dict(best_state)
model.eval()
# Make sure the feature extractors are in inference mode for this pass too.
clip_model.eval()
xlm_model.eval()

test_predictions = []
test_labels = []
total_test_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        images = batch['images']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Extract features with the two encoders
        img_inputs = clip_processor(images=images, return_tensors="pt").to(device)
        img_features = clip_model.get_image_features(**img_inputs)

        text_outputs = xlm_model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 is used as the text feature.
        text_features = text_outputs.last_hidden_state[:, 0, :]

        # Forward pass through the fusion head
        logits = model(img_features, text_features)
        loss = criterion(logits, labels)

        # Accumulate per-batch loss plus predictions/targets for the metrics.
        total_test_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# ================================================
# ✅ 1️⃣2️⃣ CALCULATE METRICS
# ================================================
# Aggregate scores over the whole test split (support-weighted averages).
test_accuracy = accuracy_score(test_labels, test_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_predictions, average='weighted'
)
cm = confusion_matrix(test_labels, test_predictions)

# Per-class metrics: one entry per class instead of an average.
per_class_scores = precision_recall_fscore_support(
    test_labels, test_predictions, average=None
)
precision_per_class, recall_per_class, f1_per_class, support_per_class = per_class_scores

print("\n📊 FINAL MULTIMODAL TEST RESULTS:")
print("=" * 60)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision (Weighted): {precision:.4f}")
print(f"Test Recall (Weighted): {recall:.4f}")
print(f"Test F1-Score (Weighted): {f1:.4f}")
print(f"Test Loss: {total_test_loss/len(test_loader):.4f}")

print("\nConfusion Matrix:")
print(cm)

print("\nPer-Class Metrics:")
print("-" * 30)
# The printed class number is the position in the per-class arrays.
for i, (cls_precision, cls_recall, cls_f1, cls_support) in enumerate(
    zip(precision_per_class, recall_per_class, f1_per_class, support_per_class)
):
    print(f"Class {i} (Sentiment):")
    print(f"  Precision: {cls_precision:.4f}")
    print(f"  Recall: {cls_recall:.4f}")
    print(f"  F1-Score: {cls_f1:.4f}")
    print(f"  Support: {cls_support}")

# ================================================
# ✅ 1️⃣3️⃣ SAVE RESULTS
# ================================================
# Mean per-batch test loss, computed once for the report.
avg_test_loss = total_test_loss / len(test_loader)

# Everything is converted to plain Python types so json can serialise it.
results = {
    'model': 'Multimodal (CLIP + XLM-RoBERTa)',
    'test_accuracy': float(test_accuracy),
    'test_precision_weighted': float(precision),
    'test_recall_weighted': float(recall),
    'test_f1_weighted': float(f1),
    'test_loss': float(avg_test_loss),
    'confusion_matrix': cm.tolist(),
    'per_class_precision': precision_per_class.tolist(),
    'per_class_recall': recall_per_class.tolist(),
    'per_class_f1': f1_per_class.tolist(),
    'per_class_support': support_per_class.tolist(),
    'training_params': {
        # Index of the last epoch that ran, counted from 1.
        'epochs': epoch + 1,
        'patience': patience,
        'batch_size': batch_size,
        # Read from the optimizer so the report cannot drift from the value
        # training actually used.
        'learning_rate': float(optimizer.param_groups[0]['lr']),
        'early_stopped': patience_counter >= patience
    }
}

with open('/kaggle/working/multimodal_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print("\n✅ Results saved to 'multimodal_results.json'")
print("🎯 Multimodal Bengali sentiment analysis completed!")
print(f"🔥 Final Test Accuracy: {test_accuracy:.4f}")

2025-07-07 12:08:24.474206: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751890104.657534      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751890104.707116      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda
Total samples with existing images: 4509
Train set: 3156 samples
Test set: 902 samples
Val set: 451 samples
Loading CLIP model...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Loading XLM-RoBERTa model...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Image feature dimension: 512
Text feature dimension: 768
Class distribution: {0: 1404, 1: 1237, 2: 515}
Class weights: [2.247863247863248, 2.551333872271625, 6.128155339805825]
Starting multimodal training for 15 epochs...
Patience: 3


Train Epoch 1: 100%|██████████| 395/395 [02:30<00:00,  2.62it/s]
Validation Epoch 1: 100%|██████████| 57/57 [00:22<00:00,  2.50it/s]


Epoch [1/15]
  Train Loss: 0.9948 | Train Acc: 0.5361
  Val Loss: 0.8751 | Val Acc: 0.6475
✅ Validation loss improved — model saved.


Train Epoch 2: 100%|██████████| 395/395 [02:09<00:00,  3.04it/s]
Validation Epoch 2: 100%|██████████| 57/57 [00:19<00:00,  2.91it/s]


Epoch [2/15]
  Train Loss: 0.8555 | Train Acc: 0.6258
  Val Loss: 0.8442 | Val Acc: 0.6541
✅ Validation loss improved — model saved.


Train Epoch 3: 100%|██████████| 395/395 [02:10<00:00,  3.04it/s]
Validation Epoch 3: 100%|██████████| 57/57 [00:19<00:00,  2.87it/s]


Epoch [3/15]
  Train Loss: 0.8135 | Train Acc: 0.6420
  Val Loss: 0.8423 | Val Acc: 0.6696
✅ Validation loss improved — model saved.


Train Epoch 4: 100%|██████████| 395/395 [02:11<00:00,  3.01it/s]
Validation Epoch 4: 100%|██████████| 57/57 [00:19<00:00,  2.86it/s]


Epoch [4/15]
  Train Loss: 0.7879 | Train Acc: 0.6502
  Val Loss: 0.8360 | Val Acc: 0.6585
✅ Validation loss improved — model saved.


Train Epoch 5: 100%|██████████| 395/395 [02:10<00:00,  3.02it/s]
Validation Epoch 5: 100%|██████████| 57/57 [00:20<00:00,  2.83it/s]


Epoch [5/15]
  Train Loss: 0.7610 | Train Acc: 0.6800
  Val Loss: 0.8407 | Val Acc: 0.6630
⏰ No improvement — patience 1/3


Train Epoch 6: 100%|██████████| 395/395 [02:10<00:00,  3.02it/s]
Validation Epoch 6: 100%|██████████| 57/57 [00:20<00:00,  2.84it/s]


Epoch [6/15]
  Train Loss: 0.7431 | Train Acc: 0.6841
  Val Loss: 0.8276 | Val Acc: 0.6674
✅ Validation loss improved — model saved.


Train Epoch 7: 100%|██████████| 395/395 [02:11<00:00,  3.00it/s]
Validation Epoch 7: 100%|██████████| 57/57 [00:19<00:00,  2.87it/s]


Epoch [7/15]
  Train Loss: 0.7260 | Train Acc: 0.6993
  Val Loss: 0.8483 | Val Acc: 0.6696
⏰ No improvement — patience 1/3


Train Epoch 8: 100%|██████████| 395/395 [02:10<00:00,  3.02it/s]
Validation Epoch 8: 100%|██████████| 57/57 [00:19<00:00,  2.87it/s]


Epoch [8/15]
  Train Loss: 0.7032 | Train Acc: 0.7025
  Val Loss: 0.8548 | Val Acc: 0.6608
⏰ No improvement — patience 2/3


Train Epoch 9: 100%|██████████| 395/395 [02:10<00:00,  3.02it/s]
Validation Epoch 9: 100%|██████████| 57/57 [00:19<00:00,  2.88it/s]


Epoch [9/15]
  Train Loss: 0.6848 | Train Acc: 0.7018
  Val Loss: 0.8999 | Val Acc: 0.6386
⏰ No improvement — patience 3/3
🛑 Early stopping triggered at epoch 9

🔍 Loading best model for final evaluation...


Final Test Evaluation: 100%|██████████| 113/113 [00:43<00:00,  2.59it/s]


📊 FINAL MULTIMODAL TEST RESULTS:
Test Accuracy: 0.6940
Test Precision (Weighted): 0.7220
Test Recall (Weighted): 0.6940
Test F1-Score (Weighted): 0.6925
Test Loss: 0.7591

Confusion Matrix:
[[258 125  19]
 [ 27 298  28]
 [ 14  63  70]]

Per-Class Metrics:
------------------------------
Class 0 (Sentiment):
  Precision: 0.8629
  Recall: 0.6418
  F1-Score: 0.7361
  Support: 402
Class 1 (Sentiment):
  Precision: 0.6132
  Recall: 0.8442
  F1-Score: 0.7104
  Support: 353
Class 2 (Sentiment):
  Precision: 0.5983
  Recall: 0.4762
  F1-Score: 0.5303
  Support: 147

✅ Results saved to 'multimodal_results.json'
🎯 Multimodal Bengali sentiment analysis completed!
🔥 Final Test Accuracy: 0.6940



