In [1]:
# ================================================
# ✅ MULTIMODAL SENTIMENT ANALYSIS: CLIP + MuRIL
# ================================================

import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string
import numpy as np

# ================================================
# ✅ 1️⃣ SETUP & PATHS
# ================================================
# Kaggle input locations: the annotation CSV and the folder holding the images
# that its 'image_path' column refers to.
input_csv = "/kaggle/input/basem/dataset.csv"
image_dir = "/kaggle/input/basem/images"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# ================================================
# ✅ 2️⃣ LOAD & PREPROCESS DATA
# ================================================
# Read the annotation table. Each row is expected to provide an image file
# name ('image_path'), its text ('extracted_text') and a label ('label 2').
df = pd.read_csv(input_csv)

existing_data = []
for _, row in df.iterrows():
    image_filename = row['image_path']
    caption = row['extracted_text']
    raw_label = row['label 2']

    # Skip rows with any missing field. A NaN file name would make
    # os.path.join raise, and a NaN label would survive the subtraction below
    # and later break (or silently pollute) the stratified split.
    if pd.isna(image_filename) or pd.isna(caption) or pd.isna(raw_label):
        continue

    # Skip empty / whitespace-only captions. str() protects against cells
    # that pandas parsed as a number instead of text.
    if not str(caption).strip():
        continue

    # Only keep samples whose image file is actually present on disk.
    full_image_path = os.path.join(image_dir, str(image_filename))
    if not os.path.exists(full_image_path):
        continue

    existing_data.append({
        'Image_path': full_image_path,
        'Captions': caption,
        # Shift the label down by one (presumably the CSV labels start at 1
        # and the loss expects class indices starting at 0 - TODO confirm).
        'Label_Sentiment': raw_label - 1,
    })

# Naming the columns explicitly keeps the frame usable (with the expected
# columns) even when no row passed the filters above.
processed_df = pd.DataFrame(
    existing_data, columns=['Image_path', 'Captions', 'Label_Sentiment']
)
print(f"Total samples with both image and text: {len(processed_df)}")

# ================================================
# ✅ 3️⃣ TEXT CLEANING FUNCTION
# ================================================
def clean_text(text):
    """Normalise a caption string before tokenisation.

    Missing values (anything `pd.isna` reports as NA) become an empty string.
    Otherwise the text is stripped, in this order, of URLs (``http(s)://...``
    and ``www. ...``), of ``<...>`` markup tags and of every ASCII punctuation
    character listed in `string.punctuation`; finally all whitespace runs are
    collapsed to single spaces and leading/trailing whitespace is removed.
    """
    if pd.isna(text):
        return ""

    without_urls = re.sub(r'https?://\S+|www\.\S+', '', text)
    without_tags = re.sub(r'<.*?>', '', without_urls)

    # Translation table that deletes ASCII punctuation in a single pass.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_tags.translate(punctuation_table)

    return " ".join(without_punctuation.split())

# ================================================
# ✅ 4️⃣ DATA SPLIT
# ================================================
# First cut: 70 % of the samples for training, 30 % held out, stratified on
# the sentiment label so every part keeps the same class proportions.
train_df, temp_df = train_test_split(
    processed_df,
    test_size=0.3,
    random_state=42,
    stratify=processed_df['Label_Sentiment'],
)
# Second cut: one third of the held-out part becomes the validation set and
# the remaining two thirds become the test set.
test_df, val_df = train_test_split(
    temp_df,
    test_size=1/3,
    random_state=42,
    stratify=temp_df['Label_Sentiment'],
)

# Clean the captions, expose the label under the 'label' column used by the
# dataset class, and write each split to the Kaggle working directory.
splits = {'train': train_df, 'test': test_df, 'val': val_df}
for df_name, df_ in splits.items():
    cleaned_captions = df_['Captions'].astype(str).apply(clean_text)
    df_['Captions'] = cleaned_captions
    df_['label'] = df_['Label_Sentiment']
    df_.to_csv(f'/kaggle/working/{df_name}_multimodal.csv', index=False)

print(f"Dataset sizes - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# ================================================
# ✅ 5️⃣ LOAD MODELS
# ================================================
# CLIP: the processor prepares images for the model, the model encodes them.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)
clip_model = CLIPModel.from_pretrained(clip_checkpoint)
clip_model = clip_model.to(device)

# MuRIL: tokenizer and encoder used for the text branch.
muril_checkpoint = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(muril_checkpoint)
muril_model = AutoModel.from_pretrained(muril_checkpoint)
muril_model = muril_model.to(device)

# ================================================
# ✅ 6️⃣ MULTIMODAL DATASET
# ================================================
class MultimodalDataset(Dataset):
    """Dataset yielding one (image, tokenized caption, label) sample per row.

    Args:
        df: DataFrame with the columns 'Image_path' (path of the image file),
            'Captions' (text passed to the tokenizer) and 'label' (value
            converted to a ``torch.long`` tensor).
        tokenizer: Callable tokenizer; it is invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors='pt'`` and must
            return a mapping with 'input_ids' and 'attention_mask'.
        max_length: Length every caption is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at positional index `idx`.

        Returns:
            dict with 'image' (RGB PIL image), 'input_ids' and
            'attention_mask' (flattened 1-D tensors) and 'label'
            (scalar ``torch.long`` tensor).
        """
        # Positional lookup, so the frame's index does not have to be 0..n-1.
        row = self.df.iloc[idx]

        # Image.open keeps the file handle open until the image object is
        # collected. convert() reads the pixels into a new image, so the
        # source file can be closed right away instead of leaking one handle
        # per sample.
        with Image.open(row['Image_path']) as source_image:
            image = source_image.convert('RGB')

        text = row['Captions']
        label = row['label']

        # Pad / truncate to a fixed length so samples can be stacked later.
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'image': image,
            # flatten() drops the batch dimension added by return_tensors='pt'.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def multimodal_collate_fn(batch):
    """Merge a list of per-sample dicts into one batch dict.

    Images are kept as a plain Python list (they are not tensors at this
    point), while 'input_ids', 'attention_mask' and 'label' tensors are
    stacked along a new leading dimension.

    Args:
        batch: list of dicts with the keys 'image', 'input_ids',
            'attention_mask' and 'label'.

    Returns:
        dict with the keys 'images', 'input_ids', 'attention_mask', 'labels'.
    """
    def stack_field(key):
        # One tensor per sample -> a single tensor with a leading batch axis.
        return torch.stack([sample[key] for sample in batch])

    return {
        'images': [sample['image'] for sample in batch],
        'input_ids': stack_field('input_ids'),
        'attention_mask': stack_field('attention_mask'),
        'labels': stack_field('label'),
    }

# ================================================
# ✅ 7️⃣ MULTIMODAL CLASSIFICATION MODEL
# ================================================
class MultimodalClassifier(torch.nn.Module):
    """Classifier that fuses CLIP image features with MuRIL text features.

    Both modalities are linearly projected to a common `fusion_dim`, treated
    as a two-token sequence, passed through multi-head self-attention,
    averaged and fed to a small MLP that produces the class logits.

    Args:
        clip_model: Image encoder exposing ``config.projection_dim`` and
            ``get_image_features(**inputs)``.
        muril_model: Text encoder exposing ``config.hidden_size`` and
            returning an object with ``last_hidden_state`` when called with
            ``input_ids`` and ``attention_mask``.
        num_classes: Number of output logits.
        dropout: Dropout probability used in the attention layer, on the
            fused features and inside the classifier head.
        image_processor: Callable turning a list of images into model inputs
            (called as ``image_processor(images=..., return_tensors="pt")``).
            When omitted, the module-level ``clip_processor`` is used, which
            matches the previous behaviour.
    """

    def __init__(self, clip_model, muril_model, num_classes=3, dropout=0.3,
                 image_processor=None):
        super().__init__()
        self.clip_model = clip_model
        self.muril_model = muril_model
        # Keep the processor on the instance so forward() does not depend on
        # a global; fall back to the global only when none was supplied.
        self.image_processor = (
            image_processor if image_processor is not None else clip_processor
        )
        self.dropout = torch.nn.Dropout(dropout)

        # Feature sizes of the two encoders, read from their configs.
        self.clip_dim = clip_model.config.projection_dim
        self.muril_dim = muril_model.config.hidden_size

        # Project both modalities into one shared space.
        self.fusion_dim = 256
        self.image_projection = torch.nn.Linear(self.clip_dim, self.fusion_dim)
        self.text_projection = torch.nn.Linear(self.muril_dim, self.fusion_dim)

        # Self-attention over the [image, text] pair.
        self.attention = torch.nn.MultiheadAttention(
            embed_dim=self.fusion_dim,
            num_heads=8,
            dropout=dropout,
            batch_first=True
        )

        # Classification head on top of the fused vector.
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(self.fusion_dim, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        """Compute class logits for a batch.

        Args:
            images: List of images accepted by the image processor.
            input_ids: Token id tensor for the text encoder.
            attention_mask: Attention mask tensor matching `input_ids`.

        Returns:
            Tensor of logits with one row per sample and `num_classes` columns.
        """
        # Place the image tensors on the same device as the text tensors
        # rather than on a global `device`, so the module also works after
        # being moved elsewhere.
        target_device = input_ids.device
        image_inputs = self.image_processor(
            images=images, return_tensors="pt"
        ).to(target_device)
        image_features = self.clip_model.get_image_features(**image_inputs)

        text_outputs = self.muril_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Use the hidden state of the first token (CLS) as the text summary.
        text_features = text_outputs.last_hidden_state[:, 0, :]

        image_proj = self.image_projection(image_features)
        text_proj = self.text_projection(text_features)

        # Stack into a length-2 sequence: [batch_size, 2, fusion_dim].
        combined_features = torch.stack([image_proj, text_proj], dim=1)

        # Let the two modality tokens attend to each other.
        attended_features, _ = self.attention(
            combined_features, combined_features, combined_features
        )

        # Average the two attended tokens into one fused vector per sample.
        fused_features = attended_features.mean(dim=1)

        fused_features = self.dropout(fused_features)
        logits = self.classifier(fused_features)

        return logits

# ================================================
# ✅ 8️⃣ DATALOADERS
# ================================================
batch_size = 8

# One dataset per split, all sharing the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    MultimodalDataset(split_df, tokenizer)
    for split_df in (train_df, val_df, test_df)
)

# Settings common to every loader; only the training loader shuffles.
loader_options = {
    'batch_size': batch_size,
    'collate_fn': multimodal_collate_fn,
}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

# ================================================
# ✅ 9️⃣ INITIALIZE MODEL & OPTIMIZER
# ================================================
model = MultimodalClassifier(clip_model, muril_model, num_classes=3).to(device)

# Per-class sample counts of the training split, ordered by class index.
# NOTE: despite its name, `class_weights` holds these raw counts; the actual
# loss weights are in `weights` (inverse class frequency: total / count).
class_weights = train_df['label'].value_counts().sort_index().tolist()
total = sum(class_weights)
weights = [total / c for c in class_weights]

# Build the weight tensor directly on the target device with an explicit
# dtype instead of going through the legacy torch.FloatTensor constructor.
criterion = torch.nn.CrossEntropyLoss(
    weight=torch.tensor(weights, dtype=torch.float32, device=device)
)

# Smaller learning rates for the pretrained encoders, a larger one for the
# freshly initialised projection, attention and classifier layers.
optimizer = AdamW([
    {'params': model.clip_model.parameters(), 'lr': 1e-5},
    {'params': model.muril_model.parameters(), 'lr': 2e-5},
    {'params': model.image_projection.parameters(), 'lr': 1e-4},
    {'params': model.text_projection.parameters(), 'lr': 1e-4},
    {'params': model.attention.parameters(), 'lr': 1e-4},
    {'params': model.classifier.parameters(), 'lr': 1e-4}
])

print(f"Class distribution: {class_weights}")
print(f"Class weights: {weights}")

# ================================================
# ✅ 🔟 TRAINING LOOP
# ================================================
num_epochs = 15
patience = 5
patience_counter = 0
best_val_loss = float('inf')


def _run_epoch(loader, desc, train):
    """Run one full pass over `loader`.

    Args:
        loader: DataLoader yielding dicts with 'images', 'input_ids',
            'attention_mask' and 'labels'.
        desc: Text shown on the tqdm progress bar.
        train: When True the model is put in training mode and the optimizer
            is stepped after every batch; when False the model is put in
            evaluation mode and gradients are disabled.

    Returns:
        Tuple ``(mean batch loss, true labels, predicted labels)``.
    """
    model.train(train)  # train(False) is equivalent to eval()
    running_loss = 0.0
    epoch_labels = []
    epoch_predictions = []

    with torch.set_grad_enabled(train):
        for batch in tqdm(loader, desc=desc):
            images = batch['images']
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            if train:
                optimizer.zero_grad()

            logits = model(images, input_ids, attention_mask)
            loss = criterion(logits, labels)

            if train:
                loss.backward()
                optimizer.step()

            running_loss += loss.item()

            # Keep predictions and labels for the epoch-level metrics.
            predictions = torch.argmax(logits, dim=1)
            epoch_predictions.extend(predictions.cpu().numpy())
            epoch_labels.extend(labels.cpu().numpy())

    return running_loss / len(loader), epoch_labels, epoch_predictions


for epoch in range(num_epochs):
    # ============================================================
    # TRAINING PHASE
    # ============================================================
    avg_train_loss, train_labels, train_predictions = _run_epoch(
        train_loader, f"Train Epoch {epoch+1}", train=True
    )
    train_accuracy = accuracy_score(train_labels, train_predictions)

    # ============================================================
    # VALIDATION PHASE
    # ============================================================
    avg_val_loss, val_labels, val_predictions = _run_epoch(
        val_loader, f"Validation Epoch {epoch+1}", train=False
    )
    val_accuracy = accuracy_score(val_labels, val_predictions)
    # zero_division=0 yields the same numbers as the default but without an
    # UndefinedMetricWarning when some class is never predicted.
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(
        val_labels, val_predictions, average='weighted', zero_division=0
    )

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")
    print(f"Val Precision: {val_precision:.4f} | Val Recall: {val_recall:.4f} | Val F1: {val_f1:.4f}")

    # ============================================================
    # EARLY STOPPING CHECK
    # ============================================================
    # Checkpoint whenever the validation loss improves; stop once it has
    # failed to improve for `patience` consecutive epochs.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_multimodal_model.pt")
        print("✅ Validation loss improved — model saved.")
    else:
        patience_counter += 1
        print(f"⏰ No improvement — patience {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"🛑 Early stopping triggered at epoch {epoch+1}")
            break

    print("-" * 70)

# ================================================
# ✅ 1️⃣1️⃣ FINAL TEST EVALUATION
# ================================================
print("\n🔍 Loading best model for final evaluation...")
# map_location places the checkpoint tensors on the active device regardless
# of where they were saved from; weights_only=True restricts unpickling to
# tensors and plain containers, which is all a state_dict contains.
best_state = torch.load(
    "best_multimodal_model.pt", map_location=device, weights_only=True
)
model.load_state_dict(best_state)
model.eval()

test_predictions = []
test_labels = []
total_test_loss = 0

# Inference only: no gradients are needed.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        images = batch['images']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        logits = model(images, input_ids, attention_mask)
        loss = criterion(logits, labels)

        total_test_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Calculate final metrics
test_accuracy = accuracy_score(test_labels, test_predictions)
# zero_division=0 keeps the values of the default setting but avoids an
# UndefinedMetricWarning when a class is never predicted.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_predictions, average='weighted', zero_division=0
)
cm = confusion_matrix(test_labels, test_predictions)

print("\n📊 FINAL TEST RESULTS (MULTIMODAL: CLIP + MuRIL):")
print("=" * 70)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision (Weighted): {precision:.4f}")
print(f"Test Recall (Weighted): {recall:.4f}")
print(f"Test F1-Score (Weighted): {f1:.4f}")
print(f"Test Loss: {total_test_loss/len(test_loader):.4f}")
print(f"\nConfusion Matrix:\n{cm}")

# ================================================
# ✅ 1️⃣2️⃣ DETAILED METRICS BY CLASS
# ================================================
class_names = ['Negative', 'Neutral', 'Positive']
# Request metrics for every class index explicitly. Without `labels`, the
# returned arrays only cover classes that occur in the data, so indexing them
# with 0..len(class_names)-1 below could raise IndexError if a class were
# absent from both the test labels and the predictions.
class_ids = list(range(len(class_names)))
precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
    test_labels, test_predictions, average=None, labels=class_ids, zero_division=0
)

print("\n📋 PER-CLASS METRICS:")
print("=" * 50)
for i, class_name in enumerate(class_names):
    print(f"{class_name} (Class {i}):")
    print(f"  Precision: {precision_per_class[i]:.4f}")
    print(f"  Recall: {recall_per_class[i]:.4f}")
    print(f"  F1-Score: {f1_per_class[i]:.4f}")
    print(f"  Support: {support[i]}")
    print()

print(f"Total test samples: {len(test_labels)}")
print(f"Final Dataset sizes - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# ================================================
# ✅ 1️⃣3️⃣ FEATURE ANALYSIS (OPTIONAL)
# ================================================
# Collect (element count, requires_grad) once for every parameter tensor so
# both totals below come from a single walk over the model.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)

print("\n🔍 MODEL ARCHITECTURE SUMMARY:")
print("=" * 50)
print(f"CLIP Image Features: {model.clip_dim}")
print(f"MuRIL Text Features: {model.muril_dim}")
print(f"Fusion Dimension: {model.fusion_dim}")
print(f"Total Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}")

2025-07-07 10:27:16.463313: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751884036.691252      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751884036.756817      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda
Total samples with both image and text: 4509
Dataset sizes - Train: 3156, Val: 451, Test: 902


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Class distribution: [1404, 1237, 515]
Class weights: [2.247863247863248, 2.551333872271625, 6.128155339805825]



Train Epoch 1:   0%|          | 0/395 [00:00<?, ?it/s][A
Train Epoch 1:   0%|          | 1/395 [00:01<11:03,  1.69s/it][A
Train Epoch 1:   1%|          | 2/395 [00:02<07:12,  1.10s/it][A
Train Epoch 1:   1%|          | 3/395 [00:03<07:15,  1.11s/it][A
Train Epoch 1:   1%|          | 4/395 [00:04<06:23,  1.02it/s][A
Train Epoch 1:   1%|▏         | 5/395 [00:05<05:45,  1.13it/s][A
Train Epoch 1:   2%|▏         | 6/395 [00:05<05:42,  1.14it/s][A
Train Epoch 1:   2%|▏         | 7/395 [00:06<05:45,  1.12it/s][A
Train Epoch 1:   2%|▏         | 8/395 [00:07<05:36,  1.15it/s][A
Train Epoch 1:   2%|▏         | 9/395 [00:08<05:35,  1.15it/s][A
Train Epoch 1:   3%|▎         | 10/395 [00:09<06:45,  1.05s/it][A
Train Epoch 1:   3%|▎         | 11/395 [00:10<06:20,  1.01it/s][A
Train Epoch 1:   3%|▎         | 12/395 [00:11<06:00,  1.06it/s][A
Train Epoch 1:   3%|▎         | 13/395 [00:12<06:03,  1.05it/s][A
Train Epoch 1:   4%|▎         | 14/395 [00:13<06:19,  1.00it/s][A
Train Epoch 

Epoch [1/15]
Train Loss: 0.9855 | Train Acc: 0.5314
Val Loss: 0.8901 | Val Acc: 0.6275
Val Precision: 0.6559 | Val Recall: 0.6275 | Val F1: 0.6285
✅ Validation loss improved — model saved.
----------------------------------------------------------------------


Train Epoch 2: 100%|██████████| 395/395 [03:14<00:00,  2.03it/s]
Validation Epoch 2: 100%|██████████| 57/57 [00:20<00:00,  2.82it/s]


Epoch [2/15]
Train Loss: 0.8464 | Train Acc: 0.6169
Val Loss: 0.8731 | Val Acc: 0.6408
Val Precision: 0.6573 | Val Recall: 0.6408 | Val F1: 0.6411
✅ Validation loss improved — model saved.
----------------------------------------------------------------------


Train Epoch 3: 100%|██████████| 395/395 [03:12<00:00,  2.05it/s]
Validation Epoch 3: 100%|██████████| 57/57 [00:20<00:00,  2.81it/s]


Epoch [3/15]
Train Loss: 0.7085 | Train Acc: 0.7050
Val Loss: 0.9231 | Val Acc: 0.5654
Val Precision: 0.6362 | Val Recall: 0.5654 | Val F1: 0.5792
⏰ No improvement — patience 1/5
----------------------------------------------------------------------


Train Epoch 4: 100%|██████████| 395/395 [03:12<00:00,  2.06it/s]
Validation Epoch 4: 100%|██████████| 57/57 [00:20<00:00,  2.83it/s]


Epoch [4/15]
Train Loss: 0.5131 | Train Acc: 0.8083
Val Loss: 1.0303 | Val Acc: 0.6984
Val Precision: 0.7084 | Val Recall: 0.6984 | Val F1: 0.6960
⏰ No improvement — patience 2/5
----------------------------------------------------------------------


Train Epoch 5: 100%|██████████| 395/395 [03:13<00:00,  2.04it/s]
Validation Epoch 5: 100%|██████████| 57/57 [00:20<00:00,  2.73it/s]


Epoch [5/15]
Train Loss: 0.3272 | Train Acc: 0.8856
Val Loss: 1.0042 | Val Acc: 0.6785
Val Precision: 0.7234 | Val Recall: 0.6785 | Val F1: 0.6916
⏰ No improvement — patience 3/5
----------------------------------------------------------------------


Train Epoch 6: 100%|██████████| 395/395 [03:13<00:00,  2.04it/s]
Validation Epoch 6: 100%|██████████| 57/57 [00:20<00:00,  2.78it/s]


Epoch [6/15]
Train Loss: 0.2276 | Train Acc: 0.9240
Val Loss: 1.2746 | Val Acc: 0.6896
Val Precision: 0.7437 | Val Recall: 0.6896 | Val F1: 0.7035
⏰ No improvement — patience 4/5
----------------------------------------------------------------------


Train Epoch 7: 100%|██████████| 395/395 [03:14<00:00,  2.04it/s]
Validation Epoch 7: 100%|██████████| 57/57 [00:20<00:00,  2.82it/s]


Epoch [7/15]
Train Loss: 0.1196 | Train Acc: 0.9613
Val Loss: 1.2575 | Val Acc: 0.6984
Val Precision: 0.7263 | Val Recall: 0.6984 | Val F1: 0.7025
⏰ No improvement — patience 5/5
🛑 Early stopping triggered at epoch 7

🔍 Loading best model for final evaluation...


Final Test Evaluation: 100%|██████████| 113/113 [00:42<00:00,  2.66it/s]


📊 FINAL TEST RESULTS (MULTIMODAL: CLIP + MuRIL):
Test Accuracy: 0.6530
Test Precision (Weighted): 0.6608
Test Recall (Weighted): 0.6530
Test F1-Score (Weighted): 0.6492
Test Loss: 0.8086

Confusion Matrix:
[[267 107  28]
 [ 57 269  27]
 [ 18  76  53]]

📋 PER-CLASS METRICS:
Negative (Class 0):
  Precision: 0.7807
  Recall: 0.6642
  F1-Score: 0.7177
  Support: 402

Neutral (Class 1):
  Precision: 0.5951
  Recall: 0.7620
  F1-Score: 0.6683
  Support: 353

Positive (Class 2):
  Precision: 0.4907
  Recall: 0.3605
  F1-Score: 0.4157
  Support: 147

Total test samples: 902
Final Dataset sizes - Train: 3156, Val: 451, Test: 902

🔍 MODEL ARCHITECTURE SUMMARY:
CLIP Image Features: 512
MuRIL Text Features: 768
Fusion Dimension: 256
Total Parameters: 389,458,180
Trainable Parameters: 389,458,180



