In [1]:
# ================================================
# ✅ MULTIMODAL FUSION MODEL FOR BEST F1 SCORE
# ================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor, SwinForImageClassification
from torch.optim import AdamW
import torchvision.transforms as T
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import torch.nn as nn
import torch.nn.functional as F
import re
import string
import json
from transformers import AutoTokenizer, AutoModel, AutoModelForPreTraining, AutoImageProcessor, SwinForImageClassification
from transformers import AutoModel

# ================================================
# ✅ PATHS & SETUP
# ================================================
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ================================================
# ✅ LOAD & PREPROCESS CSV
# ================================================
df = pd.read_csv(input_csv)

existing_data = []
for _, row in df.iterrows():
    image_filename = row['image_path']
    full_image_path = os.path.join(image_dir, image_filename)
    if os.path.exists(full_image_path):
        label_converted = row['label 2'] - 1
        existing_data.append({
            'Image_path': full_image_path,
            'Captions': row['extracted_text'],
            'Label_Sentiment': label_converted
        })

processed_df = pd.DataFrame(existing_data)

# ================================================
# ✅ TEXT CLEANING
# ================================================
def clean_text(text):
    if pd.isna(text): return ""
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join(text.split())
    return text

# ================================================
# ✅ DATA SPLITS
# ================================================
train_df, temp_df = train_test_split(processed_df, test_size=0.3, stratify=processed_df['Label_Sentiment'], random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['Label_Sentiment'], random_state=42)

for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    df_['Captions'] = df_['Captions'].astype(str).apply(clean_text)
    df_['label'] = df_['Label_Sentiment']

print(f"Train samples: {len(train_df)}, Val samples: {len(val_df)}, Test samples: {len(test_df)}")
print(f"Class distribution: {train_df['label'].value_counts().sort_index().tolist()}")

# ================================================
# ✅ LOAD MODELS
# ================================================
# Load BanglishBERT for text
bert_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglishbert")
bert_model = AutoModel.from_pretrained("csebuetnlp/banglishbert")

# Load Swin Transformer for images
swin_model_name = "microsoft/swin-base-patch4-window7-224"
image_processor = AutoImageProcessor.from_pretrained(swin_model_name)
swin_backbone = SwinForImageClassification.from_pretrained(
    swin_model_name,
    num_labels=3,
    ignore_mismatched_sizes=True
)

# ================================================
# ✅ MULTIMODAL FUSION MODEL
# ================================================
class MultimodalFusionModel(nn.Module):
    def __init__(self, bert_model, swin_model, num_classes=3, dropout_rate=0.3, fusion_dim=512):
        super().__init__()
        
        # Text encoder
        self.bert = bert_model
        self.text_dropout = nn.Dropout(dropout_rate)
        self.text_projector = nn.Linear(bert_model.config.hidden_size, fusion_dim)
        
        # Image encoder - use Swin backbone without classifier
        self.swin_backbone = swin_model.swin
        self.image_dropout = nn.Dropout(dropout_rate)
        self.image_projector = nn.Linear(swin_model.config.hidden_size, fusion_dim)
        
        # Fusion layers
        self.fusion_dropout = nn.Dropout(dropout_rate)
        self.fusion_layer1 = nn.Linear(fusion_dim * 2, fusion_dim)
        self.fusion_layer2 = nn.Linear(fusion_dim, fusion_dim // 2)
        self.batch_norm = nn.BatchNorm1d(fusion_dim // 2)
        
        # Classification head
        self.classifier = nn.Linear(fusion_dim // 2, num_classes)
        
        # Attention mechanism for fusion
        self.attention_weights = nn.Linear(fusion_dim * 2, 2)
        
    def forward(self, input_ids, attention_mask, pixel_values):
        # Text encoding
        text_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state[:, 0, :]  # [CLS] token
        text_features = self.text_dropout(text_features)
        text_projected = self.text_projector(text_features)
        
        # Image encoding
        image_outputs = self.swin_backbone(pixel_values)
        image_features = image_outputs.last_hidden_state.mean(dim=1)  # Global average pooling
        image_features = self.image_dropout(image_features)
        image_projected = self.image_projector(image_features)
        
        # Concatenate features
        combined_features = torch.cat([text_projected, image_projected], dim=1)
        
        # Attention-based fusion
        attention_scores = F.softmax(self.attention_weights(combined_features), dim=1)
        text_att = attention_scores[:, 0:1]
        image_att = attention_scores[:, 1:2]
        
        # Weighted fusion
        fused_features = text_att * text_projected + image_att * image_projected
        
        # Additional fusion processing
        fusion_out = F.relu(self.fusion_layer1(combined_features))
        fusion_out = self.fusion_dropout(fusion_out)
        fusion_out = F.relu(self.fusion_layer2(fusion_out))
        fusion_out = self.batch_norm(fusion_out)
        
        # Classification
        logits = self.classifier(fusion_out)
        
        return logits

# ================================================
# ✅ MULTIMODAL DATASET
# ================================================
class MultimodalDataset(Dataset):
    def __init__(self, df, tokenizer, processor, max_length=128, is_train=False):
        self.df = df
        self.tokenizer = tokenizer
        self.processor = processor
        self.max_length = max_length
        self.is_train = is_train
        # Define augmentations for training
        self.train_transforms = T.Compose([
            T.RandomRotation(15),
            T.RandomHorizontalFlip(),
            T.ColorJitter(brightness=0.3, contrast=0.3),
            T.RandomAdjustSharpness(sharpness_factor=2),
            # You can add more or adjust parameters as needed
        ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # Text processing
        caption = row['Captions']
        text_inputs = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Image processing
        image = Image.open(row['Image_path']).convert('RGB')
        if self.is_train:
            image = self.train_transforms(image)
        image_inputs = self.processor(image, return_tensors="pt")
        return {
            'input_ids': text_inputs['input_ids'].flatten(),
            'attention_mask': text_inputs['attention_mask'].flatten(),
            'pixel_values': image_inputs['pixel_values'].squeeze(0),
            'label': torch.tensor(row['label'], dtype=torch.long)
        }

# ================================================
# ✅ DATALOADERS
# ================================================
batch_size = 8

train_dataset = MultimodalDataset(train_df, bert_tokenizer, image_processor, is_train=True)
val_dataset = MultimodalDataset(val_df, bert_tokenizer, image_processor, is_train=False)
test_dataset = MultimodalDataset(test_df, bert_tokenizer, image_processor, is_train=False)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# ================================================
# ✅ INITIALIZE MODEL
# ================================================
model = MultimodalFusionModel(bert_model, swin_backbone, num_classes=3, dropout_rate=0.3).to(device)

# ================================================
# ✅ LOSS & OPTIMIZER WITH ADVANCED TECHNIQUES
# ================================================
# Focal Loss for handling class imbalance
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss
        
        if self.alpha is not None:
            alpha_t = self.alpha[targets]
            focal_loss = alpha_t * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

# Calculate class weights
class_counts = train_df['label'].value_counts().sort_index().tolist()
total_samples = sum(class_counts)
class_weights = [total_samples / count for count in class_counts]
alpha = torch.FloatTensor(class_weights).to(device)

# Use Focal Loss for better handling of class imbalance
criterion = FocalLoss(alpha=alpha, gamma=2.0)

# Optimizer with different learning rates for different parts
text_params = list(model.bert.parameters())
image_params = list(model.swin_backbone.parameters())
fusion_params = list(model.text_projector.parameters()) + list(model.image_projector.parameters()) + \
               list(model.fusion_layer1.parameters()) + list(model.fusion_layer2.parameters()) + \
               list(model.classifier.parameters()) + list(model.attention_weights.parameters())

optimizer = AdamW([
    {'params': text_params, 'lr': 2e-5},
    {'params': image_params, 'lr': 1e-5},
    {'params': fusion_params, 'lr': 5e-4}
], weight_decay=0.01)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20, eta_min=1e-6)

# ================================================
# ✅ TRAINING LOOP WITH ADVANCED TECHNIQUES
# ================================================
num_epochs = 25
patience = 3
patience_counter = 0
best_val_f1 = 0.0

print("🚀 Starting Multimodal Fusion Training...")

for epoch in range(num_epochs):
    # ============================================================
    # TRAINING PHASE
    # ============================================================
    model.train()
    total_train_loss = 0
    train_predictions = []
    train_labels = []

    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        
        logits = model(input_ids, attention_mask, pixel_values)
        loss = criterion(logits, labels)
        
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        total_train_loss += loss.item()
        
        predictions = torch.argmax(logits, dim=1)
        train_predictions.extend(predictions.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    avg_train_loss = total_train_loss / len(train_loader)
    train_accuracy = accuracy_score(train_labels, train_predictions)
    train_f1 = precision_recall_fscore_support(train_labels, train_predictions, average='weighted')[2]

    # ============================================================
    # VALIDATION PHASE
    # ============================================================
    model.eval()
    total_val_loss = 0
    val_predictions = []
    val_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['label'].to(device)
            
            logits = model(input_ids, attention_mask, pixel_values)
            loss = criterion(logits, labels)
            
            total_val_loss += loss.item()
            
            predictions = torch.argmax(logits, dim=1)
            val_predictions.extend(predictions.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_predictions)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='weighted')
    
    # Step scheduler
    scheduler.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"  Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f} | Train F1: {train_f1:.4f}")
    print(f"  Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f} | Val F1: {val_f1:.4f}")
    print(f"  LR: {optimizer.param_groups[0]['lr']:.6f}")

    # ============================================================
    # EARLY STOPPING BASED ON F1 SCORE
    # ============================================================
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), "best_multimodal_model.pt")
        print(f"✅ Validation F1 improved to {val_f1:.4f} — model saved.")
    else:
        patience_counter += 1
        print(f"⏰ No improvement — patience {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"🛑 Early stopping triggered at epoch {epoch+1}")
            break
    print("-" * 70)

# ================================================
# ✅ FINAL TEST EVALUATION
# ================================================
print("\n🔍 Loading best model for final evaluation...")
model.load_state_dict(torch.load("best_multimodal_model.pt"))
model.eval()

test_predictions = []
test_labels = []
total_test_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['label'].to(device)
        
        logits = model(input_ids, attention_mask, pixel_values)
        loss = criterion(logits, labels)
        
        total_test_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Calculate comprehensive metrics
test_accuracy = accuracy_score(test_labels, test_predictions)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='weighted')
test_precision_macro, test_recall_macro, test_f1_macro, _ = precision_recall_fscore_support(test_labels, test_predictions, average='macro')
cm = confusion_matrix(test_labels, test_predictions)

# Per-class metrics
precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
    test_labels, test_predictions, average=None
)

print("\n" + "="*70)
print("🎯 FINAL MULTIMODAL FUSION TEST RESULTS")
print("="*70)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1-Score (Weighted): {test_f1:.4f}")
print(f"Test F1-Score (Macro): {test_f1_macro:.4f}")
print(f"Test Precision (Weighted): {test_precision:.4f}")
print(f"Test Recall (Weighted): {test_recall:.4f}")
print(f"Test Loss: {total_test_loss/len(test_loader):.4f}")

print("\n📈 Per-Class Metrics:")
class_names = ['Negative', 'Neutral', 'Positive']
for i, class_name in enumerate(class_names):
    print(f"{class_name:>8}: Precision={precision_per_class[i]:.4f}, Recall={recall_per_class[i]:.4f}, F1={f1_per_class[i]:.4f}, Support={support[i]}")

print(f"\n🎯 Confusion Matrix:")
print(f"{'':>10} {'Neg':>6} {'Neu':>6} {'Pos':>6}")
for i, class_name in enumerate(['Negative', 'Neutral', 'Positive']):
    print(f"{class_name:>10} {cm[i][0]:>6} {cm[i][1]:>6} {cm[i][2]:>6}")

print("\n📋 Detailed Classification Report:")
print(classification_report(test_labels, test_predictions, target_names=class_names))

# ================================================
# ✅ SAVE RESULTS
# ================================================
results = {
    'test_accuracy': test_accuracy,
    'test_f1_weighted': test_f1,
    'test_f1_macro': test_f1_macro,
    'test_precision_weighted': test_precision,
    'test_recall_weighted': test_recall,
    'test_loss': total_test_loss/len(test_loader),
    'confusion_matrix': cm.tolist(),
    'per_class_metrics': {
        'precision': precision_per_class.tolist(),
        'recall': recall_per_class.tolist(),
        'f1': f1_per_class.tolist(),
        'support': support.tolist()
    }
}

with open('/kaggle/working/multimodal_fusion_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("\n" + "="*70)
print("✅ MULTIMODAL FUSION MODEL TRAINING COMPLETE!")
print(f"🏆 Best F1 Score Achieved: {test_f1:.4f}")
print("📁 Results saved to 'multimodal_fusion_results.json'")
print("="*70)

2025-07-09 08:57:06.670377: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752051426.851326      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752051426.909707      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda
Train samples: 3156, Val samples: 451, Test samples: 902
Class distribution: [1404, 1237, 515]


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/352M [00:00<?, ?B/s]

Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-base-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 1024]) in the checkpoint and torch.Size([3, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Starting Multimodal Fusion Training...


Train Epoch 1: 100%|██████████| 395/395 [07:55<00:00,  1.20s/it]
Validation Epoch 1: 100%|██████████| 57/57 [00:25<00:00,  2.22it/s]


Epoch [1/25]
  Train Loss: 1.4456 | Train Acc: 0.5067 | Train F1: 0.5196
  Val Loss: 1.0602 | Val Acc: 0.6984 | Val F1: 0.6999
  LR: 0.000020
✅ Validation F1 improved to 0.6999 — model saved.
----------------------------------------------------------------------


Train Epoch 2: 100%|██████████| 395/395 [07:34<00:00,  1.15s/it]
Validation Epoch 2: 100%|██████████| 57/57 [00:23<00:00,  2.44it/s]


Epoch [2/25]
  Train Loss: 1.0981 | Train Acc: 0.6708 | Train F1: 0.6766
  Val Loss: 0.9414 | Val Acc: 0.7095 | Val F1: 0.7111
  LR: 0.000020
✅ Validation F1 improved to 0.7111 — model saved.
----------------------------------------------------------------------


Train Epoch 3: 100%|██████████| 395/395 [07:33<00:00,  1.15s/it]
Validation Epoch 3: 100%|██████████| 57/57 [00:23<00:00,  2.47it/s]


Epoch [3/25]
  Train Loss: 0.8106 | Train Acc: 0.7785 | Train F1: 0.7815
  Val Loss: 1.1452 | Val Acc: 0.6785 | Val F1: 0.6876
  LR: 0.000019
⏰ No improvement — patience 1/3
----------------------------------------------------------------------


Train Epoch 4: 100%|██████████| 395/395 [07:29<00:00,  1.14s/it]
Validation Epoch 4: 100%|██████████| 57/57 [00:23<00:00,  2.45it/s]


Epoch [4/25]
  Train Loss: 0.6434 | Train Acc: 0.8498 | Train F1: 0.8501
  Val Loss: 1.3976 | Val Acc: 0.7539 | Val F1: 0.7527
  LR: 0.000018
✅ Validation F1 improved to 0.7527 — model saved.
----------------------------------------------------------------------


Train Epoch 5: 100%|██████████| 395/395 [07:26<00:00,  1.13s/it]
Validation Epoch 5: 100%|██████████| 57/57 [00:22<00:00,  2.50it/s]


Epoch [5/25]
  Train Loss: 0.5698 | Train Acc: 0.9011 | Train F1: 0.9010
  Val Loss: 2.1791 | Val Acc: 0.7428 | Val F1: 0.7459
  LR: 0.000017
⏰ No improvement — patience 1/3
----------------------------------------------------------------------


Train Epoch 6: 100%|██████████| 395/395 [07:28<00:00,  1.14s/it]
Validation Epoch 6: 100%|██████████| 57/57 [00:22<00:00,  2.52it/s]


Epoch [6/25]
  Train Loss: 0.4784 | Train Acc: 0.9363 | Train F1: 0.9362
  Val Loss: 2.3220 | Val Acc: 0.7029 | Val F1: 0.7079
  LR: 0.000016
⏰ No improvement — patience 2/3
----------------------------------------------------------------------


Train Epoch 7: 100%|██████████| 395/395 [07:28<00:00,  1.13s/it]
Validation Epoch 7: 100%|██████████| 57/57 [00:22<00:00,  2.53it/s]


Epoch [7/25]
  Train Loss: 0.2582 | Train Acc: 0.9632 | Train F1: 0.9632
  Val Loss: 3.0892 | Val Acc: 0.7494 | Val F1: 0.7461
  LR: 0.000015
⏰ No improvement — patience 3/3
🛑 Early stopping triggered at epoch 7

🔍 Loading best model for final evaluation...


Final Test Evaluation: 100%|██████████| 113/113 [00:48<00:00,  2.34it/s]


🎯 FINAL MULTIMODAL FUSION TEST RESULTS
Test Accuracy: 0.7627
Test F1-Score (Weighted): 0.7624
Test F1-Score (Macro): 0.7265
Test Precision (Weighted): 0.7821
Test Recall (Weighted): 0.7627
Test Loss: 1.2047

📈 Per-Class Metrics:
Negative: Precision=0.9100, Recall=0.7040, F1=0.7938, Support=402
 Neutral: Precision=0.7185, Recall=0.9037, F1=0.8005, Support=353
Positive: Precision=0.5850, Recall=0.5850, F1=0.5850, Support=147

🎯 Confusion Matrix:
              Neg    Neu    Pos
  Negative    283     79     40
   Neutral     13    319     21
  Positive     15     46     86

📋 Detailed Classification Report:
              precision    recall  f1-score   support

    Negative       0.91      0.70      0.79       402
     Neutral       0.72      0.90      0.80       353
    Positive       0.59      0.59      0.59       147

    accuracy                           0.76       902
   macro avg       0.74      0.73      0.73       902
weighted avg       0.78      0.76      0.76       902


✅ MULT


