In [1]:
# ===============================================================
# ✅ LIBRARIES & SETUP
# ===============================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor, SwinForImageClassification
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import torch.nn as nn
import re
import string

# ===============================================================
# ✅ PATHS
# ===============================================================
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# ===============================================================
# ✅ LOAD & PREPROCESS CSV
# ===============================================================
df = pd.read_csv(input_csv)

# Column layout of the processed frame. Passing it explicitly to the DataFrame
# constructor keeps the schema stable even when no row survives the filter
# (a bare pd.DataFrame([]) would have no columns at all).
_PROCESSED_COLUMNS = ['Image_path', 'Captions', 'Label_Sentiment']

# Keep only the rows whose image file is present under image_dir. The label is
# shifted down by one ('label 2' - 1); presumably the CSV stores 1-based class
# ids - TODO confirm against the dataset description.
existing_data = [
    {
        'Image_path': full_image_path,
        'Captions': caption,
        'Label_Sentiment': raw_label - 1,
    }
    for full_image_path, caption, raw_label in zip(
        (os.path.join(image_dir, name) for name in df['image_path']),
        df['extracted_text'],
        df['label 2'],
    )
    if os.path.exists(full_image_path)
]

# Fail here with an actionable message instead of letting a later step crash
# with an unrelated-looking KeyError on a column-less frame.
if not existing_data:
    raise FileNotFoundError(
        f"None of the {len(df)} rows in {input_csv} point to an existing image under {image_dir}"
    )

processed_df = pd.DataFrame(existing_data, columns=_PROCESSED_COLUMNS)

# Text cleaning
def clean_text(text):
    """Normalise one caption string.

    Missing values (anything ``pd.isna`` reports as NA) become ``""``.
    Otherwise URLs and ``<...>`` tags are removed, every character listed in
    ``string.punctuation`` is dropped, and runs of whitespace are collapsed
    to single spaces with leading/trailing whitespace stripped.
    """
    if pd.isna(text):
        return ""

    without_urls = re.sub(r'https?://\S+|www\.\S+', '', text)
    without_tags = re.sub(r'<.*?>', '', without_urls)
    # Only the ASCII punctuation in string.punctuation is removed; any other
    # symbol passes through untouched.
    kept_chars = "".join(ch for ch in without_tags if ch not in string.punctuation)
    return " ".join(kept_chars.split())

# Replace missing captions with "" *before* casting to str: astype(str) on its
# own turns NaN into the literal text "nan", which would then be tokenised as
# if it were a real caption.
processed_df['Captions'] = processed_df['Captions'].fillna('').astype(str).apply(clean_text)

# ===============================================================
# ✅ DATA SPLITS
# ===============================================================
# First split: 70% train / 30% held out. Second split: the held-out part is
# divided 2:1, so test receives 2/3 of it and val receives 1/3.
# Both splits are stratified on the sentiment label.
train_df, temp_df = train_test_split(processed_df, test_size=0.3, stratify=processed_df['Label_Sentiment'], random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['Label_Sentiment'], random_state=42)

# Add the 'label' column via assign(), which returns a new frame, rather than
# writing into the frames returned by train_test_split (that in-place write
# triggers pandas' SettingWithCopyWarning).
train_df, test_df, val_df = (
    part.assign(label=part['Label_Sentiment']) for part in (train_df, test_df, val_df)
)

for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    df_.to_csv(f'/kaggle/working/{df_name}_cleaned.csv', index=False)

# ===============================================================
# ✅ LOAD MODELS
# ===============================================================
# Use CUDA when torch reports it as available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Image encoder: Swin Transformer checkpoint loaded with a 3-label
# classification head. ignore_mismatched_sizes=True lets loading proceed even
# though the requested head size differs from the one stored in the checkpoint.
# The matching image processor is loaded from the same checkpoint name.
swin_model_name = "microsoft/swin-base-patch4-window7-224"
processor = AutoImageProcessor.from_pretrained(swin_model_name)
swin_model = SwinForImageClassification.from_pretrained(
    swin_model_name,
    num_labels=3,
    ignore_mismatched_sizes=True
)

# Text encoder: the "sagorsarker/bangla-bert-base" checkpoint (a Bangla BERT
# model, not IndicDistilBERT). Tokenizer and model share the checkpoint name.
bert_model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)

# ===============================================================
# ✅ MULTIMODAL DATASET
# ===============================================================
class MultiModalDataset(Dataset):
    """Dataset yielding one (image, caption, label) example per DataFrame row.

    Each row of ``df`` is read through the ``'Image_path'``, ``'Captions'``
    and ``'label'`` columns. ``__getitem__`` returns the tuple
    ``(pixel_values, input_ids, attention_mask, label)``.

    Args:
        df: table of examples; indexed positionally with ``.iloc``.
        processor: callable applied to the RGB image with
            ``return_tensors="pt"``; its ``'pixel_values'`` entry is used.
        tokenizer: callable applied to the caption; its ``'input_ids'`` and
            ``'attention_mask'`` entries are used.
        max_length: length every caption is truncated/padded to.
    """

    def __init__(self, df, processor, tokenizer, max_length=128):
        self.df = df
        self.processor = processor
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, preprocess and return the example at position ``idx``."""
        row = self.df.iloc[idx]

        # Open inside a context manager so the file handle is released as
        # soon as the pixels have been decoded; convert() returns a new,
        # fully loaded image that stays valid after the file is closed.
        with Image.open(row['Image_path']) as raw_image:
            image = raw_image.convert('RGB')
        text = row['Captions']
        # Plain Python int: a missing or non-numeric label fails here, at the
        # offending row, instead of surfacing later inside the loss.
        label = int(row['label'])

        # Image
        image_inputs = self.processor(image, return_tensors="pt")
        # squeeze(0) drops the leading dimension (presumably the size-1 batch
        # axis added by return_tensors="pt" - TODO confirm).
        pixel_values = image_inputs['pixel_values'].squeeze(0)

        # Text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return pixel_values, input_ids, attention_mask, label

def collate_fn(batch):
    """Merge a list of dataset items into batched tensors.

    ``batch`` is a sequence of ``(pixel_values, input_ids, attention_mask,
    label)`` tuples. The three tensor fields are stacked along a new leading
    dimension and the labels are packed into a single tensor; the result is
    returned as a 4-tuple in the same field order.
    """
    images, token_ids, masks, targets = zip(*batch)
    batched = [torch.stack(field) for field in (images, token_ids, masks)]
    batched.append(torch.tensor(targets))
    return tuple(batched)

# ===============================================================
# ✅ DATALOADERS
# ===============================================================
batch_size = 8

# Wrap each split in a dataset first, then hand it to a loader.
train_dataset = MultiModalDataset(train_df, processor, tokenizer)
val_dataset = MultiModalDataset(val_df, processor, tokenizer)
test_dataset = MultiModalDataset(test_df, processor, tokenizer)

# Only the training loader shuffles; validation and test keep DataLoader's
# default ordering.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
)

# ===============================================================
# ✅ MULTIMODAL MODEL
# ===============================================================
class MultiModalFusion(nn.Module):
    """Classifier over concatenated image and text features.

    The image model's ``classifier`` attribute is replaced by ``nn.Identity``
    (note: this modifies the ``swin_model`` object that was passed in), so its
    ``.logits`` output carries whatever was fed to that head - presumably the
    pooled image features of width ``config.hidden_size``; TODO confirm.
    The text feature is the first-position vector of the text model's
    ``last_hidden_state``. Both are concatenated and passed through a small
    MLP head producing ``num_classes`` logits.
    """

    def __init__(self, swin_model, bert_model, num_classes=3):
        super().__init__()
        self.swin, self.bert = swin_model, bert_model
        # Drop the image classification head so the backbone output is exposed.
        self.swin.classifier = nn.Identity()

        fused_dim = self.swin.config.hidden_size + self.bert.config.hidden_size
        head_layers = [
            nn.Linear(fused_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, pixel_values, input_ids, attention_mask):
        """Return class logits for a batch of image/caption pairs."""
        image_feats = self.swin(pixel_values=pixel_values).logits
        token_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        first_token_feats = token_states[:, 0, :]  # first position, i.e. [CLS]
        joint = torch.cat((image_feats, first_token_feats), dim=1)
        return self.classifier(joint)

model = MultiModalFusion(swin_model, bert_model)
model = model.to(device)

# ===============================================================
# ✅ LOSS & OPTIMIZER
# ===============================================================
# Per-class example counts in the training split, ordered by class id.
label_frequencies = train_df['label'].value_counts().sort_index()
class_counts = label_frequencies.tolist()
total = sum(class_counts)
# Inverse-frequency weighting: the rarer a class, the larger its loss weight.
weights = [total / count for count in class_counts]
print(f"Class distribution: {class_counts}")
print(f"Class weights: {weights}")

class_weight_tensor = torch.tensor(weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weight_tensor)
optimizer = AdamW(model.parameters(), lr=2e-5)

# ===============================================================
# ✅ TRAINING LOOP
# ===============================================================
num_epochs = 20
patience = 3
patience_counter = 0
best_val_loss = float('inf')


def _run_epoch(net, loader, loss_fn, target_device, desc, optim=None):
    """Run one full pass over ``loader`` and return (mean loss, preds, labels).

    When ``optim`` is given the pass is a training pass: the network is put in
    train mode and every batch is followed by a backward pass and an optimizer
    step. Without ``optim`` the network is put in eval mode and gradients are
    disabled. The returned loss is the mean of the per-batch losses.
    """
    training = optim is not None
    net.train(training)  # train(False) is the same as eval()

    running_loss = 0
    all_preds, all_labels = [], []

    with torch.set_grad_enabled(training):
        for pixel_values, input_ids, attention_mask, labels in tqdm(loader, desc=desc):
            pixel_values = pixel_values.to(target_device)
            input_ids = input_ids.to(target_device)
            attention_mask = attention_mask.to(target_device)
            labels = labels.to(target_device)

            if training:
                optim.zero_grad()
            logits = net(pixel_values, input_ids, attention_mask)
            loss = loss_fn(logits, labels)
            if training:
                loss.backward()
                optim.step()

            running_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return running_loss / len(loader), all_preds, all_labels


for epoch in range(num_epochs):
    avg_train_loss, train_preds, train_labels = _run_epoch(
        model, train_loader, criterion, device, f"Train Epoch {epoch+1}", optim=optimizer
    )
    train_acc = accuracy_score(train_labels, train_preds)

    avg_val_loss, val_preds, val_labels = _run_epoch(
        model, val_loader, criterion, device, f"Val Epoch {epoch+1}"
    )
    val_acc = accuracy_score(val_labels, val_preds)

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # Checkpoint on every improvement of the validation loss; stop once
    # `patience` consecutive epochs have failed to improve it.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_multimodal.pt")
        print("✅ Saved best model.")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("🛑 Early stopping.")
            break

# ===============================================================
# ✅ FINAL EVALUATION
# ===============================================================
# Reload the checkpoint with the lowest validation loss. map_location places
# the tensors on the device in use now, so the reload also works when the
# checkpoint was written on a different device.
model.load_state_dict(torch.load("best_multimodal.pt", map_location=device))
model.eval()

test_preds, test_labels = [], []
test_loss = 0

with torch.no_grad():
    for pixel_values, input_ids, attention_mask, labels in tqdm(test_loader, desc="Testing"):
        pixel_values = pixel_values.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = model(pixel_values, input_ids, attention_mask)
        loss = criterion(logits, labels)

        test_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Class ids and display names, kept side by side. Passing the ids explicitly
# keeps the confusion matrix at a fixed 3x3 shape and keeps the report rows
# aligned with the names even if a class never occurs in labels or predictions.
class_ids = [0, 1, 2]
class_names = ['Negative', 'Neutral', 'Positive']

test_acc = accuracy_score(test_labels, test_preds)
# zero_division=0 scores a never-predicted class as 0 without emitting a warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='weighted', zero_division=0
)
cm = confusion_matrix(test_labels, test_preds, labels=class_ids)

print("\n" + "="*60)
print("📊 FINAL TEST RESULTS - MULTIMODAL")
print("="*60)
print(f"Accuracy: {test_acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Weighted F1: {f1:.4f}")
print(f"Loss: {test_loss/len(test_loader):.4f}")
print(f"Confusion Matrix:\n{cm}")
print("\nClassification Report:")
print(classification_report(
    test_labels,
    test_preds,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,
))


2025-07-08 07:48:17.892965: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751960898.094357      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751960898.155289      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


preprocessor_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/352M [00:00<?, ?B/s]

Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-base-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 1024]) in the checkpoint and torch.Size([3, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Class distribution: [1404, 1237, 515]
Class weights: [2.247863247863248, 2.551333872271625, 6.128155339805825]


Train Epoch 1: 100%|██████████| 395/395 [04:09<00:00,  1.58it/s]
Val Epoch 1: 100%|██████████| 57/57 [00:23<00:00,  2.39it/s]


Epoch [1/20]
Train Loss: 0.8463 | Train Acc: 0.6343
Val Loss: 0.8132 | Val Acc: 0.6874
✅ Saved best model.


Train Epoch 2: 100%|██████████| 395/395 [03:53<00:00,  1.69it/s]
Val Epoch 2: 100%|██████████| 57/57 [00:21<00:00,  2.62it/s]


Epoch [2/20]
Train Loss: 0.5456 | Train Acc: 0.7906
Val Loss: 0.8236 | Val Acc: 0.6674


Train Epoch 3: 100%|██████████| 395/395 [03:52<00:00,  1.70it/s]
Val Epoch 3: 100%|██████████| 57/57 [00:21<00:00,  2.61it/s]


Epoch [3/20]
Train Loss: 0.2786 | Train Acc: 0.9056
Val Loss: 1.0457 | Val Acc: 0.7007


Train Epoch 4: 100%|██████████| 395/395 [03:53<00:00,  1.69it/s]
Val Epoch 4: 100%|██████████| 57/57 [00:21<00:00,  2.62it/s]


Epoch [4/20]
Train Loss: 0.1230 | Train Acc: 0.9623
Val Loss: 1.2953 | Val Acc: 0.6851
🛑 Early stopping.


Testing: 100%|██████████| 113/113 [00:45<00:00,  2.46it/s]


📊 FINAL TEST RESULTS - MULTIMODAL
Accuracy: 0.7162
Precision: 0.7733
Recall: 0.7162
Weighted F1: 0.7079
Loss: 0.7422
Confusion Matrix:
[[255 135  12]
 [ 15 334   4]
 [  8  82  57]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.92      0.63      0.75       402
     Neutral       0.61      0.95      0.74       353
    Positive       0.78      0.39      0.52       147

    accuracy                           0.72       902
   macro avg       0.77      0.66      0.67       902
weighted avg       0.77      0.72      0.71       902




