In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import ViTImageProcessor, ViTModel, AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import torch.nn as nn
import re
import string

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
df = pd.read_csv(input_csv)

# Build one record per CSV row whose image file is actually present under `image_dir`;
# rows pointing at a missing file are skipped.
existing_data = []
for _, row in df.iterrows():
    full_image_path = os.path.join(image_dir, row['image_path'])
    if not os.path.exists(full_image_path):
        continue
    existing_data.append({
        'Image_path': full_image_path,
        'Captions': row['extracted_text'],
        # The 'label 2' column is shifted down by one before being used as the class id.
        'Label_Sentiment': row['label 2'] - 1,
    })

processed_df = pd.DataFrame(existing_data)

# ================================================
# ✅ 4️⃣ TEXT CLEANING
# ================================================
def clean_text(text):
    """Normalise one caption string.

    Steps, in order: strip URLs (``http(s)://...`` and ``www....``), strip
    ``<...>`` tags, delete every character in ``string.punctuation`` and
    collapse all runs of whitespace to single spaces.

    Args:
        text: The raw caption. Missing values (``None`` / NaN) are accepted, and
            any other non-string scalar is converted with ``str()`` first.

    Returns:
        The cleaned caption, or ``""`` when ``text`` is missing.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Non-string scalars (e.g. a numeric cell) would make re.sub raise TypeError.
        text = str(text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    # NOTE(review): string.punctuation is ASCII-only, so non-ASCII punctuation
    # (for example the danda "।") is left in the text.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join(text.split())
    return text

# Blank out missing captions *before* the string cast: `astype(str)` on its own turns NaN
# into the literal text "nan", which would then be cleaned and tokenised as a real caption.
processed_df['Captions'] = processed_df['Captions'].fillna('').astype(str).apply(clean_text)

# ================================================
# ✅ 5️⃣ SPLIT DATA
# ================================================
# First cut: 70 % train, 30 % held out, stratified on the sentiment label.
train_df, temp_df = train_test_split(
    processed_df,
    test_size=0.3,
    stratify=processed_df['Label_Sentiment'],
    random_state=42,
)
# Second cut of the held-out 30 %: the first return value (test_df) keeps 2/3 of it and
# the second (val_df) gets the remaining 1/3, i.e. roughly 20 % / 10 % of all rows.
test_df, val_df = train_test_split(
    temp_df,
    test_size=1/3,
    stratify=temp_df['Label_Sentiment'],
    random_state=42,
)

# ================================================
# ✅ 6️⃣ LOAD MODELS
# ================================================
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hub checkpoint ids, named once so each processor/model pair is loaded from the same source.
VIT_CHECKPOINT = "google/vit-base-patch16-224"
BERT_CHECKPOINT = "sagorsarker/bangla-bert-base"

vit_processor = ViTImageProcessor.from_pretrained(VIT_CHECKPOINT)
vit_model = ViTModel.from_pretrained(VIT_CHECKPOINT).to(device)
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = AutoModel.from_pretrained(BERT_CHECKPOINT).to(device)

# ================================================
# ✅ 7️⃣ DATASET CLASS
# ================================================
class MultimodalDataset(Dataset):
    """Map-style dataset that turns one DataFrame row into model-ready tensors.

    Each row must provide the columns ``Image_path``, ``Captions`` and
    ``Label_Sentiment``.

    Args:
        df: DataFrame holding the examples; rows are addressed positionally.
        vit_processor: Callable image processor; called as
            ``vit_processor(image, return_tensors="pt")`` and expected to return a
            mapping with a ``pixel_values`` entry.
        bert_tokenizer: Callable tokenizer; expected to return a mapping with
            ``input_ids`` and ``attention_mask`` entries.
        max_length: Length every caption is padded / truncated to.
    """

    def __init__(self, df, vit_processor, bert_tokenizer, max_length=128):
        self.df = df
        self.vit_processor = vit_processor
        self.bert_tokenizer = bert_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load and preprocess the example at positional index ``idx``.

        Returns:
            Tuple ``(pixel_values, input_ids, attention_mask, label)`` where the
            three tensors have their leading batch dimension squeezed away and
            ``label`` is a plain Python ``int``.
        """
        row = self.df.iloc[idx]

        # Open the file inside a context manager so its handle is released
        # deterministically instead of whenever the image object is collected.
        with Image.open(row['Image_path']) as img:
            image = img.convert('RGB')
        caption = row['Captions']
        # Cast to a builtin int so the collated label tensor is always integer-typed.
        label = int(row['Label_Sentiment'])

        image_inputs = self.vit_processor(image, return_tensors="pt")
        pixel_values = image_inputs['pixel_values'].squeeze(0)

        text_inputs = self.bert_tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = text_inputs['input_ids'].squeeze(0)
        attention_mask = text_inputs['attention_mask'].squeeze(0)

        return pixel_values, input_ids, attention_mask, label

def collate_fn(batch):
    """Merge a list of per-example tuples into one batched tuple.

    Args:
        batch: Sequence of ``(pixel_values, input_ids, attention_mask, label)``
            tuples.

    Returns:
        ``(pixel_values, input_ids, attention_mask, labels)`` where the first
        three are stacked along a new leading dimension and ``labels`` is a
        1-D tensor built from the individual labels.
    """
    images, token_ids, masks, targets = zip(*batch)
    stacked = tuple(torch.stack(column) for column in (images, token_ids, masks))
    return (*stacked, torch.tensor(targets))

# ================================================
# ✅ 8️⃣ DATALOADERS
# ================================================
batch_size = 8

# One dataset per split, all sharing the same image processor and tokenizer.
train_dataset = MultimodalDataset(train_df, vit_processor, bert_tokenizer)
val_dataset = MultimodalDataset(val_df, vit_processor, bert_tokenizer)
test_dataset = MultimodalDataset(test_df, vit_processor, bert_tokenizer)

# Only the training loader shuffles; validation and test keep the DataFrame order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

# ================================================
# ✅ 9️⃣ MULTIMODAL MODEL
# ================================================
class MultimodalClassifier(nn.Module):
    """Concatenation-based fusion classifier over a vision and a text encoder.

    The first token of each encoder's ``last_hidden_state`` is taken as that
    modality's embedding. The two embeddings are concatenated, projected back
    to ``hidden_size``, passed through dropout and mapped to class logits.

    Args:
        vit_model: Vision encoder; called with ``pixel_values=...`` and expected
            to return an object exposing ``last_hidden_state``.
        bert_model: Text encoder; called with ``input_ids=...`` and
            ``attention_mask=...`` and expected to return an object exposing
            ``last_hidden_state``.
        hidden_size: Embedding width, assumed identical for both encoders.
        num_classes: Number of output classes.
        dropout: Dropout probability applied after the fusion projection.
    """

    def __init__(self, vit_model, bert_model, hidden_size=768, num_classes=3, dropout=0.3):
        super().__init__()
        self.vit = vit_model
        self.bert = bert_model

        self.fusion = nn.Linear(hidden_size * 2, hidden_size)
        self.classifier = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, pixel_values, input_ids, attention_mask):
        """Return unnormalised class logits with one row per example."""
        vision_outputs = self.vit(pixel_values=pixel_values)
        # Position 0 of the sequence is used as the image-level embedding.
        vision_embed = vision_outputs.last_hidden_state[:, 0, :]

        text_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence is used as the sentence-level embedding.
        text_embed = text_outputs.last_hidden_state[:, 0, :]

        fused = torch.cat((vision_embed, text_embed), dim=1)
        # NOTE(review): there is no non-linearity between `fusion` and `classifier`,
        # so apart from dropout the head is a composition of two linear maps.
        fused = self.dropout(self.fusion(fused))
        logits = self.classifier(fused)

        return logits

# ================================================
# ✅ 🔟 INIT MODEL
# ================================================
# Seed PyTorch's global RNG (same value as the split's random_state) so the randomly
# initialised fusion/classifier layers, the dropout masks and the shuffle order drawn
# from the global RNG start from the same state on every "Restart & Run All".
torch.manual_seed(42)

model = MultimodalClassifier(vit_model, bert_model).to(device)

# ================================================
# ✅ 1️⃣1️⃣ LOSS & OPTIMIZER
# ================================================
# Per-class example counts, indexed by class id 0..num_classes-1. Reindexing over every
# class id keeps the weight vector aligned with the classifier outputs even if some
# class happens to be absent from the training split.
num_classes = model.classifier.out_features
class_weights = (
    train_df['Label_Sentiment']
    .value_counts()
    .reindex(range(num_classes), fill_value=0)
    .tolist()
)
total = sum(class_weights)
# Inverse-frequency weighting; a class with no training examples gets weight 0
# instead of triggering a division by zero.
weights = [total / c if c else 0.0 for c in class_weights]
criterion = nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float, device=device))
optimizer = AdamW(model.parameters(), lr=2e-5)

# ================================================
# ✅ 1️⃣2️⃣ TRAINING LOOP
# ================================================
num_epochs = 10
patience = 3  # consecutive epochs without a validation-F1 improvement before stopping
patience_counter = 0
# Start below any attainable F1 so the first epoch always writes a checkpoint. With a
# start value of 0 and the strict ">" test below, a run whose validation F1 never rises
# above 0 would save nothing, and loading "best_multimodal_model.pt" afterwards would
# fail or silently pick up a stale file from an earlier run.
best_val_f1 = float('-inf')

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_train_loss = 0
    train_preds, train_labels = [], []

    for pixel_values, input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        pixel_values = pixel_values.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = model(pixel_values, input_ids, attention_mask)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Training predictions are collected for inspection; this loop does not use them.
        preds = torch.argmax(logits, dim=1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    # Mean of the per-batch losses.
    avg_train_loss = total_train_loss / len(train_loader)

    # ---- validation pass (no gradients, dropout disabled) ----
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for pixel_values, input_ids, attention_mask, labels in tqdm(val_loader, desc=f"Val Epoch {epoch+1}"):
            pixel_values = pixel_values.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(pixel_values, input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    # Index 2 of the returned tuple is the F-score.
    val_f1 = precision_recall_fscore_support(val_labels, val_preds, average='weighted')[2]
    print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {avg_train_loss:.4f} | Val F1: {val_f1:.4f}")

    # ---- checkpointing and early stopping on validation F1 ----
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), "best_multimodal_model.pt")
        print("✅ Val F1 improved — model saved.")
    else:
        patience_counter += 1
        print(f"⏰ No improvement — patience {patience_counter}/{patience}")
        if patience_counter >= patience:
            print(f"🛑 Early stopping at epoch {epoch+1}")
            break

# ================================================
# ✅ 1️⃣3️⃣ TEST
# ================================================
# Restore the checkpoint with the best validation F1. `map_location=device` lets a
# checkpoint written on a GPU be loaded when the current kernel only has a CPU.
model.load_state_dict(torch.load("best_multimodal_model.pt", map_location=device))
model.eval()

test_preds, test_labels = [], []

# Inference only: no gradients are needed.
with torch.no_grad():
    for pixel_values, input_ids, attention_mask, labels in tqdm(test_loader, desc="Test"):
        pixel_values = pixel_values.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = model(pixel_values, input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Index 2 of the returned tuple is the F-score.
test_f1 = precision_recall_fscore_support(test_labels, test_preds, average='weighted')[2]
test_acc = accuracy_score(test_labels, test_preds)
print(f"Test Accuracy: {test_acc:.4f} | Test Weighted F1: {test_f1:.4f}")


2025-07-08 05:31:46.122497: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751952706.357950      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751952706.421394      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Train Epoch 1: 100%|██████████| 395/395 [04:08<00:00,  1.59it/s]
Val Epoch 1: 100%|██████████| 57/57 [00:23<00:00,  2.48it/s]


Epoch [1/10] Train Loss: 0.8662 | Val F1: 0.6674
✅ Val F1 improved — model saved.


Train Epoch 2: 100%|██████████| 395/395 [03:30<00:00,  1.88it/s]
Val Epoch 2: 100%|██████████| 57/57 [00:19<00:00,  2.99it/s]


Epoch [2/10] Train Loss: 0.4885 | Val F1: 0.6648
⏰ No improvement — patience 1/3


Train Epoch 3: 100%|██████████| 395/395 [03:31<00:00,  1.87it/s]
Val Epoch 3: 100%|██████████| 57/57 [00:19<00:00,  2.98it/s]


Epoch [3/10] Train Loss: 0.1950 | Val F1: 0.6600
⏰ No improvement — patience 2/3


Train Epoch 4: 100%|██████████| 395/395 [03:30<00:00,  1.88it/s]
Val Epoch 4: 100%|██████████| 57/57 [00:19<00:00,  2.97it/s]


Epoch [4/10] Train Loss: 0.0697 | Val F1: 0.7068
✅ Val F1 improved — model saved.


Train Epoch 5: 100%|██████████| 395/395 [03:31<00:00,  1.86it/s]
Val Epoch 5: 100%|██████████| 57/57 [00:19<00:00,  2.97it/s]


Epoch [5/10] Train Loss: 0.0433 | Val F1: 0.6835
⏰ No improvement — patience 1/3


Train Epoch 6: 100%|██████████| 395/395 [03:32<00:00,  1.86it/s]
Val Epoch 6: 100%|██████████| 57/57 [00:19<00:00,  2.92it/s]


Epoch [6/10] Train Loss: 0.0360 | Val F1: 0.7177
✅ Val F1 improved — model saved.


Train Epoch 7: 100%|██████████| 395/395 [03:33<00:00,  1.85it/s]
Val Epoch 7: 100%|██████████| 57/57 [00:19<00:00,  2.92it/s]


Epoch [7/10] Train Loss: 0.0279 | Val F1: 0.6970
⏰ No improvement — patience 1/3


Train Epoch 8: 100%|██████████| 395/395 [03:28<00:00,  1.89it/s]
Val Epoch 8: 100%|██████████| 57/57 [00:19<00:00,  2.98it/s]


Epoch [8/10] Train Loss: 0.0149 | Val F1: 0.7074
⏰ No improvement — patience 2/3


Train Epoch 9: 100%|██████████| 395/395 [03:32<00:00,  1.86it/s]
Val Epoch 9: 100%|██████████| 57/57 [00:19<00:00,  2.86it/s]


Epoch [9/10] Train Loss: 0.0134 | Val F1: 0.6835
⏰ No improvement — patience 3/3
🛑 Early stopping at epoch 9


Test: 100%|██████████| 113/113 [00:46<00:00,  2.44it/s]

Test Accuracy: 0.7472 | Test Weighted F1: 0.7439





In [2]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Weighted-average precision / recall / F1 and plain accuracy on the test predictions.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels,
    test_preds,
    average='weighted',
)
test_accuracy = accuracy_score(test_labels, test_preds)

# Emit the four metrics as one newline-separated report.
report_lines = [
    f"Test Accuracy: {test_accuracy:.4f}",
    f"Test Precision: {precision:.4f}",
    f"Test Recall: {recall:.4f}",
    f"Test Weighted F1: {f1:.4f}",
]
print("\n".join(report_lines))


Test Accuracy: 0.7472
Test Precision: 0.7442
Test Recall: 0.7472
Test Weighted F1: 0.7439
