In [1]:
# =============================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# =============================================================
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import models, transforms
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string

# =============================================================
# ✅ 2️⃣ DEVICE
# =============================================================
# Run on the GPU when CUDA reports itself as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# =============================================================
# ✅ 3️⃣ DATA PATHS
# =============================================================
# Directory that the CSV's 'image_path' filenames are joined onto when rows are loaded.
image_dir = "/kaggle/input/basem/images"
# CSV read with pandas; the loader uses its 'image_path', 'extracted_text' and 'label 2' columns.
input_csv = "/kaggle/input/basem/dataset.csv"

# =============================================================
# ✅ 4️⃣ LOAD DATA
# =============================================================
df = pd.read_csv(input_csv)

# Resolve every filename against image_dir and keep only the rows whose image
# file is actually present on disk.
full_image_paths = df['image_path'].map(lambda name: os.path.join(image_dir, name))
has_image = full_image_paths.map(os.path.exists).astype(bool)
if not has_image.any():
    # Fail early with a readable message instead of an opaque error further down.
    raise FileNotFoundError(
        f"None of the images listed in {input_csv} were found under {image_dir}"
    )

processed_df = pd.DataFrame({
    'Image_path': full_image_paths[has_image],
    # Missing or non-string text would crash the regex cleaning in the dataset,
    # so normalise every caption to a plain string here (missing -> "").
    'Captions': df.loc[has_image, 'extracted_text'].map(
        lambda text: '' if pd.isna(text) else str(text)
    ),
    # Shift the labels down by one so the smallest class id becomes 0.
    'Label': df.loc[has_image, 'label 2'] - 1,
}).reset_index(drop=True)

# Stratified 70 / 20 / 10 split: 30% is held out first, then one third of that
# hold-out (about 10% overall) becomes the validation set.
train_df, temp_df = train_test_split(processed_df, test_size=0.3, stratify=processed_df['Label'], random_state=42)
# NOTE: train_test_split returns (larger part, `test_size` part), so the FIRST
# result is the test set (2/3 of the hold-out) and the SECOND is validation.
test_df, val_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['Label'], random_state=42)

# =============================================================
# ✅ 5️⃣ TEXT TOKENIZER & IMAGE TRANSFORMS
# =============================================================
# Tokenizer loaded from the same checkpoint name used for the text encoder.
bert_tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")


def _build_transform(augment):
    """Return the image pipeline; ``augment=True`` inserts a random horizontal flip."""
    steps = [transforms.Resize((224, 224))]
    if augment:
        steps.append(transforms.RandomHorizontalFlip())
    steps.append(transforms.ToTensor())
    # NOTE(review): these look like the usual ImageNet channel statistics - confirm.
    steps.append(
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    )
    return transforms.Compose(steps)


# Only the training pipeline is randomised; the evaluation one has no flip.
transform_train = _build_transform(augment=True)
transform_val = _build_transform(augment=False)

# =============================================================
# ✅ 6️⃣ MULTIMODAL DATASET
# =============================================================
class MultimodalDataset(Dataset):
    """Dataset yielding a transformed image, a tokenized caption and a label per row.

    Args:
        df: Table of samples read positionally with ``iloc``; each row must
            provide ``'Image_path'``, ``'Captions'`` and ``'Label'``.
        tokenizer: Callable invoked with the cleaned caption plus
            ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors='pt'``; its result is indexed with
            ``'input_ids'`` and ``'attention_mask'``.
        transform: Callable applied to the RGB PIL image.
        max_length: Length every caption is padded / truncated to.
    """

    # Built once instead of on every sample.
    _URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
    _TAG_PATTERN = re.compile(r'<.*?>')
    # NOTE: string.punctuation only contains ASCII punctuation characters.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, df, tokenizer, transform, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _clean_caption(caption):
        """Strip URLs, ``<...>`` tags and ASCII punctuation, then collapse whitespace.

        ``None`` and NaN (how pandas represents an empty CSV cell) become an
        empty string instead of raising ``TypeError`` inside ``re.sub``; any
        other non-string value is converted with ``str()`` first.
        """
        if caption is None:
            return ""
        if isinstance(caption, float) and caption != caption:  # NaN is the only float != itself
            return ""
        text = str(caption)
        text = MultimodalDataset._URL_PATTERN.sub('', text)
        text = MultimodalDataset._TAG_PATTERN.sub('', text)
        text = text.translate(MultimodalDataset._PUNCT_TABLE)
        return " ".join(text.split())

    def __getitem__(self, idx):
        """Return a dict with ``'image'``, ``'input_ids'``, ``'attention_mask'`` and ``'label'``."""
        row = self.df.iloc[idx]

        # convert() produces an independent image, so the underlying file can be
        # closed right away instead of lingering until garbage collection.
        with Image.open(row['Image_path']) as raw_image:
            image = raw_image.convert('RGB')
        image = self.transform(image)

        caption = self._clean_caption(row['Captions'])
        inputs = self.tokenizer(
            caption, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt'
        )

        label = torch.tensor(row['Label'], dtype=torch.long)

        return {
            'image': image,
            # Drop only the leading size-1 dimension added by return_tensors='pt';
            # a bare squeeze() would also collapse the sequence axis when it has length 1.
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': label
        }

# =============================================================
# ✅ 7️⃣ DATALOADERS
# =============================================================
batch_size = 16

# One dataset per split: training uses transform_train, validation and test use transform_val.
train_ds = MultimodalDataset(df=train_df, tokenizer=bert_tokenizer, transform=transform_train)
val_ds, test_ds = (
    MultimodalDataset(df=split_df, tokenizer=bert_tokenizer, transform=transform_val)
    for split_df in (val_df, test_df)
)

# Settings shared by every loader; only the training loader shuffles.
_loader_options = {'batch_size': batch_size, 'num_workers': 2}
train_loader = DataLoader(train_ds, shuffle=True, **_loader_options)
val_loader = DataLoader(val_ds, **_loader_options)
test_loader = DataLoader(test_ds, **_loader_options)

# =============================================================
# ✅ 8️⃣ MULTIMODAL MODEL
# =============================================================
class MultimodalClassifier(nn.Module):
    """Classifier over concatenated text and image features.

    Args:
        text_model: Encoder exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state`` when called with ``input_ids``
            and ``attention_mask``.
        vision_model: Backbone with a ``classifier`` attribute exposing
            ``in_features``; that attribute is replaced by ``nn.Identity`` on
            the object passed in, so the backbone then outputs its features.
        num_classes: Size of the output logit vector.
    """

    def __init__(self, text_model, vision_model, num_classes=3):
        super().__init__()
        # Read the feature widths before the vision head is swapped out.
        text_width = text_model.config.hidden_size
        vision_width = vision_model.classifier.in_features
        vision_model.classifier = nn.Identity()

        self.text_model = text_model
        self.vision_model = vision_model

        head_layers = [
            nn.Linear(text_width + vision_width, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, images):
        """Return logits computed from the fused text and image embeddings."""
        encoded = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        # The text embedding is the hidden state at the first token position.
        first_token_state = encoded.last_hidden_state[:, 0]
        image_features = self.vision_model(images)

        fused = torch.cat([first_token_state, image_features], dim=1)
        return self.classifier(fused)

# Load pretrained components
# Text encoder: loaded from the same checkpoint name as the tokenizer.
text_model = AutoModel.from_pretrained("sagorsarker/bangla-bert-base")
# `pretrained=True` is deprecated in torchvision (>= 0.13); the weights enum
# requests the same ImageNet checkpoint explicitly and avoids the warning.
vision_model = models.densenet161(weights=models.DenseNet161_Weights.IMAGENET1K_V1)

# nn.Module.to() moves submodules in place, so moving the assembled model once
# places both backbones and the fusion head on the target device.
model = MultimodalClassifier(text_model, vision_model, num_classes=3).to(device)

# =============================================================
# ✅ 9️⃣ LOSS, OPTIMIZER, SCHEDULER
# =============================================================
# Per-class sample counts of the training split, ordered by ascending label value.
# (Despite its name, `class_weights` holds the raw counts; `weights` holds the weights.)
label_counts = train_df['Label'].value_counts().sort_index()
class_weights = label_counts.tolist()
total = sum(class_weights)

# Inverse-frequency weighting: the rarer a class, the larger its loss weight.
weights = [total / count for count in class_weights]
weight_tensor = torch.tensor(weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=weight_tensor)

optimizer = AdamW(model.parameters(), lr=2e-5)
# Halve the learning rate after every 5 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# =============================================================
# ✅ 🔟 TRAINING LOOP
# =============================================================
num_epochs = 20
patience = 3
patience_counter = 0
best_val_loss = float('inf')

print(f"Train: {len(train_ds)}, Val: {len(val_ds)}, Test: {len(test_ds)}")


def _run_epoch(loader, desc, train):
    """Run one pass over ``loader`` and return ``(mean_batch_loss, preds, labels)``.

    With ``train=True`` the model is put in training mode and the optimizer is
    stepped after every batch; with ``train=False`` the model is put in eval
    mode and gradient tracking is disabled. The returned loss is the mean of
    the per-batch loss values.
    """
    model.train(train)
    running_loss = 0
    epoch_preds, epoch_labels = [], []

    with torch.set_grad_enabled(train):
        for batch in tqdm(loader, desc=desc):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels_batch = batch['label'].to(device)

            if train:
                optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, images)
            loss = criterion(outputs, labels_batch)
            if train:
                loss.backward()
                optimizer.step()

            running_loss += loss.item()
            epoch_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            epoch_labels.extend(labels_batch.cpu().numpy())

    return running_loss / len(loader), epoch_preds, epoch_labels


for epoch in range(num_epochs):
    avg_train_loss, preds, labels = _run_epoch(train_loader, f"Train Epoch {epoch+1}", train=True)
    train_acc = accuracy_score(labels, preds)

    avg_val_loss, val_preds, val_labels = _run_epoch(val_loader, f"Val Epoch {epoch+1}", train=False)
    val_acc = accuracy_score(val_labels, val_preds)
    scheduler.step()

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f}, Acc: {train_acc:.4f} | Val Loss: {avg_val_loss:.4f}, Acc: {val_acc:.4f}")

    # Checkpoint on every validation-loss improvement; stop once `patience`
    # consecutive epochs pass without one.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_multimodal.pt")
        print("✅ Saved best model")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("🛑 Early stopping")
            break

# =============================================================
# ✅ 1️⃣1️⃣ FINAL TEST
# =============================================================
# Restore the checkpoint written on the best validation loss. map_location
# remaps the saved tensors onto the current device, so a checkpoint saved on a
# GPU still loads when this runs in a CPU-only session.
model.load_state_dict(torch.load("best_multimodal.pt", map_location=device))
model.eval()
test_preds, test_labels = [], []

# Inference only: no gradients are needed while collecting predictions.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test"):
        images = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels_batch = batch['label'].to(device)

        outputs = model(input_ids, attention_mask, images)
        test_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
        test_labels.extend(labels_batch.cpu().numpy())

acc = accuracy_score(test_labels, test_preds)
# 'weighted' averages the per-class scores weighted by each class's support.
prec, rec, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average='weighted')
cm = confusion_matrix(test_labels, test_preds)

print(f"\n📊 FINAL MULTIMODAL TEST RESULTS\nAccuracy: {acc:.4f}\nPrecision: {prec:.4f}\nRecall: {rec:.4f}\nF1: {f1:.4f}\nConfusion Matrix:\n{cm}")


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

2025-07-08 06:32:07.860001: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751956328.082547      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751956328.156186      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/densenet161-8d451a50.pth" to /root/.cache/torch/hub/checkpoints/densenet161-8d451a50.pth
100%|██████████| 110M/110M [00:00<00:00, 229MB/s] 


Train: 3156, Val: 451, Test: 902


Train Epoch 1: 100%|██████████| 198/198 [01:27<00:00,  2.25it/s]
Val Epoch 1: 100%|██████████| 29/29 [00:08<00:00,  3.44it/s]


Epoch 1 | Train Loss: 0.8863, Acc: 0.6017 | Val Loss: 0.8176, Acc: 0.6563
✅ Saved best model


Train Epoch 2: 100%|██████████| 198/198 [01:26<00:00,  2.30it/s]
Val Epoch 2: 100%|██████████| 29/29 [00:07<00:00,  4.11it/s]


Epoch 2 | Train Loss: 0.6202, Acc: 0.7639 | Val Loss: 0.8034, Acc: 0.6984
✅ Saved best model


Train Epoch 3: 100%|██████████| 198/198 [01:26<00:00,  2.30it/s]
Val Epoch 3: 100%|██████████| 29/29 [00:06<00:00,  4.22it/s]


Epoch 3 | Train Loss: 0.3683, Acc: 0.8695 | Val Loss: 1.0281, Acc: 0.6497


Train Epoch 4: 100%|██████████| 198/198 [01:25<00:00,  2.31it/s]
Val Epoch 4: 100%|██████████| 29/29 [00:07<00:00,  4.06it/s]


Epoch 4 | Train Loss: 0.1876, Acc: 0.9312 | Val Loss: 1.2610, Acc: 0.6275


Train Epoch 5: 100%|██████████| 198/198 [01:26<00:00,  2.30it/s]
Val Epoch 5: 100%|██████████| 29/29 [00:08<00:00,  3.27it/s]


Epoch 5 | Train Loss: 0.0975, Acc: 0.9715 | Val Loss: 1.4752, Acc: 0.6718
🛑 Early stopping


Test: 100%|██████████| 57/57 [00:18<00:00,  3.07it/s]


📊 FINAL MULTIMODAL TEST RESULTS
Accuracy: 0.7461
Precision: 0.7547
Recall: 0.7461
F1: 0.7462
Confusion Matrix:
[[296  80  26]
 [ 35 292  26]
 [ 16  46  85]]



