In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoTokenizer, AutoModel
from torchvision import models, transforms
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
df = pd.read_csv(input_csv)

# Keep only rows that have non-blank text, a usable label and an image file
# that actually exists on disk; every other row is dropped.
existing_data = []
for text, image_filename, raw_label in zip(
    df['extracted_text'], df['image_path'], df['label 2']
):
    # Non-string cells (e.g. NaN from empty CSV fields) would break
    # .strip() / os.path.join below, so they are filtered out up front.
    if not isinstance(text, str) or not text.strip():
        continue
    if not isinstance(image_filename, str) or pd.isna(raw_label):
        continue

    full_image_path = os.path.join(image_dir, image_filename)
    if not os.path.exists(full_image_path):
        continue

    existing_data.append({
        'text': text,
        'image': full_image_path,
        # Shift the label down by one (presumably 1-based in the CSV, so
        # this yields a 0-based class index -- confirm against the data).
        'label': int(raw_label) - 1,
    })

# Explicit columns keep the frame well-formed even when no row survives.
processed_df = pd.DataFrame(existing_data, columns=['text', 'image', 'label'])

# ================================================
# ✅ 4️⃣ SPLIT DATA
# ================================================
# Stratified 70 / 20 / 10 split. The first call holds out 30 % of the rows;
# the second call keeps two thirds of that hold-out as the test set and the
# remaining third as the validation set.
train_df, temp_df = train_test_split(
    processed_df,
    stratify=processed_df['label'],
    test_size=0.3,
    random_state=42,
)
test_df, val_df = train_test_split(
    temp_df,
    stratify=temp_df['label'],
    test_size=1/3,
    random_state=42,
)

# ================================================
# ✅ 5️⃣ TEXT CLEANING
# ================================================
def clean_text(text):
    """Return *text* stripped of URLs, angle-bracket tags and ASCII punctuation.

    Processing order: URLs (``http(s)://...`` or ``www....``) are removed
    first, then anything enclosed in ``<...>``, then every character listed
    in ``string.punctuation``. Finally all whitespace runs are collapsed to
    a single space and leading/trailing whitespace is dropped.
    """
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', text)
    without_tags = re.sub(r'<.*?>', '', without_urls)

    # Mapping each punctuation code point to None makes translate() delete it.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    without_punctuation = without_tags.translate(deletion_table)

    words = without_punctuation.split()
    return " ".join(words)

# Rebind each split to a new frame that carries the cleaned text. Assigning
# the column in place on frames derived from processed_df can trigger pandas'
# SettingWithCopyWarning; assign() returns a fresh frame and avoids that.
train_df, val_df, test_df = (
    frame.assign(text=frame['text'].astype(str).apply(clean_text))
    for frame in (train_df, val_df, test_df)
)

# ================================================
# ✅ 6️⃣ DEVICE & TRANSFORMS
# ================================================
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Image pipeline: fixed 224x224 resize, conversion to a tensor, then
# per-channel normalisation with the mean/std values listed below (these
# match the commonly used ImageNet statistics).
image_transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# ================================================
# ✅ 7️⃣ TOKENIZER & MODELS
# ================================================
# Text encoder: tokenizer and weights are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
muril = AutoModel.from_pretrained("google/muril-base-cased").to(device)

# Image encoder: DenseNet-161 with pretrained weights. `pretrained=True` is
# deprecated in newer torchvision releases in favour of the `weights` enum,
# so use the enum whenever this torchvision build provides it.
if hasattr(models, "DenseNet161_Weights"):
    vision_model = models.densenet161(weights=models.DenseNet161_Weights.IMAGENET1K_V1)
else:
    vision_model = models.densenet161(pretrained=True)

# Record the input width of the classification head, then replace the head
# with a pass-through so the network returns features instead of logits.
vision_feature_dim = vision_model.classifier.in_features
vision_model.classifier = nn.Identity()
vision_model = vision_model.to(device)

# Freeze feature extractors if you want (optional):
# for param in muril.parameters(): param.requires_grad = False
# for param in vision_model.parameters(): param.requires_grad = False

# ================================================
# ✅ 8️⃣ MULTIMODAL DATASET
# ================================================
class MultiModalDataset(Dataset):
    """Paired text/image samples backed by a DataFrame.

    Every row of ``df`` must provide the columns ``'text'`` (string to
    tokenize), ``'image'`` (path of an image file) and ``'label'`` (class id).

    Args:
        df: Frame holding one sample per row; rows are addressed by position.
        tokenizer: Callable used to encode the text. It is invoked with
            ``truncation``, ``padding``, ``max_length`` and
            ``return_tensors='pt'`` and must return a mapping containing
            ``'input_ids'`` and ``'attention_mask'``.
        transform: Callable applied to the RGB image before it is returned.
        max_length: Length every token sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, transform, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the backing frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict.

        Keys: ``'input_ids'`` and ``'attention_mask'`` (tokenizer output with
        its leading dimension squeezed away), ``'image'`` (output of
        ``transform``) and ``'label'`` (scalar ``torch.long`` tensor).
        """
        row = self.df.iloc[idx]

        # The context manager closes the underlying file deterministically;
        # convert() returns an independent image, so it stays valid afterwards.
        with Image.open(row['image']) as raw_image:
            image = raw_image.convert('RGB')
        image = self.transform(image)

        encoded = self.tokenizer(
            row['text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'image': image,
            # int() makes a missing (NaN) label fail loudly instead of being
            # cast to an arbitrary integer.
            'label': torch.tensor(int(row['label']), dtype=torch.long),
        }

# ================================================
# ✅ 9️⃣ MULTIMODAL MODEL
# ================================================
class MultiModalNet(nn.Module):
    """Classifier over concatenated text and image features.

    The text feature is the first-position vector of the text model's
    ``last_hidden_state`` (the [CLS] position for BERT-style encoders); it is
    concatenated with the image model's output and passed through a two-layer
    MLP head (Linear -> ReLU -> Dropout(0.3) -> Linear).

    Args:
        text_model: Callable invoked with ``input_ids`` and ``attention_mask``
            keyword arguments; its result must expose ``last_hidden_state``.
        image_model: Callable mapping the image batch to a 2-D feature batch.
        hidden_size: Width of the text feature vector.
        num_classes: Number of output logits.
        image_feature_dim: Width of the image feature vector. When omitted,
            the module-level ``vision_feature_dim`` is used, which keeps
            existing call sites working unchanged.
    """

    def __init__(self, text_model, image_model, hidden_size, num_classes=3,
                 image_feature_dim=None):
        super().__init__()
        if image_feature_dim is None:
            # Backward-compatible fallback to the global set up alongside
            # the vision backbone.
            image_feature_dim = vision_feature_dim

        self.text_model = text_model
        self.image_model = image_model
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size + image_feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, input_ids, attention_mask, image):
        """Return the class logits for a batch of (text, image) pairs."""
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        # First token position of every sequence serves as the text summary.
        text_feat = text_outputs.last_hidden_state[:, 0, :]
        image_feat = self.image_model(image)
        combined = torch.cat((text_feat, image_feat), dim=1)
        return self.classifier(combined)

# ================================================
# ✅ 🔟 TRAINING SETUP
# ================================================
# Hyper-parameters for the run.
batch_size = 16
num_epochs = 20
patience = 3  # epochs without validation-loss improvement before stopping
hidden_size = muril.config.hidden_size

model = MultiModalNet(muril, vision_model, hidden_size)
model = model.to(device)

# One dataset per split, all sharing the same tokenizer and image pipeline.
train_dataset, val_dataset, test_dataset = (
    MultiModalDataset(frame, tokenizer, image_transform)
    for frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; evaluation loaders keep row order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=2)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size, num_workers=2)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size, num_workers=2)

# Class weights: each class is weighted by (total samples / class samples),
# with the classes ordered by ascending label value.
class_counts = train_df['label'].value_counts().sort_index().tolist()
total = sum(class_counts)
weights = [total / count for count in class_counts]
criterion = nn.CrossEntropyLoss(
    weight=torch.tensor(weights, dtype=torch.float32, device=device)
)
optimizer = AdamW(model.parameters(), lr=2e-5)

# ================================================
# ✅ 1️⃣1️⃣ TRAINING LOOP
# ================================================
# Early-stopping state: lowest validation loss seen so far and the number of
# consecutive epochs that failed to beat it.
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(num_epochs):
    # ---------------- training pass ----------------
    model.train()
    total_train_loss = 0
    train_preds, train_labels = [], []

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Train"):
        ids, mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
        imgs, labels = batch['image'].to(device), batch['label'].to(device)

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        outputs = model(ids, mask, imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        train_preds += list(preds.cpu().numpy())
        train_labels += list(labels.cpu().numpy())

    avg_train_loss = total_train_loss / len(train_loader)
    train_acc = accuracy_score(train_labels, train_preds)

    # ---------------- validation pass ----------------
    model.eval()
    total_val_loss = 0
    val_preds, val_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Val"):
            ids, mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            imgs, labels = batch['image'].to(device), batch['label'].to(device)

            outputs = model(ids, mask, imgs)
            loss = criterion(outputs, labels)

            total_val_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            val_preds += list(preds.cpu().numpy())
            val_labels += list(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_acc = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} Acc: {train_acc:.4f} | Val Loss: {avg_val_loss:.4f} Acc: {val_acc:.4f}")

    # ---------------- checkpoint / early stopping ----------------
    if not avg_val_loss < best_val_loss:
        # No improvement this epoch: spend one unit of patience.
        patience_counter += 1
        print(f"⏰ Patience {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("🛑 Early stopping.")
            break
        continue

    # New best validation loss: reset patience and checkpoint the weights.
    best_val_loss = avg_val_loss
    patience_counter = 0
    torch.save(model.state_dict(), "best_multimodal.pt")
    print("✅ Saved best model.")

# ================================================
# ✅ 1️⃣2️⃣ TEST EVALUATION
# ================================================
# Restore the checkpoint with the lowest validation loss. map_location keeps
# the load working when the file was written on a different device than the
# one in use now (e.g. saved on GPU, evaluated on CPU).
model.load_state_dict(torch.load("best_multimodal.pt", map_location=device))
model.eval()
test_preds, test_labels_list, total_test_loss = [], [], 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test"):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        imgs = batch['image'].to(device)
        labels = batch['label'].to(device)

        outputs = model(ids, mask, imgs)
        loss = criterion(outputs, labels)
        total_test_loss += loss.item()

        preds = outputs.argmax(dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_labels_list.extend(labels.cpu().numpy())

# Mean of the per-batch losses; max() guards against an empty test loader.
avg_test_loss = total_test_loss / max(len(test_loader), 1)

acc = accuracy_score(test_labels_list, test_preds)
# Precision / recall / F1 are averaged over classes weighted by support.
precision, recall, f1, _ = precision_recall_fscore_support(test_labels_list, test_preds, average='weighted')
cm = confusion_matrix(test_labels_list, test_preds)

print("\n📊 FINAL TEST RESULTS:")
print(f"Loss: {avg_test_loss:.4f}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Weighted F1: {f1:.4f}")
print(f"Confusion Matrix:\n{cm}")

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

2025-07-08 05:52:45.395232: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751953965.568469      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751953965.614567      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/densenet161-8d451a50.pth" to /root/.cache/torch/hub/checkpoints/densenet161-8d451a50.pth

  0%|          | 0.00/110M [00:00<?, ?B/s][A
  7%|▋         | 8.25M/110M [00:00<00:01, 85.8MB/s][A
 15%|█▌        | 16.6M/110M [00:00<00:01, 86.9MB/s][A
 23%|██▎       | 25.8M/110M [00:00<00:00, 90.2MB/s][A
 31%|███       | 34.4M/110M [00:00<00:01, 75.6MB/s][A
 38%|███▊      | 42.2M/110M [00:00<00:00, 77.5MB/s][A
 46%|████▌     | 50.8M/110M [00:00<00:00, 81.0MB/s][A
 53%|█████▎    | 58.8M/110M [00:00<00:00, 81.0MB/s][A
 61%|██████    | 67.4M/110M [00:00<00:00, 83.6MB/s][A
 70%|██████▉   | 77.2M/110M [00:00<00:00, 89.5MB/s][A
 78%|███████▊  | 85.9M/110M [00:01<00:00, 78.4MB/s][A
 85%|████████▍ | 93.8M/110M [00:01<00:00, 79.4MB/s][A
 92%|█████████▏| 102M/110M [00:01<00:00, 78.9MB/s] [A
100%|██████████| 110M/110M [00:01<00:00, 79.3MB/s]

Epoch 1 Train:   0%|          | 0/198 [00:00<?, ?it/s][A
Epoch 1 Train:   1%|          | 1/198 [00:04

Epoch 1/20 | Train Loss: 0.9698 Acc: 0.5513 | Val Loss: 0.8737 Acc: 0.6319
✅ Saved best model.


Epoch 2 Train: 100%|██████████| 198/198 [01:28<00:00,  2.25it/s]
Epoch 2 Val: 100%|██████████| 29/29 [00:06<00:00,  4.32it/s]


Epoch 2/20 | Train Loss: 0.7598 Acc: 0.6797 | Val Loss: 0.7857 Acc: 0.6652
✅ Saved best model.


Epoch 3 Train: 100%|██████████| 198/198 [01:28<00:00,  2.25it/s]
Epoch 3 Val: 100%|██████████| 29/29 [00:06<00:00,  4.37it/s]


Epoch 3/20 | Train Loss: 0.5278 Acc: 0.7988 | Val Loss: 0.7965 Acc: 0.7228
⏰ Patience 1/3


Epoch 4 Train: 100%|██████████| 198/198 [01:28<00:00,  2.25it/s]
Epoch 4 Val: 100%|██████████| 29/29 [00:06<00:00,  4.26it/s]


Epoch 4/20 | Train Loss: 0.3153 Acc: 0.9002 | Val Loss: 0.8574 Acc: 0.7162
⏰ Patience 2/3


Epoch 5 Train: 100%|██████████| 198/198 [01:28<00:00,  2.25it/s]
Epoch 5 Val: 100%|██████████| 29/29 [00:06<00:00,  4.24it/s]


Epoch 5/20 | Train Loss: 0.1892 Acc: 0.9433 | Val Loss: 0.9860 Acc: 0.6984
⏰ Patience 3/3
🛑 Early stopping.


Test: 100%|██████████| 57/57 [00:15<00:00,  3.74it/s]


📊 FINAL TEST RESULTS:
Accuracy: 0.7018
Precision: 0.7205
Recall: 0.7018
Weighted F1: 0.7076
Confusion Matrix:
[[296  60  46]
 [ 55 238  60]
 [ 15  33  99]]



