In [1]:
# ===================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ===================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, CLIPProcessor, CLIPModel
from PIL import Image
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string

# ===================================================
# ✅ 2️⃣ PATHS
# ===================================================
# Directory that the CSV's 'image_path' filenames are joined onto.
image_dir = "/kaggle/input/basem/images"
# Annotation CSV read into `df` in the next section.
input_csv = "/kaggle/input/basem/dataset.csv"

# ===================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ===================================================
# Read the raw annotation table.
df = pd.read_csv(input_csv)

# Resolve every filename against the image directory, then keep only the
# rows whose image is actually present on disk. The raw 'label 2' value is
# shifted down by one before being stored as 'Label_Sentiment'.
candidate_paths = [os.path.join(image_dir, name) for name in df['image_path']]
existing_data = [
    {
        'Image_path': path,
        'Captions': caption,
        'Label_Sentiment': raw_label - 1,
    }
    for path, caption, raw_label in zip(
        candidate_paths, df['extracted_text'], df['label 2']
    )
    if os.path.exists(path)
]

processed_df = pd.DataFrame(existing_data)

# ===================================================
# ✅ 4️⃣ TEXT CLEANING
# ===================================================
# Patterns and the translation table are built once at import time rather
# than on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_PATTERN = re.compile(r'<.*?>')
# Deletes every character listed in string.punctuation (ASCII punctuation
# only; other scripts' punctuation marks are left in place).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Return a cleaned copy of a caption.

    Processing order: drop URLs, drop ``<...>`` tags, delete ASCII
    punctuation, then collapse every run of whitespace to a single space
    and trim the ends.

    Args:
        text: The raw caption. Missing values (``None``, NaN, ``pd.NA``)
            yield an empty string; any other non-string scalar is
            converted with ``str()`` before cleaning instead of raising
            ``TypeError`` inside ``re``.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    text = _URL_PATTERN.sub('', text)
    text = _HTML_TAG_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    # split() with no argument also discards leading/trailing whitespace.
    return " ".join(text.split())

# Stratified 70 / 20 / 10 split: 30% is held out first, then one third of
# that hold-out (10% overall) becomes validation and the remaining two
# thirds (20% overall) become the test set.
train_df, temp_df = train_test_split(processed_df, test_size=0.3, stratify=processed_df['Label_Sentiment'], random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['Label_Sentiment'], random_state=42)

# Work on explicit copies so the column assignments below modify independent
# frames rather than possible views of `processed_df` / `temp_df`.
train_df, test_df, val_df = train_df.copy(), test_df.copy(), val_df.copy()

for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    # Replace missing captions with "" *before* the str cast; casting first
    # would turn NaN into the literal text "nan", which would then be fed
    # to the tokenizer as if it were a real caption.
    df_['Captions'] = df_['Captions'].fillna("").astype(str).apply(clean_text)
    df_['label'] = df_['Label_Sentiment']
    df_.to_csv(f'/kaggle/working/{df_name}_multimodal.csv', index=False)

# ===================================================
# ✅ 5️⃣ LOAD MODELS
# ===================================================
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Text encoder — Bangla BERT (the "sagorsarker/bangla-bert-base" checkpoint).
text_model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(text_model_name)
bert_model = AutoModel.from_pretrained(text_model_name).to(device)

# Image encoder — CLIP, used only as a fixed feature extractor: its
# parameters are never passed to the optimizer. Put it in eval mode and
# switch off gradient tracking so no autograd graph is built through it.
clip_model_name = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
clip_model = CLIPModel.from_pretrained(clip_model_name).to(device)
clip_model.eval()
clip_model.requires_grad_(False)

# ===================================================
# ✅ 6️⃣ MULTIMODAL DATASET
# ===================================================
class MultiModalDataset(Dataset):
    """Dataset yielding one (image, tokenized caption, label) sample per row.

    Args:
        df: DataFrame with the columns 'Image_path', 'Captions' and 'label'.
            Rows are addressed positionally (``iloc``), so the index does
            not need to be reset.
        tokenizer: Callable tokenizer invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments; its result must expose 'input_ids' and
            'attention_mask' tensors.
        max_length: Fixed sequence length every caption is padded or
            truncated to. Defaults to 128.
    """

    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load and encode the sample stored at positional index ``idx``.

        Returns:
            dict with keys 'image' (RGB PIL image), 'input_ids' and
            'attention_mask' (1-D tensors) and 'label' (long scalar tensor).
        """
        row = self.df.iloc[idx]

        # The context manager closes the underlying file deterministically;
        # convert() is called inside it so the pixel data is read first.
        with Image.open(row['Image_path']) as img_file:
            image = img_file.convert('RGB')

        text = row['Captions']
        if not isinstance(text, str):
            # Guard against missing / non-string captions reaching the tokenizer.
            text = "" if pd.isna(text) else str(text)
        label = row['label']

        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'image': image,
            # flatten() turns the tokenizer's tensors into 1-D tensors so
            # the collate function can stack them per batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def multimodal_collate_fn(batch):
    """Merge a list of sample dicts into one batch.

    Images are kept as a plain Python list, while the 'input_ids',
    'attention_mask' and 'label' tensors of the samples are stacked along a
    new leading dimension.

    Returns:
        Tuple ``(images, input_ids, attention_mask, labels)``.
    """
    def stack_field(key):
        return torch.stack([sample[key] for sample in batch])

    pil_images = [sample['image'] for sample in batch]
    return (
        pil_images,
        stack_field('input_ids'),
        stack_field('attention_mask'),
        stack_field('label'),
    )

batch_size = 8

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, collate_fn=multimodal_collate_fn)

train_dataset = MultiModalDataset(train_df, tokenizer)
val_dataset = MultiModalDataset(val_df, tokenizer)
test_dataset = MultiModalDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

# ===================================================
# ✅ 7️⃣ MULTIMODAL FUSION MODEL
# ===================================================
class MultiModalClassifier(torch.nn.Module):
    """Concatenation-based fusion of a text encoder with image features.

    The first-token hidden state of the text encoder is concatenated with
    the supplied image feature vector, passed through dropout (p=0.3) and
    mapped to class logits by one linear layer.

    Args:
        text_model: Encoder module exposing ``config.hidden_size`` and
            returning an object with a ``last_hidden_state`` attribute.
        image_feature_dim: Size of the image feature vector per sample.
        num_classes: Number of output logits. Defaults to 3.
    """

    def __init__(self, text_model, image_feature_dim, num_classes=3):
        super().__init__()
        self.text_model = text_model
        self.text_dim = text_model.config.hidden_size
        self.image_dim = image_feature_dim
        self.dropout = torch.nn.Dropout(p=0.3)

        # Single linear head over the concatenated text + image vector.
        fused_dim = self.text_dim + self.image_dim
        self.classifier = torch.nn.Linear(fused_dim, num_classes)

    def forward(self, input_ids, attention_mask, img_features):
        """Return class logits for a batch of captions and image features."""
        encoder_out = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Hidden state at sequence position 0 (the CLS token).
        cls_embedding = encoder_out.last_hidden_state[:, 0, :]

        fused = self.dropout(torch.cat((cls_embedding, img_features), dim=1))
        return self.classifier(fused)

# Get image embedding dim by pushing one blank 224x224 image through CLIP.
# Only the output shape is needed, so the forward pass runs without
# autograd bookkeeping.
dummy_image = Image.new('RGB', (224, 224))
dummy_img = clip_processor(images=dummy_image, return_tensors="pt").to(device)
with torch.no_grad():
    img_dim = clip_model.get_image_features(**dummy_img).shape[1]

# Fusion model: the text encoder plus a linear head sized for text + image features.
model = MultiModalClassifier(bert_model, img_dim).to(device)

# ===================================================
# ✅ 8️⃣ LOSS, OPTIMIZER, METRICS
# ===================================================
# Inverse-frequency class weighting. Note that `class_weights` holds the
# per-class sample *counts* (ordered by label value); the actual loss
# weights are total / count and live in `weights`.
label_counts = train_df['label'].value_counts().sort_index()
class_weights = label_counts.to_list()
total = sum(class_weights)
weights = [total / count for count in class_weights]

weight_tensor = torch.tensor(weights, dtype=torch.float32).to(device)
criterion = torch.nn.CrossEntropyLoss(weight=weight_tensor)
optimizer = AdamW(model.parameters(), lr=2e-5)

# ===================================================
# ✅ 9️⃣ TRAINING LOOP — MULTIMODAL
# ===================================================
# Early stopping: training ends once the validation loss has failed to
# improve for `patience` consecutive epochs; the weights of the epoch with
# the lowest validation loss are kept in "best_multimodal.pt".
num_epochs = 20
patience = 3
patience_counter = 0
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    for images, input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        img_inputs = clip_processor(images=images, return_tensors="pt").to(device)
        # CLIP's parameters are not part of the optimizer, so its features
        # are extracted without building an autograd graph; otherwise every
        # step would back-propagate through CLIP and accumulate gradients
        # on parameters that are never updated or zeroed.
        with torch.no_grad():
            img_features = clip_model.get_image_features(**img_inputs)

        logits = model(input_ids.to(device), attention_mask.to(device), img_features)
        loss = criterion(logits, labels.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Mean of the per-batch losses.
    avg_train_loss = total_train_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss = 0
    val_predictions = []
    val_labels = []

    with torch.no_grad():
        for images, input_ids, attention_mask, labels in tqdm(val_loader, desc=f"Val Epoch {epoch+1}"):
            img_inputs = clip_processor(images=images, return_tensors="pt").to(device)
            img_features = clip_model.get_image_features(**img_inputs)

            logits = model(input_ids.to(device), attention_mask.to(device), img_features)
            loss = criterion(logits, labels.to(device))

            total_val_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            val_predictions.extend(preds.cpu().numpy())
            # `labels` was never moved to the device, so it can be converted directly.
            val_labels.extend(labels.numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_predictions)

    print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    if avg_val_loss < best_val_loss:
        # New best epoch: reset the patience counter and checkpoint the weights.
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_multimodal.pt")
        print("✅ Saved new best model.")
    else:
        patience_counter += 1
        print(f"⏰ Patience {patience_counter}/{patience}")
        if patience_counter >= patience:
            print(f"🛑 Early stopping at epoch {epoch+1}")
            break

# ===================================================
# ✅ 🔟 TEST FINAL
# ===================================================
print("\n🔍 Loading best multimodal model...")
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU run can still be loaded on a CPU-only one.
model.load_state_dict(torch.load("best_multimodal.pt", map_location=device))
model.eval()

test_predictions = []
test_labels = []

# Inference only: no gradients are needed for either CLIP or the classifier.
with torch.no_grad():
    for images, input_ids, attention_mask, labels in tqdm(test_loader, desc="Final Test"):
        img_inputs = clip_processor(images=images, return_tensors="pt").to(device)
        img_features = clip_model.get_image_features(**img_inputs)

        logits = model(input_ids.to(device), attention_mask.to(device), img_features)
        preds = torch.argmax(logits, dim=1)

        test_predictions.extend(preds.cpu().numpy())
        test_labels.extend(labels.numpy())

# Precision / recall / F1 are support-weighted averages over the classes.
test_acc = accuracy_score(test_labels, test_predictions)
prec, rec, f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='weighted')
cm = confusion_matrix(test_labels, test_predictions)

print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Precision: {prec:.4f}")
print(f"Test Recall: {rec:.4f}")
print(f"Test F1 (weighted): {f1:.4f}")
print(f"Confusion Matrix:\n{cm}")


2025-07-08 07:04:25.146399: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751958265.328518      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751958265.382982      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]


Train Epoch 1:   0%|          | 0/395 [00:00<?, ?it/s][A
Train Epoch 1:   0%|          | 1/395 [00:01<10:28,  1.60s/it][A
Train Epoch 1:   1%|          | 2/395 [00:02<06:19,  1.03it/s][A
Train Epoch 1:   1%|          | 3/395 [00:02<04:49,  1.35it/s][A
Train Epoch 1:   1%|          | 4/395 [00:02<03:55,  1.66it/s][A
Train Epoch 1:   1%|▏         | 5/395 [00:03<03:34,  1.82it/s][A
Train Epoch 1:   2%|▏         | 6/395 [00:03<03:33,  1.82it/s][A
Train Epoch 1:   2%|▏         | 7/395 [00:04<03:31,  1.84it/s][A
Train Epoch 1:   2%|▏         | 8/395 [00:05<03:34,  1.81it/s][A
Train Epoch 1:   2%|▏         | 9/395 [00:05<03:37,  1.77it/s][A
Train Epoch 1:   3%|▎         | 10/395 [00:06<03:37,  1.77it/s][A
Train Epoch 1:   3%|▎         | 11/395 [00:06<03:23,  1.88it/s][A
Train Epoch 1:   3%|▎         | 12/395 [00:07<03:17,  1.94it/s][A
Train Epoch 1:   3%|▎         | 13/395 [00:07<03:05,  2.06it/s][A
Train Epoch 1:   4%|▎         | 14/395 [00:08<02:59,  2.12it/s][A
Train Epoch 

Epoch [1/20] Train Loss: 0.9276 | Val Loss: 0.8398 | Val Acc: 0.6430
✅ Saved new best model.


Train Epoch 2: 100%|██████████| 395/395 [03:02<00:00,  2.17it/s]
Val Epoch 2: 100%|██████████| 57/57 [00:20<00:00,  2.85it/s]


Epoch [2/20] Train Loss: 0.6501 | Val Loss: 0.8317 | Val Acc: 0.6475
✅ Saved new best model.


Train Epoch 3: 100%|██████████| 395/395 [03:01<00:00,  2.17it/s]
Val Epoch 3: 100%|██████████| 57/57 [00:20<00:00,  2.84it/s]


Epoch [3/20] Train Loss: 0.3721 | Val Loss: 1.0666 | Val Acc: 0.7073
⏰ Patience 1/3


Train Epoch 4: 100%|██████████| 395/395 [03:02<00:00,  2.17it/s]
Val Epoch 4: 100%|██████████| 57/57 [00:19<00:00,  2.87it/s]


Epoch [4/20] Train Loss: 0.1851 | Val Loss: 1.3576 | Val Acc: 0.6253
⏰ Patience 2/3


Train Epoch 5: 100%|██████████| 395/395 [03:01<00:00,  2.18it/s]
Val Epoch 5: 100%|██████████| 57/57 [00:19<00:00,  2.87it/s]


Epoch [5/20] Train Loss: 0.0909 | Val Loss: 1.7549 | Val Acc: 0.6519
⏰ Patience 3/3
🛑 Early stopping at epoch 5

🔍 Loading best multimodal model...


Final Test: 100%|██████████| 113/113 [00:46<00:00,  2.41it/s]

Test Accuracy: 0.6907
Test Precision: 0.7070
Test Recall: 0.6907
Test F1 (weighted): 0.6953
Confusion Matrix:
[[303  51  48]
 [ 70 224  59]
 [ 16  35  96]]



