In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
# Directory that the CSV's 'image_path' filenames are joined onto to locate each image.
image_dir = "/kaggle/input/basem/images"
# Annotation table read with pandas; the script uses its 'image_path',
# 'extracted_text' and 'label 2' columns.
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
# Read the annotation table; the columns used here are 'image_path',
# 'extracted_text' and 'label 2'.
df = pd.read_csv(input_csv)

# Resolve every filename against the image directory and flag the rows whose
# image file is actually present on disk.
full_image_paths = df['image_path'].map(lambda name: os.path.join(image_dir, name))
image_exists = full_image_paths.map(os.path.exists).astype(bool)

# Fail early with a clear message: without this check an empty result would
# only surface later as a KeyError on a missing column.
if not image_exists.any():
    raise FileNotFoundError(
        f"None of the {len(df)} images listed in {input_csv} were found under {image_dir}"
    )

# Keep only rows with an existing image. 'label 2' is shifted down by one
# (presumably from 1-based to 0-based class ids; confirm against the dataset).
processed_df = pd.DataFrame({
    'Image_path': full_image_paths[image_exists],
    'Captions': df.loc[image_exists, 'extracted_text'],
    'Label_Sentiment': df.loc[image_exists, 'label 2'] - 1,
}).reset_index(drop=True)

# ================================================
# ✅ 4️⃣ TEXT CLEANING
# ================================================
def clean_text(text, extra_punctuation=""):
    """Normalize a caption: strip URLs, HTML-like tags, punctuation and extra whitespace.

    Args:
        text: The raw caption. Missing values (None, NaN, pd.NA) yield an
            empty string; other non-string scalars are converted with str()
            instead of raising a TypeError inside the regex calls.
        extra_punctuation: Additional characters to delete on top of
            ``string.punctuation`` (which is ASCII-only), e.g. the Bangla
            danda "।". Defaults to none, which keeps the original behavior.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    # Drop http(s) and www-style links first so their punctuation does not
    # leave fragments behind.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Drop anything that looks like an HTML tag (non-greedy, single line).
    text = re.sub(r'<.*?>', '', text)
    # Delete punctuation characters in a single pass.
    text = text.translate(str.maketrans('', '', string.punctuation + extra_punctuation))
    # Collapse every run of whitespace into one space and trim the ends.
    return " ".join(text.split())

# Stratified 70/30 split, then the held-out 30% is split again: the first
# returned part (2/3 of it) becomes the test set and the second part (1/3 of
# it) becomes the validation set.
train_df, temp_df = train_test_split(
    processed_df,
    test_size=0.3,
    stratify=processed_df['Label_Sentiment'],
    random_state=42,
)
test_df, val_df = train_test_split(
    temp_df,
    test_size=1/3,
    stratify=temp_df['Label_Sentiment'],
    random_state=42,
)

# Work on independent copies so the column assignments below never write into
# a view of processed_df.
train_df, test_df, val_df = train_df.copy(), test_df.copy(), val_df.copy()

for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    # Fill missing captions BEFORE casting to str; casting first would turn
    # NaN into the literal text "nan", which would then be fed to the model.
    df_['Captions'] = df_['Captions'].fillna("").astype(str).apply(clean_text)
    df_['label'] = df_['Label_Sentiment']
    df_.to_csv(f'/kaggle/working/{df_name}_cleaned.csv', index=False)

# ================================================
# ✅ 5️⃣ LOAD MODELS
# ================================================
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# CLIP: image preprocessor plus the pretrained model, moved to the chosen device.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)
clip_model = CLIPModel.from_pretrained(clip_checkpoint)
clip_model = clip_model.to(device)

# Bangla BERT: tokenizer plus the pretrained encoder, moved to the chosen device.
bert_checkpoint = "sagorsarker/bangla-bert-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
bert_model = AutoModel.from_pretrained(bert_checkpoint)
bert_model = bert_model.to(device)

# ================================================
# ✅ 6️⃣ MULTIMODAL DATASET
# ================================================
class MultimodalDataset(Dataset):
    """Dataset that pairs each image with its caption and class label.

    Rows are read positionally from a DataFrame that must provide the columns
    'Image_path', 'Captions' and 'label'.

    Args:
        df: DataFrame holding one sample per row.
        clip_processor: Callable used to preprocess the image; called with
            ``images=...`` and ``return_tensors="pt"``.
        bert_tokenizer: Callable used to tokenize the caption.
        max_length: Length every caption is padded or truncated to.
    """

    def __init__(self, df, clip_processor, bert_tokenizer, max_length=128):
        self.df = df
        self.clip_processor = clip_processor
        self.bert_tokenizer = bert_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, preprocess and return the sample at position ``idx``.

        Returns:
            A dict with 'pixel_values', 'input_ids', 'attention_mask' and
            'label' (a long tensor).
        """
        row = self.df.iloc[idx]

        # Close the underlying file as soon as the pixels are decoded instead
        # of leaving the handle to the garbage collector.
        with Image.open(row['Image_path']) as image_file:
            image = image_file.convert("RGB")

        # A caption can come back as NaN (e.g. an empty cell re-read from
        # CSV); the tokenizer needs a string.
        caption = row['Captions']
        caption = "" if pd.isna(caption) else str(caption)
        label = row['label']

        image_inputs = self.clip_processor(images=image, return_tensors="pt")
        text_inputs = self.bert_tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # squeeze(0) drops the leading dimension of each returned tensor
        # (presumably the batch-of-one axis) so the DataLoader can stack samples.
        return {
            'pixel_values': image_inputs['pixel_values'].squeeze(0),
            'input_ids': text_inputs['input_ids'].squeeze(0),
            'attention_mask': text_inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# ================================================
# ✅ 7️⃣ DATALOADERS
# ================================================
# Number of samples per batch for every loader.
batch_size = 8

# One dataset per split, all sharing the same image processor and tokenizer.
train_dataset, val_dataset, test_dataset = (
    MultimodalDataset(split_df, clip_processor, bert_tokenizer)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep their row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=batch_size)
    for eval_dataset in (val_dataset, test_dataset)
)

# ================================================
# ✅ 8️⃣ MULTIMODAL CLASSIFICATION MODEL
# ================================================
class MultimodalClassifier(torch.nn.Module):
    """Classifier over concatenated image and text features.

    The image branch is ``clip_model.vision_model`` and the text branch is
    ``bert_model``. Their outputs are concatenated, passed through dropout and
    mapped to ``num_classes`` logits by a single linear layer.
    """

    def __init__(self, clip_model, bert_model, num_classes=3, dropout_rate=0.3):
        super().__init__()
        self.clip = clip_model.vision_model
        self.bert = bert_model
        self.dropout = torch.nn.Dropout(dropout_rate)

        # The linear head takes both feature vectors side by side.
        image_dim = self.clip.config.hidden_size
        text_dim = self.bert.config.hidden_size
        self.classifier = torch.nn.Linear(image_dim + text_dim, num_classes)

    def forward(self, pixel_values, input_ids, attention_mask):
        """Return the class logits for a batch of image/caption pairs."""
        image_features = self.clip(pixel_values=pixel_values).pooler_output

        # The text feature is the hidden state at the first token position.
        text_hidden = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        text_features = text_hidden[:, 0, :]

        combined = torch.cat([image_features, text_features], dim=1)
        return self.classifier(self.dropout(combined))

# ================================================
# ✅ 9️⃣ INITIALIZE MODEL
# ================================================
# Build the classifier around the two pretrained encoders, then move it to
# the selected device.
model = MultimodalClassifier(clip_model, bert_model)
model = model.to(device)

# ================================================
# ✅ 🔟 LOSS & OPTIMIZER
# ================================================
# Per-class sample counts of the training split, ordered by ascending label.
# (Despite its name, class_weights holds counts; the weights are derived below.)
label_counts = train_df['label'].value_counts().sort_index()
class_weights = label_counts.tolist()
total = sum(class_weights)

# Inverse-frequency weights: the rarer the class, the larger its weight.
weights = [total / count for count in class_weights]
weight_tensor = torch.tensor(weights, dtype=torch.float32, device=device)

criterion = torch.nn.CrossEntropyLoss(weight=weight_tensor)
optimizer = AdamW(model.parameters(), lr=2e-5)

# ================================================
# ✅ 1️⃣1️⃣ TRAINING LOOP
# ================================================
num_epochs = 10
# Stop after this many consecutive epochs without a lower validation loss.
patience = 3
patience_counter = 0
best_val_loss = float('inf')


def _run_epoch(model, loader, criterion, device, desc, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean batch loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise it is put in eval mode and gradients are
    disabled. ``desc`` is the label shown on the progress bar.
    """
    is_training = optimizer is not None
    model.train(is_training)

    running_loss = 0
    all_predictions = []
    all_labels = []

    with torch.set_grad_enabled(is_training):
        for batch in tqdm(loader, desc=desc):
            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            if is_training:
                optimizer.zero_grad()

            logits = model(pixel_values, input_ids, attention_mask)
            loss = criterion(logits, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            running_loss += loss.item()
            all_predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # The loss is averaged over batches, not over individual samples.
    return running_loss / len(loader), accuracy_score(all_labels, all_predictions)


for epoch in range(num_epochs):
    avg_train_loss, train_accuracy = _run_epoch(
        model, train_loader, criterion, device,
        desc=f"Train Epoch {epoch+1}", optimizer=optimizer,
    )
    avg_val_loss, val_accuracy = _run_epoch(
        model, val_loader, criterion, device,
        desc=f"Validation Epoch {epoch+1}",
    )

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    # Checkpoint on every new best validation loss; otherwise count towards
    # early stopping.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_multimodal_model.pt")
        print("✅ Validation loss improved — model saved.")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"🛑 Early stopping at epoch {epoch+1}")
            break

# ================================================
# ✅ 1️⃣2️⃣ FINAL TEST EVALUATION
# ================================================
# Restore the checkpoint with the lowest validation loss. map_location keeps
# the load working when the weights were saved on a different device (e.g.
# saved on GPU, evaluated on CPU).
model.load_state_dict(torch.load("best_multimodal_model.pt", map_location=device))
model.eval()

test_predictions = []
test_labels = []
total_test_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(pixel_values, input_ids, attention_mask)
        loss = criterion(logits, labels)

        total_test_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Mean of the per-batch losses; previously accumulated but never reported.
avg_test_loss = total_test_loss / len(test_loader)

accuracy = accuracy_score(test_labels, test_predictions)
# Support-weighted averages over the classes. zero_division=0 scores a class
# that is never predicted as 0 without emitting a warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_predictions, average='weighted', zero_division=0
)
# confusion_matrix puts true labels on the rows and predictions on the columns.
cm = confusion_matrix(test_labels, test_predictions)

print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-Score: {f1:.4f}")
print("Confusion Matrix:")
print(cm)


2025-07-08 05:24:13.449104: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751952253.621946      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751952253.674794      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Train Epoch 1: 100%|██████████| 395/395 [03:31<00:00,  1.87it/s]
Validation Epoch 1: 100%|██████████| 57/57 [00:23<00:00,  2.43it/s]


Epoch [1/10]
Train Loss: 0.9213 | Train Acc: 0.5998
Val Loss: 0.8274 | Val Acc: 0.6741
✅ Validation loss improved — model saved.


Train Epoch 2: 100%|██████████| 395/395 [03:03<00:00,  2.15it/s]
Validation Epoch 2: 100%|██████████| 57/57 [00:19<00:00,  2.91it/s]


Epoch [2/10]
Train Loss: 0.6473 | Train Acc: 0.7354
Val Loss: 0.8491 | Val Acc: 0.6918


Train Epoch 3: 100%|██████████| 395/395 [03:03<00:00,  2.15it/s]
Validation Epoch 3: 100%|██████████| 57/57 [00:19<00:00,  2.88it/s]


Epoch [3/10]
Train Loss: 0.3598 | Train Acc: 0.8609
Val Loss: 0.9869 | Val Acc: 0.6851


Train Epoch 4: 100%|██████████| 395/395 [03:05<00:00,  2.13it/s]
Validation Epoch 4: 100%|██████████| 57/57 [00:19<00:00,  2.89it/s]


Epoch [4/10]
Train Loss: 0.1739 | Train Acc: 0.9344
Val Loss: 1.3024 | Val Acc: 0.6940
🛑 Early stopping at epoch 4


Final Test Evaluation: 100%|██████████| 113/113 [00:43<00:00,  2.59it/s]

Test Accuracy: 0.6896
Test Precision: 0.7085
Test Recall: 0.6896
Test F1-Score: 0.6958
Confusion Matrix:
[[287  68  47]
 [ 51 244  58]
 [ 15  41  91]]



