In [2]:
# ✅ 1️⃣ LIBRARIES & SETUP
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoTokenizer, AutoModel, ViTImageProcessor, ViTModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import torch.nn as nn
import re
import string

# ✅ 2️⃣ PATHS
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# ✅ 3️⃣ LOAD & PREPROCESS CSV
df = pd.read_csv(input_csv)

# Build the working table from rows that are fully usable: the image file
# must exist on disk, the caption must be non-blank text, and the label must
# be present. Unusable rows are skipped rather than allowed to crash the
# loop (non-string caption/path) or to leak a NaN label into training.
existing_data = []
for image_filename, caption, raw_label in zip(
    df['image_path'], df['extracted_text'], df['label 2']
):
    # NaN cells come back from pandas as floats, so an isinstance check
    # covers both "missing" and "unexpected non-text value".
    if not isinstance(image_filename, str) or not isinstance(caption, str):
        continue
    if not caption.strip() or pd.isna(raw_label):
        continue

    full_image_path = os.path.join(image_dir, image_filename)
    if not os.path.exists(full_image_path):
        continue

    existing_data.append({
        'Image_path': full_image_path,
        'Captions': caption,
        # Shift labels down by one (presumably 1-based in the CSV -- confirm)
        # so they can be used as class indices; int() keeps them integral
        # even if pandas parsed the column as float.
        'Label_Sentiment': int(raw_label) - 1,
    })

# Pin the column names so the frame has the expected schema even when no
# row survives the filter.
processed_df = pd.DataFrame(
    existing_data,
    columns=['Image_path', 'Captions', 'Label_Sentiment'],
)

# ✅ 4️⃣ DATA SPLITTING
# Two-stage stratified split. Stage one holds out 30% of the rows; stage two
# divides that hold-out 2:1, with the larger share becoming the test set and
# the smaller share the validation set (roughly 70/20/10 overall).
train_df, temp_df = train_test_split(
    processed_df,
    test_size=0.3,
    random_state=42,
    stratify=processed_df['Label_Sentiment'],
)
test_df, val_df = train_test_split(
    temp_df,
    test_size=1/3,
    random_state=42,
    stratify=temp_df['Label_Sentiment'],
)

# ✅ 5️⃣ TOKENIZER & PROCESSOR
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Text tokenizer and image preprocessor, loaded from the same checkpoints
# that the encoders are built from further down.
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
vit_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")

# ✅ 6️⃣ DATASET CLASS
class MultimodalDataset(Dataset):
    """Turn each (image, caption, label) row into model-ready tensors.

    Args:
        df: Table with 'Image_path', 'Captions' and 'Label_Sentiment'
            columns; rows are accessed by position.
        tokenizer: Callable used to encode the caption text.
        processor: Callable used to preprocess the PIL image.
        max_length: Length captions are padded/truncated to.
    """

    def __init__(self, df, tokenizer, processor, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the backing table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, encode and return the sample at position ``idx``.

        Returns:
            A dict with 'input_ids', 'attention_mask', 'pixel_values' and
            'label' tensors. The first three have their leading axis
            (presumably a batch axis of size 1 added by
            ``return_tensors='pt'``) squeezed away.
        """
        row = self.df.iloc[idx]
        text = row['Captions']
        label = row['Label_Sentiment']

        # Open inside a context manager so the file handle is released
        # deterministically, even if decoding fails; convert() returns a
        # new image that stays valid after the source is closed.
        with Image.open(row['Image_path']) as raw_image:
            image = raw_image.convert('RGB')

        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        image_inputs = self.processor(image, return_tensors="pt")

        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'pixel_values': image_inputs['pixel_values'].squeeze(0),
            # int() makes a missing (NaN) label fail loudly here instead of
            # going through a float -> long cast.
            'label': torch.tensor(int(label), dtype=torch.long)
        }

# ✅ 7️⃣ DATALOADERS
batch_size = 8

# One dataset per split, all sharing the same tokenizer and image processor.
train_dataset, val_dataset, test_dataset = (
    MultimodalDataset(split_df, tokenizer, vit_processor)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size)
    for split_dataset in (val_dataset, test_dataset)
)

# ✅ 8️⃣ MULTIMODAL MODEL
class MultimodalClassifier(nn.Module):
    """Classify from the concatenated first-token states of two encoders.

    A text encoder and a vision encoder are loaded from pretrained
    checkpoints; the first-position hidden state of each is concatenated and
    passed through dropout and a single linear layer.

    Args:
        text_model_name: Checkpoint name given to ``AutoModel.from_pretrained``.
        vision_model_name: Checkpoint name given to ``ViTModel.from_pretrained``.
        num_classes: Number of output logits.
    """

    def __init__(self, text_model_name, vision_model_name, num_classes=3):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.vision_encoder = ViTModel.from_pretrained(vision_model_name)

        # The head consumes both feature vectors side by side, so its input
        # width is the sum of the two encoders' hidden sizes.
        text_dim = self.text_encoder.config.hidden_size
        vision_dim = self.vision_encoder.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(text_dim + vision_dim, num_classes),
        )

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return class logits for a batch of text/image pairs."""
        text_hidden = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        vision_hidden = self.vision_encoder(pixel_values=pixel_values).last_hidden_state

        # Position 0 of each sequence is used as that modality's summary.
        fused = torch.cat([text_hidden[:, 0, :], vision_hidden[:, 0, :]], dim=1)
        return self.classifier(fused)

model = MultimodalClassifier("google/muril-base-cased", "google/vit-base-patch16-224")
model = model.to(device)

# ✅ 9️⃣ LOSS & OPTIMIZER
# Inverse-frequency class weighting: `class_weights` holds the per-class
# sample counts of the training split (ordered by label), and each class
# weight is total / count, so rarer classes contribute more to the loss.
class_weights = train_df['Label_Sentiment'].value_counts().sort_index().tolist()
total = sum(class_weights)
weights = [total / count for count in class_weights]
weight_tensor = torch.tensor(weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=weight_tensor)
optimizer = AdamW(model.parameters(), lr=2e-5)

# ✅ 🔟 TRAINING LOOP
num_epochs = 20
patience = 3
patience_counter = 0
best_val_loss = float('inf')

for epoch in range(num_epochs):
    # ---- Training pass: one optimizer step per batch ----
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        # Move every tensor of the collated batch onto the target device.
        batch = {key: tensor.to(device) for key, tensor in batch.items()}
        labels = batch['label']

        optimizer.zero_grad()
        logits = model(batch['input_ids'], batch['attention_mask'], batch['pixel_values'])
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # ---- Validation pass: no gradients, collect loss and predictions ----
    model.eval()
    total_val_loss = 0
    val_predictions, val_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            batch = {key: tensor.to(device) for key, tensor in batch.items()}
            labels = batch['label']

            logits = model(batch['input_ids'], batch['attention_mask'], batch['pixel_values'])
            loss = criterion(logits, labels)
            total_val_loss += loss.item()

            val_predictions.extend(logits.argmax(dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_predictions)
    print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    # ---- Early stopping on validation loss ----
    # Only a strict improvement resets the counter and overwrites the
    # checkpoint; anything else (including a NaN loss) counts against patience.
    improved = avg_val_loss < best_val_loss
    if not improved:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"🛑 Early stopping triggered at epoch {epoch+1}")
            break
        continue

    best_val_loss = avg_val_loss
    patience_counter = 0
    torch.save(model.state_dict(), "best_multimodal_model.pt")
    print("✅ Validation improved, model saved.")

# ✅ 1️⃣1️⃣ TEST EVALUATION
print("\n🔍 Loading best model for final test evaluation...")
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written during a GPU run can still be evaluated on a CPU-only
# runtime (without it, torch.load tries to restore tensors to the device
# they were saved from).
best_state = torch.load("best_multimodal_model.pt", map_location=device)
model.load_state_dict(best_state)
model.eval()

# Run the held-out test split once, without gradients, collecting the
# arg-max prediction and the true label for every sample.
test_predictions = []
test_labels = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask, pixel_values)
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Support-weighted precision/recall/F1 plus the raw confusion matrix.
acc = accuracy_score(test_labels, test_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='weighted')
cm = confusion_matrix(test_labels, test_predictions)

print(f"Test Accuracy: {acc:.4f}")
print(f"Test Precision (Weighted): {precision:.4f}")
print(f"Test Recall (Weighted): {recall:.4f}")
print(f"Test F1-Score (Weighted): {f1:.4f}")
print(f"Confusion Matrix:\n{cm}")


Using device: cuda


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Train Epoch 1: 100%|██████████| 395/395 [03:36<00:00,  1.82it/s]
Validation Epoch 1: 100%|██████████| 57/57 [00:19<00:00,  2.96it/s]


Epoch [1/20] Train Loss: 0.9282 | Val Loss: 0.8530 | Val Acc: 0.6186
✅ Validation improved, model saved.


Train Epoch 2: 100%|██████████| 395/395 [03:36<00:00,  1.83it/s]
Validation Epoch 2: 100%|██████████| 57/57 [00:19<00:00,  2.98it/s]


Epoch [2/20] Train Loss: 0.6395 | Val Loss: 0.8225 | Val Acc: 0.6696
✅ Validation improved, model saved.


Train Epoch 3: 100%|██████████| 395/395 [03:37<00:00,  1.81it/s]
Validation Epoch 3: 100%|██████████| 57/57 [00:19<00:00,  2.97it/s]


Epoch [3/20] Train Loss: 0.4054 | Val Loss: 0.8342 | Val Acc: 0.6763


Train Epoch 4: 100%|██████████| 395/395 [03:35<00:00,  1.83it/s]
Validation Epoch 4: 100%|██████████| 57/57 [00:19<00:00,  2.92it/s]


Epoch [4/20] Train Loss: 0.2520 | Val Loss: 0.8964 | Val Acc: 0.7118


Train Epoch 5: 100%|██████████| 395/395 [03:37<00:00,  1.82it/s]
Validation Epoch 5: 100%|██████████| 57/57 [00:18<00:00,  3.02it/s]


Epoch [5/20] Train Loss: 0.1765 | Val Loss: 0.9803 | Val Acc: 0.7095
🛑 Early stopping triggered at epoch 5

🔍 Loading best model for final test evaluation...


Final Test Evaluation: 100%|██████████| 113/113 [00:46<00:00,  2.46it/s]

Test Accuracy: 0.6929
Test Precision (Weighted): 0.7140
Test Recall (Weighted): 0.6929
Test F1-Score (Weighted): 0.6984
Confusion Matrix:
[[276  85  41]
 [ 45 254  54]
 [ 10  42  95]]



