In [1]:
# ================================================
# ✅ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string

# ================================================
# ✅ PATHS
# ================================================
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ LOAD & PREPROCESS CSV
# ================================================
df = pd.read_csv(input_csv)

existing_data = []
for _, row in df.iterrows():
    image_filename = row['image_path']
    full_image_path = os.path.join(image_dir, image_filename)
    if os.path.exists(full_image_path):
        label_converted = row['label 2'] - 1
        existing_data.append({
            'Image_path': full_image_path,
            'Captions': row['extracted_text'],
            'Label_Sentiment': label_converted
        })

processed_df = pd.DataFrame(existing_data)

# Clean text
def clean_text(text):
    if pd.isna(text): return ""
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join(text.split())
    return text

processed_df['Captions'] = processed_df['Captions'].astype(str).apply(clean_text)

# ================================================
# ✅ DATA SPLIT
# ================================================
train_df, temp_df = train_test_split(processed_df, test_size=0.3, stratify=processed_df['Label_Sentiment'], random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['Label_Sentiment'], random_state=42)

# ================================================
# ✅ DEVICE & TRANSFORMS
# ================================================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# ================================================
# ✅ TOKENIZER
# ================================================
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ================================================
# ✅ MULTIMODAL DATASET
# ================================================
class MultimodalDataset(Dataset):
    def __init__(self, df, tokenizer, image_transform, max_length=128):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.transform = image_transform
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = Image.open(row['Image_path']).convert('RGB')
        if self.transform:
            image = self.transform(image)
        
        text = str(row['Captions'])
        encoding = self.tokenizer(
            text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt'
        )
        
        label = int(row['Label_Sentiment'])
        
        return {
            'image': image,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# ================================================
# ✅ DATALOADERS
# ================================================
batch_size = 16

train_dataset = MultimodalDataset(train_df, tokenizer, train_transform)
val_dataset = MultimodalDataset(val_df, tokenizer, val_test_transform)
test_dataset = MultimodalDataset(test_df, tokenizer, val_test_transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# ================================================
# ✅ MULTIMODAL MODEL
# ================================================
class MultimodalClassifier(nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()
        # Vision branch
        self.vision_model = models.densenet161(pretrained=True)
        num_ftrs = self.vision_model.classifier.in_features
        self.vision_model.classifier = nn.Identity()  # Remove final classifier
        
        # Text branch
        self.text_model = AutoModel.from_pretrained(model_name)
        
        # Combined classifier
        combined_size = num_ftrs + self.text_model.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(combined_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, image, input_ids, attention_mask):
        img_features = self.vision_model(image)
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state[:, 0, :]  # CLS token
        combined = torch.cat((img_features, text_features), dim=1)
        logits = self.classifier(combined)
        return logits

# ================================================
# ✅ INIT MODEL
# ================================================
model = MultimodalClassifier().to(device)

# ================================================
# ✅ LOSS, OPTIMIZER
# ================================================
class_counts = train_df['Label_Sentiment'].value_counts().sort_index()
total = len(train_df)
weights = [total / c for c in class_counts]
criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(weights).to(device))
optimizer = AdamW(model.parameters(), lr=2e-5)

# ================================================
# ✅ TRAINING LOOP
# ================================================
num_epochs = 20
patience = 3
patience_counter = 0
best_val_loss = float('inf')

print("Starting training...")

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    train_preds, train_labels = [], []

    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        image = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(image, input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    avg_train_loss = train_loss / len(train_loader)
    train_acc = accuracy_score(train_labels, train_preds)

    model.eval()
    val_loss = 0
    val_preds, val_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Val Epoch {epoch+1}"):
            image = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(image, input_ids, attention_mask)
            loss = criterion(logits, labels)

            val_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_acc = accuracy_score(val_labels, val_preds)

    print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_multimodal_model.pt")
        print("✅ Saved best model.")
    else:
        patience_counter += 1
        print(f"⏰ No improvement — patience {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("🛑 Early stopping.")
            break

# ================================================
# ✅ FINAL TEST EVAL
# ================================================
model.load_state_dict(torch.load("best_multimodal_model.pt"))
model.eval()
test_preds, test_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        image = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(image, input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

acc = accuracy_score(test_labels, test_preds)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average='weighted')
cm = confusion_matrix(test_labels, test_preds)

print("\n✅ FINAL TEST RESULTS:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 (weighted): {f1:.4f}")
print(f"Confusion Matrix:\n{cm}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/densenet161-8d451a50.pth" to /root/.cache/torch/hub/checkpoints/densenet161-8d451a50.pth
100%|██████████| 110M/110M [00:00<00:00, 225MB/s]
2025-07-08 06:25:21.612301: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751955921.836289      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751955921.901881      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Starting training...


Train Epoch 1: 100%|██████████| 198/198 [02:56<00:00,  1.12it/s]
Val Epoch 1: 100%|██████████| 29/29 [00:16<00:00,  1.77it/s]


Epoch [1/20] - Train Loss: 1.0169, Val Loss: 0.9264, Train Acc: 0.5044, Val Acc: 0.6098
✅ Saved best model.


Train Epoch 2: 100%|██████████| 198/198 [02:38<00:00,  1.25it/s]
Val Epoch 2: 100%|██████████| 29/29 [00:14<00:00,  2.06it/s]


Epoch [2/20] - Train Loss: 0.8915, Val Loss: 0.9070, Train Acc: 0.5875, Val Acc: 0.6142
✅ Saved best model.


Train Epoch 3: 100%|██████████| 198/198 [02:37<00:00,  1.26it/s]
Val Epoch 3: 100%|██████████| 29/29 [00:14<00:00,  2.05it/s]


Epoch [3/20] - Train Loss: 0.8284, Val Loss: 0.9051, Train Acc: 0.6134, Val Acc: 0.6009
✅ Saved best model.


Train Epoch 4: 100%|██████████| 198/198 [02:38<00:00,  1.25it/s]
Val Epoch 4: 100%|██████████| 29/29 [00:14<00:00,  2.05it/s]


Epoch [4/20] - Train Loss: 0.7748, Val Loss: 0.8896, Train Acc: 0.6511, Val Acc: 0.6231
✅ Saved best model.


Train Epoch 5: 100%|██████████| 198/198 [02:38<00:00,  1.25it/s]
Val Epoch 5: 100%|██████████| 29/29 [00:14<00:00,  2.05it/s]


Epoch [5/20] - Train Loss: 0.7121, Val Loss: 0.9396, Train Acc: 0.6755, Val Acc: 0.6364
⏰ No improvement — patience 1/3


Train Epoch 6: 100%|██████████| 198/198 [02:38<00:00,  1.25it/s]
Val Epoch 6: 100%|██████████| 29/29 [00:14<00:00,  2.05it/s]


Epoch [6/20] - Train Loss: 0.6471, Val Loss: 0.9947, Train Acc: 0.7113, Val Acc: 0.6297
⏰ No improvement — patience 2/3


Train Epoch 7: 100%|██████████| 198/198 [02:38<00:00,  1.25it/s]
Val Epoch 7: 100%|██████████| 29/29 [00:14<00:00,  2.06it/s]


Epoch [7/20] - Train Loss: 0.5718, Val Loss: 1.0721, Train Acc: 0.7481, Val Acc: 0.6231
⏰ No improvement — patience 3/3
🛑 Early stopping.


Testing: 100%|██████████| 57/57 [00:32<00:00,  1.78it/s]


✅ FINAL TEST RESULTS:
Accuracy: 0.6375
Precision: 0.6957
Recall: 0.6375
F1 (weighted): 0.6473
Confusion Matrix:
[[232 118  52]
 [ 26 248  79]
 [  6  46  95]]



