In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, ViTImageProcessor, ViTModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from PIL import Image
import re
import string
import numpy as np

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
df = pd.read_csv(input_csv)

# Walk the three columns we need in lockstep and keep only the rows whose
# image file is actually present under image_dir.
existing_data = []
for image_filename, caption, raw_label in zip(df['image_path'], df['extracted_text'], df['label 2']):
    full_image_path = os.path.join(image_dir, image_filename)
    if not os.path.exists(full_image_path):
        continue  # no image on disk -> the sample is dropped
    existing_data.append({
        'Image_path': full_image_path,
        'Captions': caption,
        # 'label 2' is shifted down by one (presumably 1-based labels turned
        # into 0-based class indices -- confirm against the dataset).
        'Label_Sentiment': raw_label - 1
    })

processed_df = pd.DataFrame(existing_data)

# ================================================
# ✅ 4️⃣ TEXT CLEANING
# ================================================
# Compiled once at import time instead of being rebuilt on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a raw caption for tokenisation.

    Removes URLs and HTML-like tags, strips ASCII punctuation
    (``string.punctuation``) and collapses every run of whitespace to a
    single space.

    Args:
        text: The caption. Missing values (``None``, ``NaN``, ``pd.NA``)
            yield an empty string; any other non-string scalar is converted
            with ``str()`` before cleaning.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Numbers and other scalars would make the regex calls raise
        # TypeError, so coerce them to text first.
        text = str(text)

    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    # split()/join also trims leading and trailing whitespace.
    return " ".join(text.split())

# Stratified split: 30% of the data is held out first; one third of that
# hold-out (10% overall) becomes the validation set and the remaining two
# thirds (20% overall) become the test set.
train_df, temp_df = train_test_split(processed_df, test_size=0.3, stratify=processed_df['Label_Sentiment'], random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['Label_Sentiment'], random_state=42)

# Work on independent copies so the column assignments below modify frames we
# own (this also avoids pandas' SettingWithCopyWarning on split results).
train_df, test_df, val_df = train_df.copy(), test_df.copy(), val_df.copy()

for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    # Fill missing captions BEFORE casting to str: astype(str) alone turns NaN
    # into the literal text "nan", which would then be tokenised as a caption.
    df_['Captions'] = df_['Captions'].fillna('').astype(str).apply(clean_text)
    # The dataset class reads the target from the 'label' column.
    df_['label'] = df_['Label_Sentiment']
    df_.to_csv(f'/kaggle/working/{df_name}_cleaned.csv', index=False)

# ================================================
# ✅ 5️⃣ LOAD MODELS & TOKENIZER
# ================================================
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Vision side: image preprocessor + ViT encoder.
# NOTE: the load-time warning in this notebook's output reports that the
# pooler weights are not in this checkpoint and are newly initialised, so
# they only become meaningful through fine-tuning.
vit_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224")
vit_model = vit_model.to(device)

# Text side: the variable is called bert_*, but the checkpoint is XLM-RoBERTa.
bert_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(bert_name)
bert_model = AutoModel.from_pretrained(bert_name)
bert_model = bert_model.to(device)

# ================================================
# ✅ 6️⃣ MULTIMODAL DATASET
# ================================================
class MultimodalDataset(Dataset):
    """Dataset yielding one (image, caption, label) sample per DataFrame row.

    Each row of ``df`` must provide the columns ``'Image_path'`` (path of the
    image file), ``'Captions'`` (text) and ``'label'`` (integer class index).

    Args:
        df: DataFrame holding the samples; rows are addressed by position.
        processor: Image processor, called as ``processor(image,
            return_tensors="pt")``; its ``'pixel_values'`` entry is used.
        tokenizer: Text tokenizer; its ``'input_ids'`` and
            ``'attention_mask'`` entries are used.
        max_length: Length every caption is padded / truncated to.
    """

    def __init__(self, df, processor, tokenizer, max_length=128):
        self.df = df
        self.processor = processor
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (rows of the DataFrame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load and encode the sample at position ``idx``.

        Returns:
            A tuple ``(pixel_values, input_ids, attention_mask, label)``: three
            tensors with the leading size-1 dimension squeezed away, plus the
            label as a plain ``int``.
        """
        row = self.df.iloc[idx]

        # Image: the context manager closes the underlying file as soon as the
        # pixels have been read. convert() loads the data and returns a new
        # image, so nothing is needed from the file after the block.
        with Image.open(row['Image_path']) as raw_image:
            image = raw_image.convert('RGB')
        image_inputs = self.processor(image, return_tensors="pt")
        pixel_values = image_inputs['pixel_values'].squeeze(0)

        # Text: always padded/truncated to max_length so samples can be stacked.
        text = str(row['Captions'])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        label = int(row['label'])

        return pixel_values, input_ids, attention_mask, label

def collate_fn(batch):
    """Merge a list of dataset samples into one batch.

    Args:
        batch: Sequence of ``(pixel_values, input_ids, attention_mask, label)``
            tuples.

    Returns:
        A 4-tuple: the three tensor fields stacked along a new first
        dimension, followed by the labels gathered into a single tensor.
    """
    # Transpose "list of samples" into "one tuple per field".
    images, token_ids, masks, targets = zip(*batch)
    stacked = tuple(torch.stack(field) for field in (images, token_ids, masks))
    return (*stacked, torch.tensor(targets))

batch_size = 8

# Only the training loader shuffles; validation and test keep the frame order.
train_loader = DataLoader(
    dataset=MultimodalDataset(train_df, vit_processor, tokenizer),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    dataset=MultimodalDataset(val_df, vit_processor, tokenizer),
    batch_size=batch_size,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    dataset=MultimodalDataset(test_df, vit_processor, tokenizer),
    batch_size=batch_size,
    collate_fn=collate_fn,
)

# ================================================
# ✅ 7️⃣ MULTIMODAL FUSION MODEL
# ================================================
class MultimodalClassifier(torch.nn.Module):
    """Late-fusion classifier over an image encoder and a text encoder.

    Each encoder's output vector is projected to ``hidden_size`` by its own
    linear layer; the two projections are concatenated and mapped to
    ``num_classes`` logits by a final linear layer.

    Args:
        vit_model: Image encoder. Must expose ``config.hidden_size`` and
            return an object with ``pooler_output``.
        bert_model: Text encoder. Must expose ``config.hidden_size`` and
            return an object with ``last_hidden_state``.
        hidden_size: Width of each per-modality projection.
        num_classes: Number of output logits.
    """

    def __init__(self, vit_model, bert_model, hidden_size=768, num_classes=3):
        super().__init__()
        self.vit = vit_model
        self.bert = bert_model

        image_width = self.vit.config.hidden_size
        text_width = self.bert.config.hidden_size

        self.image_proj = torch.nn.Linear(in_features=image_width, out_features=hidden_size)
        self.text_proj = torch.nn.Linear(in_features=text_width, out_features=hidden_size)

        # The classifier consumes both projections side by side.
        self.classifier = torch.nn.Linear(in_features=2 * hidden_size, out_features=num_classes)

    def forward(self, pixel_values, input_ids, attention_mask):
        """Return the class logits for a batch of image/caption pairs."""
        vit_result = self.vit(pixel_values=pixel_values)
        bert_result = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        pooled_image = vit_result.pooler_output
        # The hidden state at sequence position 0 stands in for the caption.
        first_token = bert_result.last_hidden_state[:, 0, :]

        projections = [self.image_proj(pooled_image), self.text_proj(first_token)]
        return self.classifier(torch.cat(projections, dim=1))

model = MultimodalClassifier(vit_model, bert_model)
model = model.to(device)

# ================================================
# ✅ 8️⃣ LOSS & OPTIMIZER
# ================================================
# Inverse-frequency class weights computed on the training split only:
# weight = total training samples / samples of that class, ordered by
# ascending label value so position i matches class index i.
total_samples = len(train_df)
class_counts = train_df['label'].value_counts().sort_index()
class_weights = (total_samples / class_counts).tolist()

weight_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)
criterion = torch.nn.CrossEntropyLoss(weight=weight_tensor)
# A single learning rate is applied to both encoders and the new layers.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

# ================================================
# ✅ 9️⃣ TRAINING LOOP
# ================================================
# Train for at most num_epochs epochs, validating after each one. The weights
# with the lowest validation loss so far are written to disk; training stops
# early once `patience` consecutive epochs fail to improve on that loss.
num_epochs = 20
patience = 3
patience_counter = 0
best_val_loss = float('inf')

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_train_loss = 0
    train_preds, train_labels = [], []

    for pixel_values, input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        pixel_values = pixel_values.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        logits = model(pixel_values, input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # Running sum of per-batch losses; averaged over batches below.
        total_train_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    # Mean of the per-batch losses (not weighted by batch size).
    avg_train_loss = total_train_loss / len(train_loader)
    # Accuracy of the predictions made during the training pass itself
    # (model in train mode, weights changing between batches).
    train_acc = accuracy_score(train_labels, train_preds)

    # ---- validation pass ----
    model.eval()
    total_val_loss = 0
    val_preds, val_labels_list = [], []

    with torch.no_grad():
        for pixel_values, input_ids, attention_mask, labels in tqdm(val_loader, desc=f"Val Epoch {epoch+1}"):
            pixel_values = pixel_values.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(pixel_values, input_ids, attention_mask)
            # Same class-weighted criterion as in training.
            loss = criterion(logits, labels)

            total_val_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels_list.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_acc = accuracy_score(val_labels_list, val_preds)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # ---- checkpointing / early stopping (driven by validation loss) ----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Overwrites the previous best checkpoint in the working directory.
        torch.save(model.state_dict(), "best_multimodal_model.pt")
        print("✅ Saved!")
    else:
        patience_counter += 1
        print(f"Patience: {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("🛑 Early stopping.")
            break

# ================================================
# ✅ 🔟 FINAL TEST
# ================================================
# Restore the best checkpoint written by the training loop. map_location
# remaps the stored tensors onto the current device, so a checkpoint saved
# from CUDA tensors can still be loaded on a CPU-only runtime.
model.load_state_dict(torch.load("best_multimodal_model.pt", map_location=device))
model.eval()

all_preds, all_labels = [], []

# Inference only: no gradients are needed while scoring the test set.
with torch.no_grad():
    for pixel_values, input_ids, attention_mask, labels in tqdm(test_loader, desc="Test"):
        pixel_values = pixel_values.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = model(pixel_values, input_ids, attention_mask)
        # Predicted class = index of the largest logit.
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
# 'weighted' averages the per-class scores by each class's support.
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

print(f"Final Test: Acc={acc:.4f} Precision={precision:.4f} Recall={recall:.4f} F1={f1:.4f}")


import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(y_true, y_pred, class_names):
    """Plot a row-normalised confusion matrix as a heatmap.

    Each row (true class) is divided by its total, so a cell shows the
    fraction of that class's samples assigned to each predicted class.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.
        class_names: Tick labels for both axes, in the order of the sorted
            label values that ``confusion_matrix`` uses.
    """
    cm = confusion_matrix(y_true, y_pred)

    # A class that appears only in y_pred has an all-zero row. Dividing by
    # that zero total would give NaN cells and a RuntimeWarning, so such rows
    # are left at 0 instead.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.title('Normalized Confusion Matrix')
    plt.show()


# Example usage (with the label/prediction lists collected in the FINAL TEST section):
# plot_confusion_matrix(all_labels, all_preds, class_names=['Negative', 'Neutral', 'Positive'])



2025-07-08 05:37:49.195007: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751953069.397912      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751953069.455626      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Train Epoch 1: 100%|██████████| 395/395 [04:10<00:00,  1.58it/s]
Val Epoch 1: 100%|██████████| 57/57 [00:23<00:00,  2.44it/s]


Epoch 1/20 | Train Loss: 0.9059 | Train Acc: 0.5925 | Val Loss: 0.8403 | Val Acc: 0.6452
✅ Saved!


Train Epoch 2: 100%|██████████| 395/395 [03:41<00:00,  1.78it/s]
Val Epoch 2: 100%|██████████| 57/57 [00:19<00:00,  2.92it/s]


Epoch 2/20 | Train Loss: 0.6645 | Train Acc: 0.7180 | Val Loss: 0.8953 | Val Acc: 0.5920
Patience: 1/3


Train Epoch 3: 100%|██████████| 395/395 [03:45<00:00,  1.76it/s]
Val Epoch 3: 100%|██████████| 57/57 [00:20<00:00,  2.82it/s]


Epoch 3/20 | Train Loss: 0.3951 | Train Acc: 0.8419 | Val Loss: 1.1167 | Val Acc: 0.6253
Patience: 2/3


Train Epoch 4: 100%|██████████| 395/395 [03:44<00:00,  1.76it/s]
Val Epoch 4: 100%|██████████| 57/57 [00:20<00:00,  2.82it/s]


Epoch 4/20 | Train Loss: 0.2623 | Train Acc: 0.8875 | Val Loss: 1.2451 | Val Acc: 0.6364
Patience: 3/3
🛑 Early stopping.


Test: 100%|██████████| 113/113 [00:45<00:00,  2.46it/s]


Final Test: Acc=0.6508 Precision=0.6557 Recall=0.6508 F1=0.6506
