In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoTokenizer, AutoModel, ViTImageProcessor, ViTForImageClassification
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import torch.nn as nn
import re
import string
from transformers import ViTModel


# ================================================
# ✅ 2️⃣ PATHS
# ================================================
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
df = pd.read_csv(input_csv)

# Resolve every CSV entry to a path under image_dir and keep only the rows
# whose image file is actually present on disk.
full_image_paths = df['image_path'].map(lambda name: os.path.join(image_dir, name))
image_found = full_image_paths.map(os.path.exists).astype(bool)

processed_df = pd.DataFrame({
    'Image_path': full_image_paths[image_found],
    'Captions': df.loc[image_found, 'extracted_text'],
    # Shift the 'label 2' values down by one.
    # NOTE(review): presumably the CSV labels are 1-based and this makes them
    # 0-based class indices — confirm against the dataset.
    'Label_Sentiment': df.loc[image_found, 'label 2'] - 1,
}).reset_index(drop=True)

# Fail here with a clear message instead of a confusing error further down
# when the image directory is wrong or empty.
if processed_df.empty:
    raise FileNotFoundError(
        f"None of the {len(df)} images listed in {input_csv} were found under {image_dir}"
    )

# ================================================
# ✅ 4️⃣ CLEAN TEXT
# ================================================
# Compiled once at import time; clean_text is applied to every caption row.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_TAG_PATTERN = re.compile(r'<.*?>')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Return a cleaned copy of a raw caption.

    Cleaning steps, in order: strip URLs, strip ``<...>`` tag-like spans,
    delete every character in ``string.punctuation`` (ASCII punctuation only),
    and collapse all whitespace runs to single spaces.

    Args:
        text: The raw caption. Missing values (``None``, NaN, ``pd.NA``) are
            accepted; any other non-string value is converted with ``str()``
            before cleaning.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    if not isinstance(text, str):
        # Only scalars are tested for missingness: pd.isna() on a container
        # returns an array, which has no single truth value.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        # e.g. a caption that the CSV parser read as a number.
        text = str(text)

    text = _URL_PATTERN.sub('', text)
    text = _TAG_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    return " ".join(text.split())

# 70% of the rows go to training; the remaining 30% are held out. Both splits
# are stratified on the sentiment label.
train_df, temp_df = train_test_split(
    processed_df,
    test_size=0.3,
    stratify=processed_df['Label_Sentiment'],
    random_state=42,
)
# train_test_split returns (remainder, test_size share), so val_df receives
# one third of the held-out rows and test_df the other two thirds.
test_df, val_df = train_test_split(
    temp_df,
    test_size=1/3,
    stratify=temp_df['Label_Sentiment'],
    random_state=42,
)

# Take independent copies so the column assignments below modify these frames
# themselves rather than views of processed_df (avoids SettingWithCopyWarning).
train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()

for df_ in (train_df, val_df, test_df):
    # Missing captions must become "" *before* any str() conversion:
    # casting NaN to str yields the literal text "nan", which would then be
    # fed to the tokenizer as if it were a real caption.
    df_['Captions'] = df_['Captions'].apply(
        lambda raw: "" if pd.isna(raw) else clean_text(str(raw))
    )
    df_['label'] = df_['Label_Sentiment']

# ================================================
# ✅ 5️⃣ LOAD MODELS
# ================================================
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Text encoder and its tokenizer, both loaded from the same checkpoint id.
# NOTE(review): this section was previously labelled "IndicDistilBERT", but
# the checkpoint id below is "ai4bharat/indic-bert" — confirm which model is
# actually intended.
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
text_model = AutoModel.from_pretrained(model_name)
text_model = text_model.to(device)

# Image preprocessor used to turn PIL images into ViT pixel tensors.
vit_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")

# ================================================
# ✅ 6️⃣ DATASET
# ================================================
class MultimodalDataset(Dataset):
    """Dataset yielding one image/caption/label example per DataFrame row.

    The DataFrame must provide the columns ``'Image_path'`` (path opened with
    PIL), ``'Captions'`` (text passed to the tokenizer) and ``'label'``
    (converted to a ``torch.long`` tensor).

    Args:
        df: DataFrame holding the examples; rows are accessed positionally.
        tokenizer: Callable tokenizer invoked with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors='pt'``.
        processor: Callable image processor invoked with
            ``return_tensors="pt"``; its result must expose ``'pixel_values'``.
        max_length: Length every caption is padded/truncated to.
    """

    def __init__(self, df, tokenizer, processor, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(pixel_values, input_ids, attention_mask, label)`` for row ``idx``."""
        row = self.df.iloc[idx]

        # Open inside a context manager so the file handle is released right
        # away; convert() returns a separate, fully loaded image that stays
        # usable after the file is closed.
        with Image.open(row['Image_path']) as raw_image:
            image = raw_image.convert('RGB')
        image_inputs = self.processor(image, return_tensors="pt")
        # Drop the leading dimension so the DataLoader can add its own batch
        # dimension when collating.
        pixel_values = image_inputs['pixel_values'].squeeze(0)

        caption = str(row['Captions'])
        encoding = self.tokenizer(
            caption,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Flatten the tokenizer outputs to 1-D tensors for the same reason.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        label = torch.tensor(row['label'], dtype=torch.long)

        return pixel_values, input_ids, attention_mask, label

batch_size = 8

# One dataset per split, all sharing the same tokenizer and image processor.
train_dataset = MultimodalDataset(train_df, tokenizer, vit_processor)
val_dataset = MultimodalDataset(val_df, tokenizer, vit_processor)
test_dataset = MultimodalDataset(test_df, tokenizer, vit_processor)

# Only the training loader shuffles; validation and test keep DataFrame order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# ================================================
# ✅ 7️⃣ MULTIMODAL MODEL
# ================================================
class MultimodalClassifier(nn.Module):
    """Late-fusion classifier over a text encoder and a ViT image encoder.

    One embedding is taken from each encoder, the two are concatenated, and a
    small MLP head maps the result to class logits.

    Args:
        text_model: Already-loaded text encoder. It must expose
            ``config.hidden_size`` and return an object with
            ``last_hidden_state`` when called.
        hidden_size: Width of the hidden layer in the fusion head.
        num_classes: Number of output logits.
        vision_model_name: Checkpoint id passed to ``ViTModel.from_pretrained``.
        dropout: Dropout probability applied inside the fusion head.
    """

    def __init__(self, text_model, hidden_size=256, num_classes=3,
                 vision_model_name="google/vit-base-patch16-224", dropout=0.3):
        super().__init__()
        self.text_model = text_model
        self.vision_model = ViTModel.from_pretrained(vision_model_name)

        text_hidden = self.text_model.config.hidden_size
        vision_hidden = self.vision_model.config.hidden_size

        # The head consumes the concatenation of both embeddings.
        self.fusion = nn.Sequential(
            nn.Linear(text_hidden + vision_hidden, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, pixel_values, input_ids, attention_mask):
        """Return the class logits for a batch of image/caption pairs."""
        text_out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the caption embedding.
        text_embed = text_out.last_hidden_state[:, 0, :]

        vision_out = self.vision_model(pixel_values=pixel_values)
        # NOTE(review): for the default checkpoint, transformers reports
        # pooler.dense.weight / pooler.dense.bias as newly initialized, so this
        # pooled output passes through a projection that starts from random
        # weights and is only learned during fine-tuning. Switching to the
        # position-0 hidden state would avoid that, but changes the trained
        # behaviour and the checkpoint contents — decide deliberately.
        vision_embed = vision_out.pooler_output

        fused = torch.cat((text_embed, vision_embed), dim=1)
        logits = self.fusion(fused)
        return logits


model = MultimodalClassifier(text_model)
model = model.to(device)

# ================================================
# ✅ 8️⃣ LOSS & OPTIMIZER
# ================================================
# Inverse-frequency loss weighting: each class weight is the total number of
# training rows divided by that class's row count. Despite its name,
# `class_weights` holds the per-class counts (ordered by label value);
# `weights` holds the actual loss weights.
label_counts = train_df['label'].value_counts().sort_index()
class_weights = label_counts.tolist()
total = sum(class_weights)
weights = [total / count for count in class_weights]

loss_weight = torch.tensor(weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=loss_weight)
optimizer = AdamW(model.parameters(), lr=2e-5)

# ================================================
# ✅ 9️⃣ TRAINING LOOP
# ================================================
num_epochs = 20
patience = 3
patience_counter = 0
best_val_loss = float('inf')
checkpoint_path = "best_multimodal.pt"


def _run_epoch(model, batches, criterion, device, optimizer=None):
    """Run one full pass over ``batches`` and collect loss and predictions.

    Args:
        model: Module called as ``model(pixel_values, input_ids, attention_mask)``.
        batches: Iterable of ``(pixel_values, input_ids, attention_mask, labels)``
            tensor tuples (e.g. a DataLoader, optionally wrapped in tqdm).
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device every batch tensor is moved to.
        optimizer: When given, the model is put in training mode and updated
            after every batch. When ``None``, the model is put in eval mode
            and gradients are disabled.

    Returns:
        ``(mean_loss, preds, labels)``: the mean of the per-batch losses and
        the predicted / true class indices as flat Python lists.

    Raises:
        ValueError: If ``batches`` yields no batch at all.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    num_batches = 0
    all_preds, all_labels = [], []

    with torch.set_grad_enabled(training):
        for pixel_values, input_ids, attention_mask, labels in batches:
            pixel_values = pixel_values.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            if training:
                optimizer.zero_grad()
            logits = model(pixel_values, input_ids, attention_mask)
            loss = criterion(logits, labels)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            num_batches += 1
            all_preds.extend(torch.argmax(logits, dim=1).tolist())
            all_labels.extend(labels.tolist())

    if num_batches == 0:
        raise ValueError("Cannot run an epoch over an empty set of batches")
    return loss_sum / num_batches, all_preds, all_labels


for epoch in range(num_epochs):
    avg_train_loss, train_preds, train_labels_list = _run_epoch(
        model,
        tqdm(train_loader, desc=f"Train Epoch {epoch+1}"),
        criterion,
        device,
        optimizer=optimizer,
    )
    train_acc = accuracy_score(train_labels_list, train_preds)

    avg_val_loss, val_preds, val_labels_list = _run_epoch(
        model,
        tqdm(val_loader, desc=f"Val Epoch {epoch+1}"),
        criterion,
        device,
    )
    val_acc = accuracy_score(val_labels_list, val_preds)

    print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # Checkpoint on every new best validation loss; stop once `patience`
    # consecutive epochs have failed to improve on it.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), checkpoint_path)
        print("✅ Model saved.")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"🛑 Early stopping at epoch {epoch+1}")
            break

# ================================================
# ✅ 🔟 FINAL TEST
# ================================================
# If validation loss never improved (e.g. it was NaN every epoch), nothing was
# written in this run; refuse to silently evaluate a stale or missing file.
if best_val_loss == float('inf'):
    raise RuntimeError(
        f"No checkpoint was saved during training; cannot evaluate {checkpoint_path}"
    )

# map_location keeps the load working when the current device differs from
# the one the checkpoint tensors were saved from.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

avg_test_loss, test_preds, test_labels_list = _run_epoch(
    model,
    tqdm(test_loader, desc="Test"),
    criterion,
    device,
)

test_acc = accuracy_score(test_labels_list, test_preds)
# zero_division=0 yields the same scores as the default but without a warning
# when some class is never predicted.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels_list, test_preds, average='weighted', zero_division=0
)
cm = confusion_matrix(test_labels_list, test_preds)

print("\n📊 FINAL TEST RESULTS")
print(f"Accuracy: {test_acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")
print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Confusion Matrix:\n{cm}")


2025-07-08 07:14:19.947467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751958860.136228      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751958860.194248      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Train Epoch 1: 100%|██████████| 395/395 [03:49<00:00,  1.72it/s]
Val Epoch 1: 100%|██████████| 57/57 [00:23<00:00,  2.43it/s]


Epoch [1/20] Train Loss: 0.9329 | Train Acc: 0.5748 | Val Loss: 0.8563 | Val Acc: 0.6297
✅ Model saved.


Train Epoch 2: 100%|██████████| 395/395 [03:15<00:00,  2.02it/s]
Val Epoch 2: 100%|██████████| 57/57 [00:18<00:00,  3.06it/s]


Epoch [2/20] Train Loss: 0.7298 | Train Acc: 0.6949 | Val Loss: 0.8616 | Val Acc: 0.6341


Train Epoch 3: 100%|██████████| 395/395 [03:15<00:00,  2.02it/s]
Val Epoch 3: 100%|██████████| 57/57 [00:18<00:00,  3.08it/s]


Epoch [3/20] Train Loss: 0.4866 | Train Acc: 0.8086 | Val Loss: 0.9553 | Val Acc: 0.6608


Train Epoch 4: 100%|██████████| 395/395 [03:16<00:00,  2.01it/s]
Val Epoch 4: 100%|██████████| 57/57 [00:18<00:00,  3.03it/s]


Epoch [4/20] Train Loss: 0.3120 | Train Acc: 0.8748 | Val Loss: 1.1896 | Val Acc: 0.6519
🛑 Early stopping at epoch 4


Test: 100%|██████████| 113/113 [00:44<00:00,  2.54it/s]


📊 FINAL TEST RESULTS
Accuracy: 0.6452 | Precision: 0.6749 | Recall: 0.6452 | F1: 0.6504
Confusion Matrix:
[[244 118  40]
 [ 46 249  58]
 [ 10  48  89]]



