In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torchvision import models, transforms
from PIL import Image
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
# Kaggle input locations: the annotation CSV, and the directory that the CSV's
# `image_path` entries are joined onto when the table is loaded.
input_csv = "/kaggle/input/basem/dataset.csv"
image_dir = "/kaggle/input/basem/images"

# ================================================
# ✅ 3️⃣ LOAD & CLEAN CSV
# ================================================
# Read the annotation table, then keep only the rows whose image file is
# actually present under `image_dir`.
df = pd.read_csv(input_csv)

existing_data = []
for _, row in df.iterrows():
    full_image_path = os.path.join(image_dir, row['image_path'])
    if not os.path.exists(full_image_path):
        # Annotation without a matching file on disk: leave it out.
        continue
    existing_data.append({
        'Image_path': full_image_path,
        'Captions': row['extracted_text'],
        # 'label 2' is shifted down by one so it can serve as a class index
        # (presumably the CSV labels start at 1 — confirm against the data).
        'Label_Sentiment': row['label 2'] - 1,
    })

processed_df = pd.DataFrame(existing_data)

# Compiled once at import time instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_TAG_PATTERN = re.compile(r'<.*?>')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a raw caption before tokenisation.

    The steps run in this order: strip URLs, strip ``<...>`` tags, delete
    ASCII punctuation (``string.punctuation``), then collapse every run of
    whitespace into a single space and trim the ends.

    Args:
        text: The raw caption. A missing value (``None``, NaN, ``pd.NA``)
            yields ``""``. Any other non-string scalar (e.g. a caption that
            pandas parsed as a number) is converted with ``str()`` first
            instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned caption as a ``str``; may be empty.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    # URLs must go first: once punctuation is deleted they no longer match.
    text = _URL_PATTERN.sub('', text)
    text = _TAG_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    return " ".join(text.split())

# Clean every caption. Missing captions are replaced with "" *before* the
# string cast: `astype(str)` on its own turns NaN/None into the literal text
# "nan"/"None", which would then be tokenised as if it were a real caption.
processed_df['Captions'] = processed_df['Captions'].fillna('').astype(str).apply(clean_text)

# Stratified 70 / 20 / 10 split. First hold out 30% of the rows ...
train_df, temp_df = train_test_split(processed_df, test_size=0.3, stratify=processed_df['Label_Sentiment'], random_state=42)
# ... then divide that 30% two-to-one. Note the order of the targets: the first
# return value (the larger two thirds, 20% overall) is the TEST set and the
# remaining third (10% overall) is the VALIDATION set.
test_df, val_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['Label_Sentiment'], random_state=42)

# ================================================
# ✅ 4️⃣ DEVICE & TRANSFORMS
# ================================================
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Training pipeline: fixed 224x224 resize, random horizontal flip as the only
# augmentation, tensor conversion, then per-channel normalisation.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Evaluation pipeline: identical to training minus the random flip, so
# validation/test inputs are deterministic.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# ================================================
# ✅ 5️⃣ TOKENIZER & BERT
# ================================================
# One checkpoint identifier drives both the tokenizer and the text encoder, so
# the two always match; change it here to try a different pretrained model.
model_name = "ai4bharat/indic-bert"  # Replace if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
bert_model = bert_model.to(device)

# ================================================
# ✅ 6️⃣ DATASET
# ================================================
class MultimodalDataset(Dataset):
    """Dataset yielding one image / caption / label triple per DataFrame row.

    Each item is a dict with the keys ``'image'`` (output of ``transform``),
    ``'input_ids'`` and ``'attention_mask'`` (flattened tokenizer outputs) and
    ``'label'`` (a ``torch.long`` scalar tensor).
    """

    def __init__(self, df, tokenizer, transform, max_length=128):
        """Store the table and the per-item processing callables.

        Args:
            df: DataFrame providing the columns ``'Image_path'``,
                ``'Captions'`` and ``'Label_Sentiment'``. Rows are addressed
                by position, so the index does not need to be contiguous.
            tokenizer: Callable used to encode a caption; its result must
                expose ``'input_ids'`` and ``'attention_mask'`` tensors.
            transform: Callable applied to the RGB PIL image.
            max_length: Length captions are truncated/padded to.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, transform and encode the row at position ``idx``."""
        row = self.df.iloc[idx]

        # Image.open() is lazy and keeps the file handle open. convert()
        # reads the pixels into a new in-memory image, so the handle can be
        # released as soon as the with-block ends instead of lingering.
        with Image.open(row['Image_path']) as raw_image:
            image = raw_image.convert('RGB')
        image = self.transform(image)

        # Every caption is padded/truncated to the same length so the default
        # collate function can stack the batch.
        encoding = self.tokenizer(
            row['Captions'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        label = torch.tensor(row['Label_Sentiment'], dtype=torch.long)

        return {
            'image': image,
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': label
        }

# ================================================
# ✅ 7️⃣ LOADERS
# ================================================
batch_size = 16

# One dataset per split. Only the training split gets the augmenting
# transform; validation and test share the deterministic one.
train_dataset = MultimodalDataset(df=train_df, tokenizer=tokenizer, transform=train_transform)
val_dataset = MultimodalDataset(df=val_df, tokenizer=tokenizer, transform=val_transform)
test_dataset = MultimodalDataset(df=test_df, tokenizer=tokenizer, transform=val_transform)

# Only the training loader reshuffles every epoch; evaluation order is fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# ================================================
# ✅ 8️⃣ MULTIMODAL FUSION MODEL
# ================================================
class MultimodalSentimentClassifier(nn.Module):
    """Sentiment classifier that concatenates text and image features.

    The text branch is the transformer passed in by the caller; the image
    branch is a DenseNet-161 whose classification layer is replaced by an
    identity. Both feature vectors go through dropout, are concatenated, and
    are mapped to class logits by a two-layer MLP.
    """

    def __init__(self, bert_model, num_classes=3):
        """Assemble the two encoders and the fusion head.

        Args:
            bert_model: Text encoder. It must expose ``config.hidden_size``
                and return an object with ``last_hidden_state``.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.bert = bert_model

        # NOTE(review): `pretrained=True` is the legacy torchvision flag;
        # recent releases prefer the `weights=` argument. Kept unchanged here.
        densenet = models.densenet161(pretrained=True)
        num_ftrs = densenet.classifier.in_features
        densenet.classifier = nn.Identity()  # expose the pooled features
        self.cnn = densenet

        self.text_dropout = nn.Dropout(p=0.3)
        self.image_dropout = nn.Dropout(p=0.5)

        # Width of the concatenated [text | image] feature vector.
        fusion_dim = num_ftrs + self.bert.config.hidden_size
        head_layers = [
            nn.Linear(fusion_dim, 512),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, images):
        """Return the class logits for a batch.

        Args:
            input_ids: Token ids passed to the text encoder.
            attention_mask: Attention mask passed to the text encoder.
            images: Image batch passed to the CNN.
        """
        # Text branch: hidden state at sequence position 0 (the CLS slot).
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_dropout(encoded.last_hidden_state[:, 0, :])

        # Image branch: DenseNet features with the original head removed.
        image_features = self.image_dropout(self.cnn(images))

        # Fusion: text features first, image features second, along dim 1.
        fused = torch.cat([text_features, image_features], dim=1)
        return self.classifier(fused)

# Wrap the already-loaded text encoder in the fusion network, then move the
# whole module to the selected device.
model = MultimodalSentimentClassifier(bert_model)
model = model.to(device)

# ================================================
# ✅ 9️⃣ LOSS & OPTIMIZER
# ================================================
# Inverse-frequency weighting for the loss. Despite its name, `class_weights`
# holds the per-class sample COUNTS of the training split, ordered by label;
# `weights` is total/count, so rarer classes contribute more to the loss.
class_weights = train_df['Label_Sentiment'].value_counts().sort_index().tolist()
total = sum(class_weights)
weights = [total / count for count in class_weights]
criterion = nn.CrossEntropyLoss(
    weight=torch.tensor(weights, dtype=torch.float32).to(device)
)

# A single learning rate is applied to every parameter of the fused model.
optimizer = AdamW(model.parameters(), lr=2e-5)

# ================================================
# ✅ 🔟 TRAINING LOOP
# ================================================
num_epochs = 20
patience = 3
patience_counter = 0
best_val_loss = float('inf')

# Train for up to `num_epochs` epochs. After each epoch the model is scored on
# the validation loader; the weights with the lowest validation loss so far
# are written to disk, and training stops once `patience` consecutive epochs
# have failed to improve on that best loss.
for epoch in range(num_epochs):
    # ---------------- training pass ----------------
    model.train()
    total_train_loss = 0
    train_preds, train_labels = [], []

    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        images = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        preds = logits.argmax(dim=1)
        train_preds.extend(preds.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    # Mean of the per-batch losses (not a per-sample mean).
    avg_train_loss = total_train_loss / len(train_loader)
    train_acc = accuracy_score(train_labels, train_preds)

    # ---------------- validation pass ----------------
    model.eval()
    total_val_loss = 0
    val_preds, val_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            images = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask, images)
            loss = criterion(logits, labels)

            total_val_loss += loss.item()
            preds = logits.argmax(dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_acc = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch+1}: Train Loss {avg_train_loss:.4f}, Val Loss {avg_val_loss:.4f}")
    print(f"Train Acc {train_acc:.4f}, Val Acc {val_acc:.4f}")

    # ---------------- checkpointing / early stopping ----------------
    if not (avg_val_loss < best_val_loss):
        # No improvement this epoch: spend one unit of patience.
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping")
            break
        continue

    # New best validation loss: reset patience and overwrite the checkpoint.
    best_val_loss = avg_val_loss
    patience_counter = 0
    torch.save(model.state_dict(), "best_multimodal.pt")

# ================================================
# ✅ 1️⃣1️⃣ TEST EVALUATION
# ================================================
# Restore the weights with the lowest validation loss before scoring on the
# held-out test split. `map_location=device` remaps the stored tensors onto
# the current device, so a checkpoint written on a GPU can still be loaded
# when this cell is re-run on a CPU-only runtime.
model.load_state_dict(torch.load("best_multimodal.pt", map_location=device))
model.eval()

test_preds, test_labels = [], []

# Inference only: no gradients are needed, which saves memory and time.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        images = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask, images)
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Precision, recall and F1 are support-weighted averages over the classes.
acc = accuracy_score(test_labels, test_preds)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average='weighted')
# scikit-learn's confusion-matrix layout: rows = true labels, columns = predictions.
cm = confusion_matrix(test_labels, test_preds)

print(f"Test Acc: {acc:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1: {f1:.4f}")
print(f"Confusion Matrix:\n{cm}")


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

2025-07-08 06:37:13.968814: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751956634.132073      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751956634.180295      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/densenet161-8d451a50.pth" to /root/.cache/torch/hub/checkpoints/densenet161-8d451a50.pth

  0%|          | 0.00/110M [00:00<?, ?B/s][A
  7%|▋         | 8.12M/110M [00:00<00:01, 83.1MB/s][A
 15%|█▌        | 16.6M/110M [00:00<00:01, 86.1MB/s][A
 23%|██▎       | 24.9M/110M [00:00<00:01, 82.8MB/s][A
 30%|██▉       | 32.9M/110M [00:00<00:00, 82.8MB/s][A
 37%|███▋      | 41.0M/110M [00:00<00:00, 83.7MB/s][A
 44%|████▍     | 49.0M/110M [00:00<00:00, 83.4MB/s][A
 52%|█████▏    | 57.6M/110M [00:00<00:00, 85.4MB/s][A
 60%|██████    | 66.6M/110M [00:00<00:00, 87.9MB/s][A
 68%|██████▊   | 75.1M/110M [00:00<00:00, 88.2MB/s][A
 76%|███████▌  | 83.6M/110M [00:01<00:00, 88.3MB/s][A
 84%|████████▎ | 92.4M/110M [00:01<00:00, 88.7MB/s][A
 91%|█████████▏| 101M/110M [00:01<00:00, 87.6MB/s] [A
100%|██████████| 110M/110M [00:01<00:00, 86.8MB/s]
Train Epoch 1: 100%|██████████| 198/198 [02:46<00:00,  1.19it/s]
Validation Epoch 1: 100%|██████████| 2

Epoch 1: Train Loss 1.0179, Val Loss 0.9019
Train Acc 0.5114, Val Acc 0.6009


Train Epoch 2: 100%|██████████| 198/198 [02:27<00:00,  1.34it/s]
Validation Epoch 2: 100%|██████████| 29/29 [00:14<00:00,  2.03it/s]


Epoch 2: Train Loss 0.8758, Val Loss 0.8664
Train Acc 0.6011, Val Acc 0.6408


Train Epoch 3: 100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
Validation Epoch 3: 100%|██████████| 29/29 [00:14<00:00,  2.04it/s]


Epoch 3: Train Loss 0.7800, Val Loss 0.9024
Train Acc 0.6518, Val Acc 0.6164


Train Epoch 4: 100%|██████████| 198/198 [02:28<00:00,  1.34it/s]
Validation Epoch 4: 100%|██████████| 29/29 [00:16<00:00,  1.72it/s]


Epoch 4: Train Loss 0.6927, Val Loss 0.9059
Train Acc 0.6939, Val Acc 0.6120


Train Epoch 5: 100%|██████████| 198/198 [02:48<00:00,  1.17it/s]
Validation Epoch 5: 100%|██████████| 29/29 [00:17<00:00,  1.70it/s]


Epoch 5: Train Loss 0.5637, Val Loss 1.0430
Train Acc 0.7573, Val Acc 0.6364
Early stopping


Testing: 100%|██████████| 57/57 [00:54<00:00,  1.04it/s]

Test Acc: 0.6497
Test Precision: 0.6755
Test Recall: 0.6497
Test F1: 0.6515
Confusion Matrix:
[[241 134  27]
 [ 49 265  39]
 [  9  58  80]]



