In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
# Load the raw annotations; only the 'extracted_text' and 'label 2' columns
# are read below.
df = pd.read_csv(input_csv)

# Keep rows whose text is present and not blank. Casting to str before
# stripping means a non-string cell (e.g. a bare number) cannot raise here.
text_column = df['extracted_text']
has_text = text_column.notna() & text_column.astype(str).str.strip().ne('')

# A missing label would become NaN after the shift below and then break both
# the stratified split and the integer label tensors, so drop those rows too.
has_label = df['label 2'].notna()

usable_rows = df.loc[has_text & has_label]

# 'label 2' is shifted down by one (presumably the raw labels are 1-based and
# the loss expects ids starting at 0 -- confirm against the dataset).
processed_df = pd.DataFrame({
    'Captions': usable_rows['extracted_text'].astype(str),
    'Label_Sentiment': usable_rows['label 2'].astype(int) - 1,
}).reset_index(drop=True)

# List-of-dicts view of the same rows; retained because the original script
# exposed this name at module level.
existing_data = processed_df.to_dict('records')

# ================================================
# ✅ 4️⃣ TEXT CLEANING
# ================================================
# Built once at import time rather than on every call to clean_text.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_PATTERN = re.compile(r'<.*?>')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Return *text* with URLs, HTML tags and ASCII punctuation removed.

    Steps, in order: drop ``http(s)://`` / ``www.`` URLs, drop ``<...>`` tags,
    delete every character in ``string.punctuation``, then collapse all runs
    of whitespace to single spaces and trim the ends.

    Args:
        text: The value to clean. Missing values (``None``, NaN, ``pd.NA``)
            yield ``""``; any other non-string scalar is converted with
            ``str()`` first instead of raising inside the regex calls.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    text = _URL_PATTERN.sub('', text)
    text = _HTML_TAG_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    # split() with no argument splits on any whitespace run and drops empties.
    return " ".join(text.split())

# First split: 70% train, 30% held out. Both splits are stratified on the
# label and seeded so the partition is reproducible.
train_df, temp_df = train_test_split(
    processed_df,
    test_size=0.3,
    stratify=processed_df['Label_Sentiment'],
    random_state=42,
)
# Second split of the held-out 30%. NOTE: the first return value is the larger
# (2/3) share, so the test set ends up twice the size of the validation set
# (roughly 20% test / 10% val of the full data).
test_df, val_df = train_test_split(
    temp_df,
    test_size=1/3,
    stratify=temp_df['Label_Sentiment'],
    random_state=42,
)

# Work on explicit copies: the frames returned by train_test_split are
# selections of processed_df, and assigning columns on them can trigger
# pandas' SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()
val_df = val_df.copy()

for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    # Clean the caption text and expose the label under the 'label' column
    # name, then write the split out as CSV.
    df_['Captions'] = df_['Captions'].astype(str).apply(clean_text)
    df_['label'] = df_['Label_Sentiment']
    df_.to_csv(f'/kaggle/working/{df_name}_cleaned.csv', index=False)

# ================================================
# ✅ 5️⃣ LOAD MuRIL MODEL
# ================================================
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# The tokenizer and the encoder are both loaded from the same MuRIL checkpoint.
muril_checkpoint = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(muril_checkpoint)
muril_model = AutoModel.from_pretrained(muril_checkpoint)
muril_model = muril_model.to(device)

# ================================================
# ✅ 6️⃣ TEXT-ONLY DATASET
# ================================================
class TextOnlyDataset(Dataset):
    """Dataset that tokenizes one caption per item.

    Each item is built from the 'Captions' and 'label' columns of the frame
    passed to the constructor.
    """

    def __init__(self, df, tokenizer, max_length=128):
        """Store the frame, the tokenizer callable and the padded length."""
        self.df, self.tokenizer, self.max_length = df, tokenizer, max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' tensors."""
        # Positional lookup, so the frame's index labels do not matter.
        record = self.df.iloc[idx]

        # Every caption is truncated/padded to max_length so that items share
        # one length.
        tokens = self.tokenizer(
            record['Captions'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Flatten each tensor to 1-D (presumably the tokenizer returns a
        # leading batch dimension of size 1 for a single text).
        sample = {
            key: tokens[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(record['label'], dtype=torch.long)
        return sample

# ================================================
# ✅ 7️⃣ DATALOADERS
# ================================================
batch_size = 16

# One dataset per split, all sharing the same tokenizer and default max_length.
train_dataset, val_dataset, test_dataset = (
    TextOnlyDataset(split_frame, tokenizer)
    for split_frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

# ================================================
# ✅ 8️⃣ TEXT CLASSIFICATION MODEL
# ================================================
class MuRILClassifier(torch.nn.Module):
    """Encoder followed by dropout and a single linear classification layer."""

    def __init__(self, muril_model, num_classes=3, dropout=0.3):
        """Wrap *muril_model* and size the linear head from its config.

        Args:
            muril_model: Encoder module exposing ``config.hidden_size`` and
                returning an object with ``last_hidden_state``.
            num_classes: Number of output scores per sequence.
            dropout: Dropout probability applied before the linear layer.
        """
        super().__init__()
        hidden_size = muril_model.config.hidden_size
        self.muril = muril_model
        self.dropout = torch.nn.Dropout(p=dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores for each sequence in the batch."""
        encoder_out = self.muril(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # The hidden state at position 0 (the [CLS] token) serves as the
        # representation of the whole sequence.
        cls_vector = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(cls_vector))

# ================================================
# ✅ 9️⃣ INITIALIZE MODEL
# ================================================
model = MuRILClassifier(muril_model, num_classes=3)
model = model.to(device)

# ================================================
# ✅ 🔟 LOSS & OPTIMIZER
# ================================================
# Inverse-frequency weighting: each class weight is total / class count, so
# rarer classes contribute more to the loss.
# NOTE: despite its name, `class_weights` holds the per-class sample counts
# (ordered by label id); the actual loss weights live in `weights`.
label_counts = train_df['label'].value_counts().sort_index()
class_weights = label_counts.tolist()
total = sum(class_weights)
weights = [total / count for count in class_weights]

weight_tensor = torch.tensor(weights, dtype=torch.float32).to(device)
criterion = torch.nn.CrossEntropyLoss(weight=weight_tensor)
optimizer = AdamW(model.parameters(), lr=2e-5)

print(f"Class distribution: {class_weights}")
print(f"Class weights: {weights}")

# ================================================
# ✅ 1️⃣1️⃣ TRAINING LOOP
# ================================================
num_epochs = 20
patience = 5
patience_counter = 0
best_val_loss = float('inf')


def _run_epoch(loader, desc, training):
    """Run one full pass over *loader* with the module-level model.

    Args:
        loader: DataLoader yielding dicts with 'input_ids', 'attention_mask'
            and 'label' tensors.
        desc: Text shown on the tqdm progress bar.
        training: When true the model is put in train mode and the optimizer
            is stepped after every batch; otherwise the model is put in eval
            mode and no gradients are tracked.

    Returns:
        Tuple ``(mean_loss, accuracy, predictions, labels)`` where
        ``mean_loss`` is the summed batch loss divided by the number of
        batches and the two lists hold one entry per sample.
    """
    model.train(training)
    running_loss = 0
    epoch_predictions = []
    epoch_labels = []

    # Gradients are only needed for the training pass.
    with torch.set_grad_enabled(training):
        for batch in tqdm(loader, desc=desc):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            if training:
                optimizer.zero_grad()

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            if training:
                loss.backward()
                optimizer.step()

            running_loss += loss.item()

            # Keep predictions and labels on the host for the accuracy metric.
            predictions = torch.argmax(logits, dim=1)
            epoch_predictions.extend(predictions.cpu().numpy())
            epoch_labels.extend(labels.cpu().numpy())

    mean_loss = running_loss / len(loader)
    accuracy = accuracy_score(epoch_labels, epoch_predictions)
    return mean_loss, accuracy, epoch_predictions, epoch_labels


for epoch in range(num_epochs):
    # ============================================================
    # TRAINING PHASE
    # ============================================================
    avg_train_loss, train_accuracy, train_predictions, train_labels = _run_epoch(
        train_loader, f"Train Epoch {epoch+1}", training=True
    )

    # ============================================================
    # VALIDATION PHASE
    # ============================================================
    avg_val_loss, val_accuracy, val_predictions, val_labels = _run_epoch(
        val_loader, f"Validation Epoch {epoch+1}", training=False
    )

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    # ============================================================
    # EARLY STOPPING CHECK
    # ============================================================
    # Checkpoint whenever validation loss reaches a new minimum; stop after
    # `patience` consecutive epochs without a new minimum.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_muril_model.pt")
        print("✅ Validation loss improved — model saved.")
    else:
        patience_counter += 1
        print(f"⏰ No improvement — patience {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"🛑 Early stopping triggered at epoch {epoch+1}")
            break

    print("-" * 50)

# ================================================
# ✅ 1️⃣2️⃣ FINAL TEST EVALUATION
# ================================================
print("\n🔍 Loading best model for final evaluation...")
# map_location remaps the stored tensors onto the device used in this session,
# so a checkpoint written on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load("best_muril_model.pt", map_location=device))
model.eval()

test_predictions = []
test_labels = []
total_test_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        total_test_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Fix the label set explicitly so every metric array and the confusion matrix
# always have one entry per class, even if a class never appears in the test
# labels or predictions (otherwise the per-class loop below could index past
# the end of a shorter array).
class_names = ['Class 0', 'Class 1', 'Class 2']
class_ids = list(range(len(class_names)))

# Calculate final metrics
test_accuracy = accuracy_score(test_labels, test_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_predictions, labels=class_ids, average='weighted', zero_division=0
)
cm = confusion_matrix(test_labels, test_predictions, labels=class_ids)

print("\n📊 FINAL TEST RESULTS (TEXT-ONLY WITH MuRIL):")
print("=" * 60)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision (Weighted): {precision:.4f}")
print(f"Test Recall (Weighted): {recall:.4f}")
print(f"Test F1-Score (Weighted): {f1:.4f}")
print(f"Test Loss: {total_test_loss/len(test_loader):.4f}")
print(f"\nConfusion Matrix:\n{cm}")

# ================================================
# ✅ 1️⃣3️⃣ DETAILED METRICS BY CLASS
# ================================================
# average=None returns one value per entry of `class_ids`, in that order.
precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
    test_labels, test_predictions, labels=class_ids, average=None, zero_division=0
)

print("\n📋 PER-CLASS METRICS:")
print("=" * 40)
for i, class_name in enumerate(class_names):
    print(f"{class_name}:")
    print(f"  Precision: {precision_per_class[i]:.4f}")
    print(f"  Recall: {recall_per_class[i]:.4f}")
    print(f"  F1-Score: {f1_per_class[i]:.4f}")
    print(f"  Support: {support[i]}")
    print()

print(f"Total samples: {len(test_labels)}")
print(f"Dataset size - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

2025-07-07 07:44:29.913573: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751874270.099258      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751874270.154104      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Class distribution: [1404, 1237, 515]
Class weights: [2.247863247863248, 2.551333872271625, 6.128155339805825]



Train Epoch 1:   0%|          | 0/198 [00:00<?, ?it/s][A
Train Epoch 1:   1%|          | 1/198 [00:00<02:20,  1.40it/s][A
Train Epoch 1:   1%|          | 2/198 [00:00<01:24,  2.32it/s][A
Train Epoch 1:   2%|▏         | 3/198 [00:01<01:05,  2.96it/s][A
Train Epoch 1:   2%|▏         | 4/198 [00:01<00:57,  3.39it/s][A
Train Epoch 1:   3%|▎         | 5/198 [00:01<00:52,  3.67it/s][A
Train Epoch 1:   3%|▎         | 6/198 [00:01<00:49,  3.88it/s][A
Train Epoch 1:   4%|▎         | 7/198 [00:02<00:47,  4.02it/s][A
Train Epoch 1:   4%|▍         | 8/198 [00:02<00:46,  4.12it/s][A
Train Epoch 1:   5%|▍         | 9/198 [00:02<00:45,  4.18it/s][A
Train Epoch 1:   5%|▌         | 10/198 [00:02<00:44,  4.23it/s][A
Train Epoch 1:   6%|▌         | 11/198 [00:03<00:43,  4.26it/s][A
Train Epoch 1:   6%|▌         | 12/198 [00:03<00:45,  4.09it/s][A
Train Epoch 1:   7%|▋         | 13/198 [00:03<00:49,  3.77it/s][A
Train Epoch 1:   7%|▋         | 14/198 [00:03<00:49,  3.74it/s][A
Train Epoch 

Epoch [1/20]
Train Loss: 1.0833 | Train Acc: 0.4091
Val Loss: 1.0657 | Val Acc: 0.4789
✅ Validation loss improved — model saved.
--------------------------------------------------



Train Epoch 2:   0%|          | 0/198 [00:00<?, ?it/s][A
Train Epoch 2:   1%|          | 1/198 [00:00<00:48,  4.04it/s][A
Train Epoch 2:   1%|          | 2/198 [00:00<00:48,  4.07it/s][A
Train Epoch 2:   2%|▏         | 3/198 [00:00<00:49,  3.94it/s][A
Train Epoch 2:   2%|▏         | 4/198 [00:00<00:48,  4.02it/s][A
Train Epoch 2:   3%|▎         | 5/198 [00:01<00:47,  4.07it/s][A
Train Epoch 2:   3%|▎         | 6/198 [00:01<00:46,  4.11it/s][A
Train Epoch 2:   4%|▎         | 7/198 [00:01<00:46,  4.12it/s][A
Train Epoch 2:   4%|▍         | 8/198 [00:01<00:46,  4.12it/s][A
Train Epoch 2:   5%|▍         | 9/198 [00:02<00:45,  4.19it/s][A
Train Epoch 2:   5%|▌         | 10/198 [00:02<00:44,  4.18it/s][A
Train Epoch 2:   6%|▌         | 11/198 [00:02<00:45,  4.07it/s][A
Train Epoch 2:   6%|▌         | 12/198 [00:02<00:45,  4.11it/s][A
Train Epoch 2:   7%|▋         | 13/198 [00:03<00:44,  4.14it/s][A
Train Epoch 2:   7%|▋         | 14/198 [00:03<00:44,  4.16it/s][A
Train Epoch 

Epoch [2/20]
Train Loss: 1.0671 | Train Acc: 0.4807
Val Loss: 1.0528 | Val Acc: 0.4767
✅ Validation loss improved — model saved.
--------------------------------------------------


Train Epoch 3: 100%|██████████| 198/198 [00:44<00:00,  4.41it/s]
Validation Epoch 3: 100%|██████████| 29/29 [00:01<00:00, 17.22it/s]


Epoch [3/20]
Train Loss: 1.0295 | Train Acc: 0.4924
Val Loss: 1.0490 | Val Acc: 0.4745
✅ Validation loss improved — model saved.
--------------------------------------------------


Train Epoch 4: 100%|██████████| 198/198 [00:44<00:00,  4.40it/s]
Validation Epoch 4: 100%|██████████| 29/29 [00:01<00:00, 17.25it/s]


Epoch [4/20]
Train Loss: 1.0218 | Train Acc: 0.4952
Val Loss: 1.0075 | Val Acc: 0.4900
✅ Validation loss improved — model saved.
--------------------------------------------------


Train Epoch 5: 100%|██████████| 198/198 [00:44<00:00,  4.41it/s]
Validation Epoch 5: 100%|██████████| 29/29 [00:01<00:00, 17.27it/s]


Epoch [5/20]
Train Loss: 1.0256 | Train Acc: 0.4366
Val Loss: 1.0102 | Val Acc: 0.4922
⏰ No improvement — patience 1/5
--------------------------------------------------


Train Epoch 6: 100%|██████████| 198/198 [00:44<00:00,  4.41it/s]
Validation Epoch 6: 100%|██████████| 29/29 [00:01<00:00, 17.28it/s]


Epoch [6/20]
Train Loss: 0.9748 | Train Acc: 0.5215
Val Loss: 0.9837 | Val Acc: 0.5543
✅ Validation loss improved — model saved.
--------------------------------------------------


Train Epoch 7: 100%|██████████| 198/198 [00:44<00:00,  4.41it/s]
Validation Epoch 7: 100%|██████████| 29/29 [00:01<00:00, 17.11it/s]


Epoch [7/20]
Train Loss: 0.9242 | Train Acc: 0.5951
Val Loss: 0.9272 | Val Acc: 0.6275
✅ Validation loss improved — model saved.
--------------------------------------------------


Train Epoch 8: 100%|██████████| 198/198 [00:44<00:00,  4.40it/s]
Validation Epoch 8: 100%|██████████| 29/29 [00:01<00:00, 16.92it/s]


Epoch [8/20]
Train Loss: 0.8316 | Train Acc: 0.7180
Val Loss: 0.8170 | Val Acc: 0.7228
✅ Validation loss improved — model saved.
--------------------------------------------------


Train Epoch 9: 100%|██████████| 198/198 [00:45<00:00,  4.39it/s]
Validation Epoch 9: 100%|██████████| 29/29 [00:01<00:00, 17.10it/s]


Epoch [9/20]
Train Loss: 0.7292 | Train Acc: 0.7877
Val Loss: 0.8092 | Val Acc: 0.7051
✅ Validation loss improved — model saved.
--------------------------------------------------


Train Epoch 10: 100%|██████████| 198/198 [00:45<00:00,  4.39it/s]
Validation Epoch 10: 100%|██████████| 29/29 [00:01<00:00, 17.06it/s]


Epoch [10/20]
Train Loss: 0.6208 | Train Acc: 0.8397
Val Loss: 0.7726 | Val Acc: 0.7118
✅ Validation loss improved — model saved.
--------------------------------------------------


Train Epoch 11: 100%|██████████| 198/198 [00:45<00:00,  4.39it/s]
Validation Epoch 11: 100%|██████████| 29/29 [00:01<00:00, 17.17it/s]


Epoch [11/20]
Train Loss: 0.5388 | Train Acc: 0.8701
Val Loss: 0.8312 | Val Acc: 0.7095
⏰ No improvement — patience 1/5
--------------------------------------------------


Train Epoch 12: 100%|██████████| 198/198 [00:45<00:00,  4.40it/s]
Validation Epoch 12: 100%|██████████| 29/29 [00:01<00:00, 17.18it/s]


Epoch [12/20]
Train Loss: 0.4623 | Train Acc: 0.8939
Val Loss: 0.8141 | Val Acc: 0.7118
⏰ No improvement — patience 2/5
--------------------------------------------------


Train Epoch 13: 100%|██████████| 198/198 [00:45<00:00,  4.40it/s]
Validation Epoch 13: 100%|██████████| 29/29 [00:01<00:00, 17.17it/s]


Epoch [13/20]
Train Loss: 0.3789 | Train Acc: 0.9179
Val Loss: 0.7562 | Val Acc: 0.7384
✅ Validation loss improved — model saved.
--------------------------------------------------


Train Epoch 14: 100%|██████████| 198/198 [00:45<00:00,  4.39it/s]
Validation Epoch 14: 100%|██████████| 29/29 [00:01<00:00, 17.15it/s]


Epoch [14/20]
Train Loss: 0.3180 | Train Acc: 0.9306
Val Loss: 0.8397 | Val Acc: 0.6940
⏰ No improvement — patience 1/5
--------------------------------------------------


Train Epoch 15: 100%|██████████| 198/198 [00:45<00:00,  4.40it/s]
Validation Epoch 15: 100%|██████████| 29/29 [00:01<00:00, 17.02it/s]


Epoch [15/20]
Train Loss: 0.2788 | Train Acc: 0.9414
Val Loss: 0.8670 | Val Acc: 0.7450
⏰ No improvement — patience 2/5
--------------------------------------------------


Train Epoch 16: 100%|██████████| 198/198 [00:45<00:00,  4.39it/s]
Validation Epoch 16: 100%|██████████| 29/29 [00:01<00:00, 17.04it/s]


Epoch [16/20]
Train Loss: 0.2228 | Train Acc: 0.9569
Val Loss: 0.8404 | Val Acc: 0.7361
⏰ No improvement — patience 3/5
--------------------------------------------------


Train Epoch 17: 100%|██████████| 198/198 [00:45<00:00,  4.39it/s]
Validation Epoch 17: 100%|██████████| 29/29 [00:01<00:00, 17.06it/s]


Epoch [17/20]
Train Loss: 0.1906 | Train Acc: 0.9642
Val Loss: 0.9131 | Val Acc: 0.7361
⏰ No improvement — patience 4/5
--------------------------------------------------


Train Epoch 18: 100%|██████████| 198/198 [00:45<00:00,  4.39it/s]
Validation Epoch 18: 100%|██████████| 29/29 [00:01<00:00, 17.03it/s]


Epoch [18/20]
Train Loss: 0.1740 | Train Acc: 0.9693
Val Loss: 0.9282 | Val Acc: 0.7450
⏰ No improvement — patience 5/5
🛑 Early stopping triggered at epoch 18

🔍 Loading best model for final evaluation...


Final Test Evaluation: 100%|██████████| 57/57 [00:03<00:00, 16.68it/s]


📊 FINAL TEST RESULTS (TEXT-ONLY WITH MuRIL):
Test Accuracy: 0.7694
Test Precision (Weighted): 0.7762
Test Recall (Weighted): 0.7694
Test F1-Score (Weighted): 0.7710
Test Loss: 0.7081

Confusion Matrix:
[[311  68  23]
 [ 32 286  35]
 [ 17  33  97]]

📋 PER-CLASS METRICS:
Class 0:
  Precision: 0.8639
  Recall: 0.7736
  F1-Score: 0.8163
  Support: 402

Class 1:
  Precision: 0.7390
  Recall: 0.8102
  F1-Score: 0.7730
  Support: 353

Class 2:
  Precision: 0.6258
  Recall: 0.6599
  F1-Score: 0.6424
  Support: 147

Total samples: 902
Dataset size - Train: 3156, Val: 451, Test: 902



