In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
image_dir = "/kaggle/input/basem/images"
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
# Raw annotations; the columns read below are 'image_path',
# 'extracted_text' and 'label 2'.
df = pd.read_csv(input_csv)

# Resolve every listed image name against image_dir up front.
full_paths = [os.path.join(image_dir, name) for name in df['image_path']]

# Keep only the rows whose image file is present on disk. The raw label is
# shifted down by one before it is stored as 'Label_Sentiment'.
existing_data = [
    {
        'Image_path': path,
        'Captions': caption,
        'Label_Sentiment': raw_label - 1,
    }
    for path, caption, raw_label in zip(full_paths, df['extracted_text'], df['label 2'])
    if os.path.exists(path)
]

processed_df = pd.DataFrame(existing_data)

# ================================================
# ✅ 4️⃣ TEXT CLEANING
# ================================================
# Compiled once at import time: clean_text is applied to every caption.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_PATTERN = re.compile(r'<.*?>')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one caption for tokenization.

    Steps, in order: drop URLs (``http(s)://...`` and ``www....``), drop
    ``<...>`` tags, delete every character in ``string.punctuation``, and
    collapse all whitespace runs into single spaces.

    Args:
        text: The caption. Missing values (``None``, NaN, ``pd.NA``) yield an
            empty string; any other non-string value is converted with
            ``str()`` first instead of raising ``TypeError`` in the regex.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # e.g. a numeric cell read from the CSV
        text = str(text)

    text = _URL_PATTERN.sub('', text)
    text = _HTML_TAG_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    # split() with no argument also trims leading/trailing whitespace.
    return " ".join(text.split())

# Stratified 70/30 split, then the 30% remainder is divided again: the first
# returned part (2/3 of the remainder) is the test set, and the part sized by
# test_size=1/3 is the validation set.
train_df, temp_df = train_test_split(
    processed_df, test_size=0.3, stratify=processed_df['Label_Sentiment'], random_state=42
)
test_df, val_df = train_test_split(
    temp_df, test_size=1/3, stratify=temp_df['Label_Sentiment'], random_state=42
)

# Work on explicit copies so the column assignments below never target a
# slice of another frame (avoids pandas' SettingWithCopyWarning).
train_df, test_df, val_df = train_df.copy(), test_df.copy(), val_df.copy()


def _clean_caption(value):
    """Clean one caption cell, mapping a missing value to an empty string."""
    # The missing-value check must run BEFORE str(): stringifying NaN first
    # would turn it into the literal caption "nan".
    return "" if pd.isna(value) else clean_text(str(value))


for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    df_['Captions'] = df_['Captions'].apply(_clean_caption)
    # 'label' is the column name the dataset class reads.
    df_['label'] = df_['Label_Sentiment']
    df_.to_csv(f'/kaggle/working/{df_name}_cleaned.csv', index=False)

# ================================================
# ✅ 5️⃣ LOAD DISTILBERT MODEL
# ================================================
# Run on the GPU whenever PyTorch can see one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

bert_model = AutoModel.from_pretrained("distilbert-base-uncased")
bert_model = bert_model.to(device)

# ================================================
# ✅ 6️⃣ TEXT-ONLY DATASET
# ================================================
class TextOnlyDataset(Dataset):
    """Map-style dataset yielding ``(caption, label)`` pairs from a DataFrame.

    Rows are addressed by position, and each item is read from the
    'Captions' and 'label' columns. The tokenizer is only stored on the
    instance; nothing in this class calls it.
    """

    def __init__(self, df, tokenizer):
        """Keep references to the source DataFrame and the tokenizer."""
        self.tokenizer = tokenizer
        self.df = df

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(caption, label)`` tuple for the row at position ``idx``."""
        record = self.df.iloc[idx]
        return record['Captions'], record['label']

def collate_fn(batch):
    """Turn a list of ``(caption, label)`` items into model-ready tensors.

    The captions are tokenized together with the module-level ``tokenizer``
    (padded to the longest item in the batch, truncated to 128 tokens) and
    the labels are stacked into a single tensor.

    Returns:
        ``(encoded, targets)``: the tokenizer output with PyTorch tensors,
        and the label tensor.
    """
    captions, targets = zip(*batch)
    encoded = tokenizer(
        list(captions),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    return encoded, torch.tensor(targets)

# ================================================
# ✅ 7️⃣ DATALOADERS
# ================================================
batch_size = 16


def _build_loader(frame, shuffle=False):
    """Wrap ``frame`` in a TextOnlyDataset and a DataLoader with the shared batch settings."""
    return DataLoader(
        TextOnlyDataset(frame, tokenizer),
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )


# Only the training split is shuffled; validation and test are iterated in
# DataFrame row order.
train_loader = _build_loader(train_df, shuffle=True)
val_loader = _build_loader(val_df)
test_loader = _build_loader(test_df)

# ================================================
# ✅ 8️⃣ TEXT-ONLY CLASSIFICATION MODEL
# ================================================
class TextOnlyClassifier(torch.nn.Module):
    """Dropout + linear classification head on top of a text encoder.

    Args:
        text_dim: Width of the encoder's hidden state, i.e. the input size
            of the linear head.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability applied to the pooled vector.
        encoder: Optional encoder to wrap. It is called with ``input_ids``
            and ``attention_mask`` keyword arguments and must return an
            object exposing ``last_hidden_state``. When omitted, the
            module-level ``bert_model`` is used, so existing callers are
            unaffected.
    """

    def __init__(self, text_dim, num_classes=3, dropout_rate=0.3, encoder=None):
        super().__init__()
        # The attribute stays named `bert` so saved state_dict keys do not change.
        self.bert = bert_model if encoder is None else encoder
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.classifier = torch.nn.Linear(text_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of tokenized captions."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Pool by taking the vector at sequence position 0 (the [CLS] token).
        # Assumes last_hidden_state is (batch, seq, hidden) — TODO confirm for
        # custom encoders.
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)
        return self.classifier(cls_output)

# ================================================
# ✅ 9️⃣ INITIALIZE MODEL
# ================================================
# The classifier head is sized from the encoder's configured hidden width.
text_dim = bert_model.config.hidden_size
model = TextOnlyClassifier(text_dim)
model = model.to(device)

# ================================================
# ✅ 🔟 LOSS & OPTIMIZER
# ================================================
# Number of training samples per label, ordered by label value.
class_weights = train_df['label'].value_counts().sort_index().tolist()
total = sum(class_weights)
# Inverse-frequency weighting: the rarer a class, the larger its loss weight.
weights = [total / count for count in class_weights]
weight_tensor = torch.tensor(weights, dtype=torch.float32, device=device)
criterion = torch.nn.CrossEntropyLoss(weight=weight_tensor)
optimizer = AdamW(model.parameters(), lr=2e-5)

# ================================================
# ✅ 1️⃣1️⃣ TRAINING LOOP
# ================================================
num_epochs = 20
patience = 3
patience_counter = 0
best_val_loss = float('inf')


def _logits_and_loss(inputs, labels):
    """Run one batch through ``model`` and return ``(logits, loss)``.

    The token ids, attention mask and labels are moved to ``device`` first.
    """
    logits = model(
        input_ids=inputs['input_ids'].to(device),
        attention_mask=inputs['attention_mask'].to(device),
    )
    return logits, criterion(logits, labels.to(device))


for epoch in range(num_epochs):
    # ---------------- training pass ----------------
    model.train()
    total_train_loss = 0

    for inputs, labels in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        logits, loss = _logits_and_loss(inputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    # Mean of the per-batch losses.
    avg_train_loss = total_train_loss / len(train_loader)

    # ---------------- validation pass ----------------
    model.eval()
    total_val_loss = 0
    val_predictions, val_labels = [], []

    with torch.no_grad():
        for inputs, labels in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            logits, loss = _logits_and_loss(inputs, labels)
            total_val_loss += loss.item()

            # Collect hard predictions and targets for the accuracy score.
            val_predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            val_labels.extend(labels.numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_predictions)

    print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    # ---------------- early stopping ----------------
    # A strictly lower validation loss checkpoints the weights and resets
    # the patience counter.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_text_model.pt")
        print("✅ Validation loss improved — model saved.")
        continue

    # No improvement this epoch: stop once `patience` such epochs have
    # happened in a row.
    patience_counter += 1
    print(f"⏰ No improvement — patience {patience_counter}/{patience}")

    if patience_counter >= patience:
        print(f"🛑 Early stopping triggered at epoch {epoch+1}")
        break

# ================================================
# ✅ 1️⃣2️⃣ FINAL TEST EVALUATION
# ================================================
print("\n🔍 Loading best model for final evaluation...")
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU run can also be restored in a CPU-only
# session (and vice versa).
model.load_state_dict(torch.load("best_text_model.pt", map_location=device))
model.eval()

# Accumulators for the whole test split.
test_predictions = []
test_labels = []
total_test_loss = 0

# Gradients are not needed for evaluation.
with torch.no_grad():
    for inputs, labels in tqdm(test_loader, desc="Final Test Evaluation"):
        logits = model(
            input_ids=inputs['input_ids'].to(device),
            attention_mask=inputs['attention_mask'].to(device)
        )
        loss = criterion(logits, labels.to(device))

        total_test_loss += loss.item()
        # The predicted class is the index of the largest logit.
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.numpy())

# Aggregate metrics over the whole test split. Precision, recall and F1 are
# computed with average='weighted'; the fourth returned value is not used.
weighted_scores = precision_recall_fscore_support(test_labels, test_predictions, average='weighted')
precision, recall, f1 = weighted_scores[:3]
test_accuracy = accuracy_score(test_labels, test_predictions)
cm = confusion_matrix(test_labels, test_predictions)
# Mean of the per-batch test losses.
avg_test_loss = total_test_loss/len(test_loader)

report_lines = [
    "\n📊 FINAL TEST RESULTS (TEXT-ONLY):",
    f"Test Accuracy: {test_accuracy:.4f}",
    f"Test Precision: {precision:.4f}",
    f"Test Recall: {recall:.4f}",
    f"Test F1-Score: {f1:.4f}",
    f"Test Loss: {avg_test_loss:.4f}",
    f"\nConfusion Matrix:\n{cm}",
]
print("\n".join(report_lines))

# ================================================
# ✅ 1️⃣3️⃣ ADDITIONAL METRICS BY CLASS
# ================================================
# average=None returns one value per class for each of the four metrics.
per_class_scores = precision_recall_fscore_support(
    test_labels, test_predictions, average=None
)
precision_per_class, recall_per_class, f1_per_class, support_per_class = per_class_scores

print("\n📈 PER-CLASS METRICS:")
# Walk the four per-class arrays in lockstep; `i` is the position in those arrays.
for i, (cls_precision, cls_recall, cls_f1, cls_support) in enumerate(zip(*per_class_scores)):
    print(f"Class {i}: Precision={cls_precision:.4f}, Recall={cls_recall:.4f}, F1={cls_f1:.4f}, Support={cls_support}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

2025-07-07 07:54:35.561186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751874875.764690      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751874875.820876      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Train Epoch 1: 100%|██████████| 198/198 [00:17<00:00, 11.25it/s]
Validation Epoch 1: 100%|██████████| 29/29 [00:00<00:00, 36.49it/s]


Epoch [1/20] Train Loss: 1.0641 | Val Loss: 1.0214 | Val Acc: 0.5011
✅ Validation loss improved — model saved.


Train Epoch 2: 100%|██████████| 198/198 [00:17<00:00, 11.62it/s]
Validation Epoch 2: 100%|██████████| 29/29 [00:00<00:00, 38.05it/s]


Epoch [2/20] Train Loss: 0.9967 | Val Loss: 0.9621 | Val Acc: 0.5455
✅ Validation loss improved — model saved.


Train Epoch 3: 100%|██████████| 198/198 [00:17<00:00, 11.60it/s]
Validation Epoch 3: 100%|██████████| 29/29 [00:00<00:00, 38.68it/s]


Epoch [3/20] Train Loss: 0.9219 | Val Loss: 0.9815 | Val Acc: 0.5344
⏰ No improvement — patience 1/3


Train Epoch 4: 100%|██████████| 198/198 [00:17<00:00, 11.60it/s]
Validation Epoch 4: 100%|██████████| 29/29 [00:00<00:00, 39.13it/s]


Epoch [4/20] Train Loss: 0.8637 | Val Loss: 0.8923 | Val Acc: 0.6231
✅ Validation loss improved — model saved.


Train Epoch 5: 100%|██████████| 198/198 [00:17<00:00, 11.51it/s]
Validation Epoch 5: 100%|██████████| 29/29 [00:00<00:00, 38.98it/s]


Epoch [5/20] Train Loss: 0.7940 | Val Loss: 0.9398 | Val Acc: 0.6075
⏰ No improvement — patience 1/3


Train Epoch 6: 100%|██████████| 198/198 [00:17<00:00, 11.48it/s]
Validation Epoch 6: 100%|██████████| 29/29 [00:00<00:00, 39.10it/s]


Epoch [6/20] Train Loss: 0.7074 | Val Loss: 0.9383 | Val Acc: 0.5831
⏰ No improvement — patience 2/3


Train Epoch 7: 100%|██████████| 198/198 [00:17<00:00, 11.50it/s]
Validation Epoch 7: 100%|██████████| 29/29 [00:00<00:00, 38.81it/s]


Epoch [7/20] Train Loss: 0.6199 | Val Loss: 1.0752 | Val Acc: 0.5521
⏰ No improvement — patience 3/3
🛑 Early stopping triggered at epoch 7

🔍 Loading best model for final evaluation...


Final Test Evaluation: 100%|██████████| 57/57 [00:01<00:00, 38.41it/s]


📊 FINAL TEST RESULTS (TEXT-ONLY):
Test Accuracy: 0.6120
Test Precision: 0.6203
Test Recall: 0.6120
Test F1-Score: 0.6107
Test Loss: 0.9127

Confusion Matrix:
[[243 130  29]
 [ 71 248  34]
 [ 26  60  61]]

📈 PER-CLASS METRICS:
Class 0: Precision=0.7147, Recall=0.6045, F1=0.6550, Support=402
Class 1: Precision=0.5662, Recall=0.7025, F1=0.6271, Support=353
Class 2: Precision=0.4919, Recall=0.4150, F1=0.4502, Support=147



