In [1]:
# ================================================
# ✅ TEXT-ONLY MODEL WITH BANGLISHBERT FOR BEST F1 SCORE
# ================================================
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import torch.nn as nn
import torch.nn.functional as F
import re
import string
import json
import unicodedata

# ================================================
# ✅ PATHS & SETUP
# ================================================
input_csv = "/kaggle/input/basem/dataset.csv"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ================================================
# ✅ LOAD & PREPROCESS CSV
# ================================================
df = pd.read_csv(input_csv)

existing_data = []
for _, row in df.iterrows():
    label_converted = row['label 2'] - 1  # Convert labels to 0-based indexing
    existing_data.append({
        'Captions': row['extracted_text'],
        'Label_Sentiment': label_converted
    })

processed_df = pd.DataFrame(existing_data)

# ================================================
# ✅ TEXT CLEANING
# ================================================
def normalize_bangla(text):
    return unicodedata.normalize('NFC', text)

def clean_text(text):
    if pd.isna(text) or text.strip() == "":
        return ""
    # Remove URLs and HTML
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove extra spaces
    text = " ".join(text.split())
    # Remove irrelevant characters (keep Bangla, numbers, and spaces)
    text = re.sub(r'[^\u0980-\u09FF0-9 ]+', '', text)
    text = normalize_bangla(text)
    # Optional: Remove Bangla stopwords (uncomment if stopwords list is available)
    # text = remove_bangla_stopwords(text)
    # Remove extra spaces again
    text = " ".join(text.split())
    return text

# Apply text cleaning
processed_df['Captions'] = processed_df['Captions'].astype(str).apply(clean_text)
processed_df['label'] = processed_df['Label_Sentiment']

# ================================================
# ✅ DATA SPLITS
# ================================================
train_df, temp_df = train_test_split(processed_df, test_size=0.3, stratify=processed_df['Label_Sentiment'], random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['Label_Sentiment'], random_state=42)

print(f"Train samples: {len(train_df)}, Val samples: {len(val_df)}, Test samples: {len(test_df)}")
print(f"Class distribution: {train_df['label'].value_counts().sort_index().tolist()}")

# ================================================
# ✅ LOAD BANGLISHBERT
# ================================================
bert_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglishbert")
bert_model = AutoModel.from_pretrained("csebuetnlp/banglishbert")

# ================================================
# ✅ TEXT-ONLY MODEL
# ================================================
class TextOnlyModel(nn.Module):
    def __init__(self, bert_model, num_classes=3, dropout_rate=0.3):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        # Text encoding
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        text_features = self.dropout(text_features)
        logits = self.classifier(text_features)
        return logits

# ================================================
# ✅ TEXT DATASET
# ================================================
class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        caption = row['Captions']
        text_inputs = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': text_inputs['input_ids'].flatten(),
            'attention_mask': text_inputs['attention_mask'].flatten(),
            'label': torch.tensor(row['label'], dtype=torch.long)
        }

# ================================================
# ✅ DATALOADERS
# ================================================
batch_size = 8

train_dataset = TextDataset(train_df, bert_tokenizer)
val_dataset = TextDataset(val_df, bert_tokenizer)
test_dataset = TextDataset(test_df, bert_tokenizer)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# ================================================
# ✅ INITIALIZE MODEL
# ================================================
model = TextOnlyModel(bert_model, num_classes=3, dropout_rate=0.3).to(device)

# ================================================
# ✅ LOSS & OPTIMIZER WITH ADVANCED TECHNIQUES
# ================================================
# Focal Loss for handling class imbalance
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss
        
        if self.alpha is not None:
            alpha_t = self.alpha[targets]
            focal_loss = alpha_t * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

# Calculate class weights
class_counts = train_df['label'].value_counts().sort_index().tolist()
total_samples = sum(class_counts)
class_weights = [total_samples / count for count in class_counts]
alpha = torch.FloatTensor(class_weights).to(device)

# Use Focal Loss for better handling of class imbalance
criterion = FocalLoss(alpha=alpha, gamma=2.0)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20, eta_min=1e-6)

# ================================================
# ✅ TRAINING LOOP WITH ADVANCED TECHNIQUES
# ================================================
num_epochs = 25
patience = 3
patience_counter = 0
best_val_f1 = 0.0

print("🚀 Starting Text-Only Model Training...")

for epoch in range(num_epochs):
    # ============================================================
    # TRAINING PHASE
    # ============================================================
    model.train()
    total_train_loss = 0
    train_predictions = []
    train_labels = []

    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        total_train_loss += loss.item()
        
        predictions = torch.argmax(logits, dim=1)
        train_predictions.extend(predictions.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    avg_train_loss = total_train_loss / len(train_loader)
    train_accuracy = accuracy_score(train_labels, train_predictions)
    train_f1 = precision_recall_fscore_support(train_labels, train_predictions, average='weighted')[2]

    # ============================================================
    # VALIDATION PHASE
    # ============================================================
    model.eval()
    total_val_loss = 0
    val_predictions = []
    val_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            
            total_val_loss += loss.item()
            
            predictions = torch.argmax(logits, dim=1)
            val_predictions.extend(predictions.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_predictions)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='weighted')
    
    # Step scheduler
    scheduler.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"  Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f} | Train F1: {train_f1:.4f}")
    print(f"  Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f} | Val F1: {val_f1:.4f}")
    print(f"  LR: {optimizer.param_groups[0]['lr']:.6f}")

    # ============================================================
    # EARLY STOPPING BASED ON F1 SCORE
    # ============================================================
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), "best_text_only_model.pt")
        print(f"✅ Validation F1 improved to {val_f1:.4f} — model saved.")
    else:
        patience_counter += 1
        print(f"⏰ No improvement — patience {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"🛑 Early stopping triggered at epoch {epoch+1}")
            break
    print("-" * 70)

# ================================================
# ✅ FINAL TEST EVALUATION
# ================================================
print("\n🔍 Loading best model for final evaluation...")
model.load_state_dict(torch.load("best_text_only_model.pt"))
model.eval()

test_predictions = []
test_labels = []
total_test_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        
        total_test_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Calculate comprehensive metrics
test_accuracy = accuracy_score(test_labels, test_predictions)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='weighted')
test_precision_macro, test_recall_macro, test_f1_macro, _ = precision_recall_fscore_support(test_labels, test_predictions, average='macro')
cm = confusion_matrix(test_labels, test_predictions)

# Per-class metrics
precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
    test_labels, test_predictions, average=None
)

print("\n" + "="*70)
print("🎯 FINAL TEXT-ONLY MODEL TEST RESULTS")
print("="*70)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1-Score (Weighted): {test_f1:.4f}")
print(f"Test F1-Score (Macro): {test_f1_macro:.4f}")
print(f"Test Precision (Weighted): {test_precision:.4f}")
print(f"Test Recall (Weighted): {test_recall:.4f}")
print(f"Test Loss: {total_test_loss/len(test_loader):.4f}")

print("\n📈 Per-Class Metrics:")
class_names = ['Negative', 'Neutral', 'Positive']
for i, class_name in enumerate(class_names):
    print(f"{class_name:>8}: Precision={precision_per_class[i]:.4f}, Recall={recall_per_class[i]:.4f}, F1={f1_per_class[i]:.4f}, Support={support[i]}")

print(f"\n🎯 Confusion Matrix:")
print(f"{'':>10} {'Neg':>6} {'Neu':>6} {'Pos':>6}")
for i, class_name in enumerate(['Negative', 'Neutral', 'Positive']):
    print(f"{class_name:>10} {cm[i][0]:>6} {cm[i][1]:>6} {cm[i][2]:>6}")

print("\n📋 Detailed Classification Report:")
print(classification_report(test_labels, test_predictions, target_names=class_names))

# ================================================
# ✅ SAVE RESULTS
# ================================================
results = {
    'test_accuracy': test_accuracy,
    'test_f1_weighted': test_f1,
    'test_f1_macro': test_f1_macro,
    'test_precision_weighted': test_precision,
    'test_recall_weighted': test_recall,
    'test_loss': total_test_loss/len(test_loader),
    'confusion_matrix': cm.tolist(),
    'per_class_metrics': {
        'precision': precision_per_class.tolist(),
        'recall': recall_per_class.tolist(),
        'f1': f1_per_class.tolist(),
        'support': support.tolist()
    }
}

with open('/kaggle/working/text_only_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print("\n" + "="*70)
print("✅ TEXT-ONLY MODEL TRAINING COMPLETE!")
print(f"🏆 Best F1 Score Achieved: {test_f1:.4f}")
print("📁 Results saved to 'text_only_results.json'")
print("="*70)

Using device: cuda
Train samples: 3156, Val samples: 451, Test samples: 902
Class distribution: [1404, 1237, 515]


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2025-07-10 07:40:53.658635: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752133253.833539      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752133253.882914      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

🚀 Starting Text-Only Model Training...


Train Epoch 1:   1%|▏         | 5/395 [00:01<01:14,  5.21it/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Train Epoch 1: 100%|██████████| 395/395 [00:48<00:00,  8.15it/s]
Validation Epoch 1: 100%|██████████| 57/57 [00:02<00:00, 27.53it/s]


Epoch [1/25]
  Train Loss: 1.2602 | Train Acc: 0.5944 | Train F1: 0.5985
  Val Loss: 0.9781 | Val Acc: 0.7184 | Val F1: 0.7156
  LR: 0.000020
✅ Validation F1 improved to 0.7156 — model saved.
----------------------------------------------------------------------


Train Epoch 2: 100%|██████████| 395/395 [00:47<00:00,  8.30it/s]
Validation Epoch 2: 100%|██████████| 57/57 [00:02<00:00, 27.53it/s]


Epoch [2/25]
  Train Loss: 0.7874 | Train Acc: 0.8016 | Train F1: 0.8024
  Val Loss: 1.3425 | Val Acc: 0.7273 | Val F1: 0.7182
  LR: 0.000020
✅ Validation F1 improved to 0.7182 — model saved.
----------------------------------------------------------------------


Train Epoch 3: 100%|██████████| 395/395 [00:47<00:00,  8.29it/s]
Validation Epoch 3: 100%|██████████| 57/57 [00:02<00:00, 27.78it/s]


Epoch [3/25]
  Train Loss: 0.4630 | Train Acc: 0.9040 | Train F1: 0.9039
  Val Loss: 1.8295 | Val Acc: 0.7494 | Val F1: 0.7493
  LR: 0.000019
✅ Validation F1 improved to 0.7493 — model saved.
----------------------------------------------------------------------


Train Epoch 4: 100%|██████████| 395/395 [00:47<00:00,  8.30it/s]
Validation Epoch 4: 100%|██████████| 57/57 [00:02<00:00, 27.74it/s]


Epoch [4/25]
  Train Loss: 0.2305 | Train Acc: 0.9541 | Train F1: 0.9540
  Val Loss: 2.0951 | Val Acc: 0.7472 | Val F1: 0.7483
  LR: 0.000018
⏰ No improvement — patience 1/3
----------------------------------------------------------------------


Train Epoch 5: 100%|██████████| 395/395 [00:47<00:00,  8.29it/s]
Validation Epoch 5: 100%|██████████| 57/57 [00:02<00:00, 27.62it/s]


Epoch [5/25]
  Train Loss: 0.1292 | Train Acc: 0.9778 | Train F1: 0.9778
  Val Loss: 3.4414 | Val Acc: 0.7472 | Val F1: 0.7443
  LR: 0.000017
⏰ No improvement — patience 2/3
----------------------------------------------------------------------


Train Epoch 6: 100%|██████████| 395/395 [00:47<00:00,  8.30it/s]
Validation Epoch 6: 100%|██████████| 57/57 [00:02<00:00, 27.46it/s]


Epoch [6/25]
  Train Loss: 0.1068 | Train Acc: 0.9854 | Train F1: 0.9854
  Val Loss: 3.1007 | Val Acc: 0.7650 | Val F1: 0.7657
  LR: 0.000016
✅ Validation F1 improved to 0.7657 — model saved.
----------------------------------------------------------------------


Train Epoch 7: 100%|██████████| 395/395 [00:47<00:00,  8.30it/s]
Validation Epoch 7: 100%|██████████| 57/57 [00:02<00:00, 27.64it/s]


Epoch [7/25]
  Train Loss: 0.0865 | Train Acc: 0.9886 | Train F1: 0.9886
  Val Loss: 3.0556 | Val Acc: 0.7738 | Val F1: 0.7735
  LR: 0.000015
✅ Validation F1 improved to 0.7735 — model saved.
----------------------------------------------------------------------


Train Epoch 8: 100%|██████████| 395/395 [00:47<00:00,  8.30it/s]
Validation Epoch 8: 100%|██████████| 57/57 [00:02<00:00, 27.57it/s]


Epoch [8/25]
  Train Loss: 0.0622 | Train Acc: 0.9902 | Train F1: 0.9902
  Val Loss: 3.1894 | Val Acc: 0.7605 | Val F1: 0.7630
  LR: 0.000013
⏰ No improvement — patience 1/3
----------------------------------------------------------------------


Train Epoch 9: 100%|██████████| 395/395 [00:47<00:00,  8.32it/s]
Validation Epoch 9: 100%|██████████| 57/57 [00:02<00:00, 27.95it/s]


Epoch [9/25]
  Train Loss: 0.0671 | Train Acc: 0.9895 | Train F1: 0.9895
  Val Loss: 3.4887 | Val Acc: 0.7583 | Val F1: 0.7573
  LR: 0.000012
⏰ No improvement — patience 2/3
----------------------------------------------------------------------


Train Epoch 10: 100%|██████████| 395/395 [00:47<00:00,  8.34it/s]
Validation Epoch 10: 100%|██████████| 57/57 [00:02<00:00, 28.02it/s]


Epoch [10/25]
  Train Loss: 0.0358 | Train Acc: 0.9930 | Train F1: 0.9930
  Val Loss: 3.4363 | Val Acc: 0.7627 | Val F1: 0.7640
  LR: 0.000011
⏰ No improvement — patience 3/3
🛑 Early stopping triggered at epoch 10

🔍 Loading best model for final evaluation...


Final Test Evaluation: 100%|██████████| 113/113 [00:04<00:00, 27.63it/s]


🎯 FINAL TEXT-ONLY MODEL TEST RESULTS
Test Accuracy: 0.7894
Test F1-Score (Weighted): 0.7857
Test F1-Score (Macro): 0.7474
Test Precision (Weighted): 0.7852
Test Recall (Weighted): 0.7894
Test Loss: 2.8722

📈 Per-Class Metrics:
Negative: Precision=0.8171, Recall=0.8333, F1=0.8251, Support=402
 Neutral: Precision=0.7947, Recall=0.8442, F1=0.8187, Support=353
Positive: Precision=0.6752, Recall=0.5374, F1=0.5985, Support=147

🎯 Confusion Matrix:
              Neg    Neu    Pos
  Negative    335     47     20
   Neutral     37    298     18
  Positive     38     30     79

📋 Detailed Classification Report:
              precision    recall  f1-score   support

    Negative       0.82      0.83      0.83       402
     Neutral       0.79      0.84      0.82       353
    Positive       0.68      0.54      0.60       147

    accuracy                           0.79       902
   macro avg       0.76      0.74      0.75       902
weighted avg       0.79      0.79      0.79       902


✅ TEXT-O


