In [1]:
# ================================================
# ✅ 1️⃣ LIBRARIES & SETUP
# ================================================
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import re
import string

# ================================================
# ✅ 2️⃣ PATHS
# ================================================
input_csv = "/kaggle/input/basem/dataset.csv"

# ================================================
# ✅ 3️⃣ LOAD & PREPROCESS CSV
# ================================================
df = pd.read_csv(input_csv)

existing_data = []
for _, row in df.iterrows():
    label_converted = row['label 2'] - 1
    existing_data.append({
        'Captions': row['extracted_text'],
        'Label_Sentiment': label_converted
    })

processed_df = pd.DataFrame(existing_data)

# ================================================
# ✅ 4️⃣ TEXT CLEANING
# ================================================
def clean_text(text):
    if pd.isna(text): return ""
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join(text.split())
    return text

train_df, temp_df = train_test_split(processed_df, test_size=0.3, stratify=processed_df['Label_Sentiment'], random_state=42)
test_df, val_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['Label_Sentiment'], random_state=42)

for df_name, df_ in [('train', train_df), ('test', test_df), ('val', val_df)]:
    df_['Captions'] = df_['Captions'].astype(str).apply(clean_text)
    df_['label'] = df_['Label_Sentiment']
    df_.to_csv(f'/kaggle/working/{df_name}_cleaned.csv', index=False)

# ================================================
# ✅ 5️⃣ LOAD BERT MODEL & TOKENIZER
# ================================================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")
bert_model = AutoModel.from_pretrained("sagorsarker/bangla-bert-base").to(device)

# ================================================
# ✅ 6️⃣ TEXT-ONLY DATASET
# ================================================
class TextOnlyDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        caption = str(row['Captions'])
        label = row['label']
        
        # Tokenize text
        inputs = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# ================================================
# ✅ 7️⃣ DATALOADERS
# ================================================
batch_size = 16

train_dataset = TextOnlyDataset(train_df, bert_tokenizer)
val_dataset = TextOnlyDataset(val_df, bert_tokenizer)
test_dataset = TextOnlyDataset(test_df, bert_tokenizer)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# ================================================
# ✅ 8️⃣ BiLSTM MODEL
# ================================================
class BiLSTMSentimentClassifier(nn.Module):
    def __init__(self, bert_model, hidden_dim=256, num_layers=2, dropout=0.3, num_classes=3):
        super(BiLSTMSentimentClassifier, self).__init__()
        
        self.bert = bert_model
        self.bert_hidden_size = bert_model.config.hidden_size
        
        # Freeze BERT parameters (optional - you can unfreeze for fine-tuning)
        for param in self.bert.parameters():
            param.requires_grad = False
        
        # BiLSTM layer
        self.bilstm = nn.LSTM(
            input_size=self.bert_hidden_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True,
            bidirectional=True
        )
        
        # Attention mechanism
        self.attention = nn.Linear(hidden_dim * 2, 1)
        
        # Classifier
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_dim * 2, num_classes)
        
    def forward(self, input_ids, attention_mask):
        # Get BERT embeddings
        with torch.no_grad():
            bert_outputs = self.bert(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
        
        # Get sequence embeddings
        sequence_output = bert_outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)
        
        # Pass through BiLSTM
        lstm_output, _ = self.bilstm(sequence_output)  # (batch_size, seq_len, hidden_dim * 2)
        
        # Apply attention mechanism
        attention_weights = torch.softmax(self.attention(lstm_output), dim=1)  # (batch_size, seq_len, 1)
        
        # Apply attention mask to prevent attending to padding tokens
        attention_mask_expanded = attention_mask.unsqueeze(-1).float()  # (batch_size, seq_len, 1)
        attention_weights = attention_weights * attention_mask_expanded
        
        # Normalize attention weights
        attention_weights = attention_weights / (attention_weights.sum(dim=1, keepdim=True) + 1e-10)
        
        # Apply attention to get final representation
        attended_output = torch.sum(lstm_output * attention_weights, dim=1)  # (batch_size, hidden_dim * 2)
        
        # Apply dropout and classify
        output = self.dropout(attended_output)
        logits = self.classifier(output)
        
        return logits

# ================================================
# ✅ 9️⃣ INITIALIZE MODEL
# ================================================
model = BiLSTMSentimentClassifier(bert_model, hidden_dim=256, num_layers=2, dropout=0.3, num_classes=3).to(device)

# ================================================
# ✅ 🔟 LOSS & OPTIMIZER
# ================================================
class_weights = train_df['label'].value_counts().sort_index().tolist()
total = sum(class_weights)
weights = [total / c for c in class_weights]
criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(weights).to(device))
optimizer = AdamW(model.parameters(), lr=2e-5)

# ================================================
# ✅ 1️⃣1️⃣ TRAINING LOOP
# ================================================
num_epochs = 15
patience = 5
patience_counter = 0
best_val_loss = float('inf')

for epoch in range(num_epochs):
    # ============================================================
    # TRAINING PHASE
    # ============================================================
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # ============================================================
    # VALIDATION PHASE
    # ============================================================
    model.eval()
    total_val_loss = 0
    val_predictions = []
    val_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            total_val_loss += loss.item()
            
            # Store predictions for metrics
            predictions = torch.argmax(logits, dim=1)
            val_predictions.extend(predictions.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_labels, val_predictions)
    
    print(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    # ============================================================
    # EARLY STOPPING CHECK
    # ============================================================
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_text_model.pt")
        print("✅ Validation loss improved — model saved.")
    else:
        patience_counter += 1
        print(f"⏰ No improvement — patience {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"🛑 Early stopping triggered at epoch {epoch+1}")
            break

# ================================================
# ✅ 1️⃣2️⃣ FINAL TEST EVALUATION
# ================================================
print("\n🔍 Loading best model for final evaluation...")
model.load_state_dict(torch.load("best_text_model.pt"))
model.eval()

test_predictions = []
test_labels = []
total_test_loss = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        
        total_test_loss += loss.item()
        predictions = torch.argmax(logits, dim=1)
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

# Calculate final metrics
test_accuracy = accuracy_score(test_labels, test_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_predictions, average='weighted')
cm = confusion_matrix(test_labels, test_predictions)

print("\n📊 FINAL TEST RESULTS (Text-Only BiLSTM):")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-Score: {f1:.4f}")
print(f"Test Loss: {total_test_loss/len(test_loader):.4f}")
print(f"\nConfusion Matrix:\n{cm}")

# ================================================
# ✅ 1️⃣3️⃣ DETAILED CLASSIFICATION REPORT
# ================================================
from sklearn.metrics import classification_report

print("\n📈 DETAILED CLASSIFICATION REPORT:")
target_names = ['Negative', 'Neutral', 'Positive']  # Adjust based on your labels
print(classification_report(test_labels, test_predictions, target_names=target_names))

# ================================================
# ✅ 1️⃣4️⃣ SAVE PREDICTIONS
# ================================================
results_df = pd.DataFrame({
    'true_labels': test_labels,
    'predicted_labels': test_predictions
})
results_df.to_csv('/kaggle/working/text_bilstm_predictions.csv', index=False)
print("\n💾 Predictions saved to 'text_bilstm_predictions.csv'")

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

2025-07-07 07:37:20.400067: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751873840.578360      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751873840.629843      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Train Epoch 1: 100%|██████████| 198/198 [00:18<00:00, 10.82it/s]
Validation Epoch 1: 100%|██████████| 29/29 [00:02<00:00, 14.32it/s]


Epoch [1/15] Train Loss: 1.0711 | Val Loss: 1.0376 | Val Acc: 0.5743
✅ Validation loss improved — model saved.


Train Epoch 2: 100%|██████████| 198/198 [00:17<00:00, 11.25it/s]
Validation Epoch 2: 100%|██████████| 29/29 [00:01<00:00, 14.50it/s]


Epoch [2/15] Train Loss: 0.9802 | Val Loss: 0.9545 | Val Acc: 0.5322
✅ Validation loss improved — model saved.


Train Epoch 3: 100%|██████████| 198/198 [00:17<00:00, 11.25it/s]
Validation Epoch 3: 100%|██████████| 29/29 [00:02<00:00, 14.46it/s]


Epoch [3/15] Train Loss: 0.8947 | Val Loss: 0.9116 | Val Acc: 0.5854
✅ Validation loss improved — model saved.


Train Epoch 4: 100%|██████████| 198/198 [00:17<00:00, 11.24it/s]
Validation Epoch 4: 100%|██████████| 29/29 [00:01<00:00, 14.50it/s]


Epoch [4/15] Train Loss: 0.8529 | Val Loss: 0.9051 | Val Acc: 0.6164
✅ Validation loss improved — model saved.


Train Epoch 5: 100%|██████████| 198/198 [00:17<00:00, 11.25it/s]
Validation Epoch 5: 100%|██████████| 29/29 [00:02<00:00, 14.45it/s]


Epoch [5/15] Train Loss: 0.8152 | Val Loss: 0.8769 | Val Acc: 0.6142
✅ Validation loss improved — model saved.


Train Epoch 6: 100%|██████████| 198/198 [00:17<00:00, 11.25it/s]
Validation Epoch 6: 100%|██████████| 29/29 [00:02<00:00, 14.46it/s]


Epoch [6/15] Train Loss: 0.7829 | Val Loss: 0.8729 | Val Acc: 0.6408
✅ Validation loss improved — model saved.


Train Epoch 7: 100%|██████████| 198/198 [00:17<00:00, 11.24it/s]
Validation Epoch 7: 100%|██████████| 29/29 [00:02<00:00, 14.47it/s]


Epoch [7/15] Train Loss: 0.7450 | Val Loss: 0.8658 | Val Acc: 0.6231
✅ Validation loss improved — model saved.


Train Epoch 8: 100%|██████████| 198/198 [00:17<00:00, 11.24it/s]
Validation Epoch 8: 100%|██████████| 29/29 [00:02<00:00, 14.49it/s]


Epoch [8/15] Train Loss: 0.7156 | Val Loss: 0.8764 | Val Acc: 0.6341
⏰ No improvement — patience 1/5


Train Epoch 9: 100%|██████████| 198/198 [00:17<00:00, 11.32it/s]
Validation Epoch 9: 100%|██████████| 29/29 [00:01<00:00, 14.68it/s]


Epoch [9/15] Train Loss: 0.6813 | Val Loss: 0.8825 | Val Acc: 0.6031
⏰ No improvement — patience 2/5


Train Epoch 10: 100%|██████████| 198/198 [00:17<00:00, 11.32it/s]
Validation Epoch 10: 100%|██████████| 29/29 [00:02<00:00, 14.29it/s]


Epoch [10/15] Train Loss: 0.6476 | Val Loss: 0.9079 | Val Acc: 0.6608
⏰ No improvement — patience 3/5


Train Epoch 11: 100%|██████████| 198/198 [00:17<00:00, 11.21it/s]
Validation Epoch 11: 100%|██████████| 29/29 [00:02<00:00, 14.39it/s]


Epoch [11/15] Train Loss: 0.6103 | Val Loss: 0.9158 | Val Acc: 0.6475
⏰ No improvement — patience 4/5


Train Epoch 12: 100%|██████████| 198/198 [00:17<00:00, 11.18it/s]
Validation Epoch 12: 100%|██████████| 29/29 [00:02<00:00, 14.31it/s]


Epoch [12/15] Train Loss: 0.5543 | Val Loss: 0.9439 | Val Acc: 0.6142
⏰ No improvement — patience 5/5
🛑 Early stopping triggered at epoch 12

🔍 Loading best model for final evaluation...


Final Test Evaluation: 100%|██████████| 57/57 [00:04<00:00, 14.18it/s]


📊 FINAL TEST RESULTS (Text-Only BiLSTM):
Test Accuracy: 0.6353
Test Precision: 0.6591
Test Recall: 0.6353
Test F1-Score: 0.6422
Test Loss: 0.8428

Confusion Matrix:
[[272  78  52]
 [ 69 208  76]
 [ 18  36  93]]

📈 DETAILED CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    Negative       0.76      0.68      0.71       402
     Neutral       0.65      0.59      0.62       353
    Positive       0.42      0.63      0.51       147

    accuracy                           0.64       902
   macro avg       0.61      0.63      0.61       902
weighted avg       0.66      0.64      0.64       902


💾 Predictions saved to 'text_bilstm_predictions.csv'



