In [5]:
# Install required packages
!pip install -q transformers scikit-learn matplotlib seaborn

zsh:1: command not found: pip


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# ============================================================
# DistilBERT — Leakage-Free Pipeline (Aligned with BiGRU/CNN)
# 1. Split: Train (85%) + Test (15%) ← test frozen
# 2. Grid search via 3-Fold CV on Train (85%)
# 3. Final retrain on full oversampled Train (85%)
# 4. Evaluate once on Test (15%)
# ============================================================

import os
import random
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler  # ‚úÖ Added for consistency

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup
)

# --- Reproducibility ---
# Seed every RNG used in this notebook (Python, NumPy, PyTorch) with one value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# --- Device selection ---
# Prefer CUDA, then Apple MPS, then CPU. CUDA was seeded above but previously
# never selected, so a CUDA machine silently trained on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# --- Load Data ---
df = pd.read_csv('newextendeddataset.csv', encoding='utf-8-sig')

# --- Class Mapping ---
# Category name -> integer label id. Insertion order matches the ids 0..9, so
# `class_labels[i]` is the name of label id `i`.
y_dict = {
    'self direction': 0, 'stimulation': 1, 'hedonism': 2, 'achievement': 3, 'power': 4,
    'security': 5, 'conformity': 6, 'tradition': 7, 'benevolence': 8, 'universalism': 9
}
class_labels = list(y_dict)
n_classes = len(class_labels)

# Normalise category spelling, then keep only rows whose category is a known class.
df['category'] = df['category'].str.strip().str.lower()
# `.copy()` detaches the filtered rows from the original frame so that adding
# 'label_id' below is a plain column assignment (no SettingWithCopyWarning).
df = df[df['category'].isin(y_dict)].copy()
df['label_id'] = df['category'].map(y_dict).astype(int)

print("Classes (fixed order):")
for i, name in enumerate(class_labels):
    print(f"{i} ‚Üí {name}")

# --- Split: 85% Train / 15% Test (no separate validation set) ---
texts = df['Base_Reviews'].values
y_int = df['label_id'].values

# Stratified so both parts keep the class proportions of the full data.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    texts, y_int, test_size=0.15, random_state=SEED, stratify=y_int
)

print(f"\n‚úÖ Final Splits: Train (85%): {len(X_train_raw)}, Test (15%): {len(X_test_raw)}")

# --- Tokenizer: pretrained vocabulary, loaded once and reused for every split ---
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_texts(texts, max_len):
    """Encode an iterable of texts as fixed-length PyTorch tensors.

    Every item is converted with ``str()`` before encoding. Sequences are
    truncated and padded to exactly ``max_len`` tokens.

    Args:
        texts: iterable of items to encode.
        max_len: target sequence length.

    Returns:
        The tokenizer's output; callers read 'input_ids' and 'attention_mask'
        from it.
    """
    as_strings = list(map(str, texts))
    encode_kwargs = dict(
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
        return_attention_mask=True,
    )
    return tokenizer(as_strings, **encode_kwargs)

# --- Dataset wrapper ---
class TextDS(Dataset):
    """Map-style dataset pairing pre-tokenized inputs with integer labels."""

    def __init__(self, toks, labels):
        # toks: mapping that exposes 'input_ids' and 'attention_mask'.
        self.ids, self.msk = toks['input_ids'], toks['attention_mask']
        # Labels are stored as a long tensor.
        self.y = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        # The dataset length is the number of labels.
        return len(self.y)

    def __getitem__(self, i):
        # One sample, keyed the way the training/eval loops read a batch.
        sample = dict(
            input_ids=self.ids[i],
            attention_mask=self.msk[i],
            labels=self.y[i],
        )
        return sample

# --- Train/Eval utilities ---
def train_one_run(train_loader, val_loader, epochs, lr, weight_decay=0.01):
    """Fine-tune a fresh DistilBERT classifier and keep its best-validation weights.

    A new ``distilbert-base-uncased`` sequence classifier with ``n_classes``
    labels is created on every call and moved to the module-level ``device``.
    It is optimised with AdamW under a linear schedule with zero warmup steps
    and evaluated on ``val_loader`` after every epoch.

    Args:
        train_loader: iterable of batches (dicts with 'input_ids',
            'attention_mask' and 'labels' tensors) used for optimisation.
        val_loader: iterable of batches in the same format, evaluated once per
            epoch without gradients.
        epochs: number of passes over ``train_loader``.
        lr: learning rate handed to AdamW.
        weight_decay: weight decay handed to AdamW.

    Returns:
        tuple ``(model, best_val_acc, history)`` where ``model`` carries the
        weights of the epoch with the highest validation accuracy,
        ``best_val_acc`` is that accuracy (``-1.0`` if no epoch ran) and
        ``history`` maps 'train_loss', 'val_loss', 'train_acc', 'val_acc' to
        per-epoch lists.
    """
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', num_labels=n_classes
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # The scheduler is stepped once per batch, so it needs the total batch count.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    best_val_acc = -1.0
    best_state = None
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

    for _ in range(epochs):
        # ---- Train ----
        model.train()
        tr_loss, tr_correct, tr_seen = 0.0, 0, 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = batch['input_ids'].to(device)
            masks = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            tr_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            tr_correct += (preds == labels).sum().item()
            tr_seen += labels.size(0)

        # Loss is averaged per batch, accuracy per sample. Both denominators
        # are guarded so an empty loader yields 0.0 instead of ZeroDivisionError.
        avg_tr_loss = tr_loss / max(1, len(train_loader))
        avg_tr_acc = tr_correct / max(1, tr_seen)
        history['train_loss'].append(avg_tr_loss)
        history['train_acc'].append(avg_tr_acc)

        # ---- Validate ----
        model.eval()
        val_loss, val_correct, val_seen = 0.0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['input_ids'].to(device)
                masks = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(inputs, attention_mask=masks, labels=labels)
                val_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                val_correct += (preds == labels).sum().item()
                val_seen += labels.size(0)

        avg_val_loss = val_loss / max(1, len(val_loader))
        avg_val_acc = val_correct / max(1, val_seen)
        history['val_loss'].append(avg_val_loss)
        history['val_acc'].append(avg_val_acc)

        # Snapshot on strict improvement only (ties keep the earlier epoch);
        # the copy lives on the CPU so it does not hold device memory.
        if avg_val_acc > best_val_acc + 1e-6:
            best_val_acc = avg_val_acc
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    # Restore the best snapshot so the returned model matches best_val_acc.
    if best_state is not None:
        model.load_state_dict({k: v.to(device) for k, v in best_state.items()})
    return model, best_val_acc, history

def evaluate_on_loader(model, data_loader):
    """Run ``model`` over ``data_loader`` without gradients and collect metrics.

    Args:
        model: classifier whose forward pass accepts input ids, an
            ``attention_mask`` and ``labels`` and returns ``.loss``/``.logits``.
        data_loader: iterable of batches (dicts with 'input_ids',
            'attention_mask' and 'labels' tensors).

    Returns:
        tuple ``(acc, avg_loss, y_true, y_pred)``: sample-level accuracy, loss
        averaged per batch, and NumPy arrays of true and predicted label ids.
        For an empty loader both metrics are ``0.0`` and the arrays are empty.
    """
    model.eval()
    y_true, y_pred = [], []
    total_loss = 0.0
    with torch.no_grad():
        for batch in data_loader:
            inputs = batch['input_ids'].to(device)
            masks = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs, attention_mask=masks, labels=labels)
            total_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    # Guard the empty case: the mean of an empty array is NaN (with a warning)
    # and len(data_loader) == 0 would raise ZeroDivisionError.
    acc = (y_true == y_pred).mean() if y_true.size else 0.0
    avg_loss = total_loss / max(1, len(data_loader))
    return acc, avg_loss, y_true, y_pred

# =========================
# GRID SEARCH + 3-FOLD CV ON TRAIN (85%)
# =========================
print(f"\nüîç Grid Search + 3-Fold CV on TRAIN (85%)...")

# Hyperparameter grid: every (lr, batch size, epochs) combination is scored.
LR_LIST = [1e-5, 2e-5, 3e-5]
BATCH_LIST = [8, 16]
EPOCHS_LIST = [6, 8]
MAXLEN = 128

grid_results = []
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

# Fold preparation (split -> oversample -> tokenize) does not depend on lr,
# batch size or epochs, so it is done once per fold here instead of once per
# fold per configuration. The splitter and the oversampler both use a fixed
# random_state, so every configuration still sees the same fold data.
fold_datasets = []
for tr_idx, val_idx in skf.split(X_train_raw, y_train_raw):
    X_tr_raw, y_tr_raw = X_train_raw[tr_idx], y_train_raw[tr_idx]
    X_v_raw, y_v_raw = X_train_raw[val_idx], y_train_raw[val_idx]

    # Oversample ONLY the fold-train part; the validation part is left as-is.
    ros = RandomOverSampler(random_state=SEED)
    X_tr_res, y_tr_res = ros.fit_resample(X_tr_raw.reshape(-1, 1), y_tr_raw)
    X_tr_res = X_tr_res.flatten()  # back to a 1-D array of texts

    train_ds = TextDS(tokenize_texts(X_tr_res, MAXLEN), y_tr_res)
    val_ds = TextDS(tokenize_texts(X_v_raw, MAXLEN), y_v_raw)
    fold_datasets.append((train_ds, val_ds))

for lr in LR_LIST:
    for batch_size in BATCH_LIST:
        for epochs in EPOCHS_LIST:
            print(f"  ‚Üí Trying: lr={lr}, bs={batch_size}, ep={epochs}")
            cv_scores = []

            for fold, (train_ds, val_ds) in enumerate(fold_datasets, 1):
                # Loaders are rebuilt per configuration because batch_size varies.
                train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
                val_loader = DataLoader(val_ds, batch_size=batch_size)

                # NOTE: the score is the best per-epoch validation accuracy that
                # train_one_run returns, not the accuracy after the last epoch.
                model, val_acc, _ = train_one_run(
                    train_loader, val_loader, epochs=epochs, lr=lr
                )
                cv_scores.append(val_acc)

            mean_cv = np.mean(cv_scores)
            std_cv = np.std(cv_scores)
            print(f"    ‚Üí CV Acc: {mean_cv:.4f} ¬± {std_cv:.4f}")

            grid_results.append({
                'lr': lr, 'batch_size': batch_size, 'epochs': epochs,
                'cv_mean_acc': mean_cv, 'cv_std_acc': std_cv
            })

# Select the configuration with the highest mean CV accuracy.
grid_df = pd.DataFrame(grid_results).sort_values('cv_mean_acc', ascending=False)
best = grid_df.iloc[0]
print(f"\n‚úÖ Best HPs by 3-Fold CV Acc ({best['cv_mean_acc']:.4f} ¬± {best['cv_std_acc']:.4f}):")
print({k: v for k, v in best.items() if k not in ['cv_mean_acc', 'cv_std_acc']})

# =========================
# 5-FOLD CV ON TRAIN (85%) WITH BEST HPs (Final Estimate)
# =========================
print(f"\nüöÄ 5-Fold CV on TRAIN (85%) with best HPs...")

skf_final = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cv_results = []

for fold, (tr_idx, val_idx) in enumerate(skf_final.split(X_train_raw, y_train_raw), 1):
    print(f"\n--- Fold {fold}/5 ---")

    # Carve this fold's train / validation parts out of the 85% train split.
    X_tr_raw, y_tr_raw = X_train_raw[tr_idx], y_train_raw[tr_idx]
    X_v_raw, y_v_raw = X_train_raw[val_idx], y_train_raw[val_idx]

    # Only the fold-train part is oversampled; validation is left untouched.
    ros = RandomOverSampler(random_state=SEED)
    X_tr_res, y_tr_res = ros.fit_resample(X_tr_raw.reshape(-1, 1), y_tr_raw)
    X_tr_res = X_tr_res.flatten()

    tok_tr = tokenize_texts(X_tr_res, MAXLEN)
    tok_v = tokenize_texts(X_v_raw, MAXLEN)
    train_ds = TextDS(tok_tr, y_tr_res)
    val_ds = TextDS(tok_v, y_v_raw)

    # `best` is a pandas row, so its values are cast back to plain Python numbers.
    fold_bs = int(best['batch_size'])
    train_loader = DataLoader(train_ds, shuffle=True, batch_size=fold_bs)
    val_loader = DataLoader(val_ds, batch_size=fold_bs)

    model, val_acc, _ = train_one_run(
        train_loader,
        val_loader,
        lr=float(best['lr']),
        epochs=int(best['epochs']),
    )
    print(f"  ‚Üí Fold {fold} Val Acc: {val_acc:.4f}")
    cv_results.append(dict(fold=fold, val_acc=val_acc))

# Summarize CV: mean and pandas' default standard deviation over the folds.
cv_df = pd.DataFrame(cv_results)
mean_cv, std_cv = cv_df['val_acc'].mean(), cv_df['val_acc'].std()
print(f"\nüìä 5-Fold CV on TRAIN (85%): {mean_cv:.4f} ¬± {std_cv:.4f}")

# =========================
# FINAL MODEL: Retrain on FULL OVERSAMPLED TRAIN (85%)
# =========================
print(f"\nüéØ Retraining final DistilBERT on FULL TRAIN (85%)...")

# Oversample the whole train split; the test split is never resampled.
ros_final = RandomOverSampler(random_state=SEED)
X_train_res, y_train_res = ros_final.fit_resample(X_train_raw.reshape(-1, 1), y_train_raw)
X_train_res = X_train_res.flatten()

# Tokenize both splits and wrap them as datasets.
tok_train_final = tokenize_texts(X_train_res, MAXLEN)
tok_test_final = tokenize_texts(X_test_raw, MAXLEN)
train_ds_final = TextDS(tok_train_final, y_train_res)
test_ds_final = TextDS(tok_test_final, y_test)

final_bs = int(best['batch_size'])
train_loader_final = DataLoader(train_ds_final, batch_size=final_bs, shuffle=True)
test_loader_final = DataLoader(test_ds_final, batch_size=final_bs)

# Monitoring loader for train_one_run's per-epoch validation and best-epoch
# selection: the first 500 rows of the raw train split.
# NOTE(review): these rows come from X_train_raw, the same array the training
# set was resampled from, so the 'val' metrics of this run are presumably
# measured on data the model also trains on - confirm this is intended.
monitor_ds = TextDS(tokenize_texts(X_train_raw[:500], MAXLEN), y_train_raw[:500])
monitor_loader = DataLoader(monitor_ds, batch_size=final_bs)

# Train final model with the selected hyperparameters.
final_model, _, final_history = train_one_run(
    train_loader_final,
    monitor_loader,
    epochs=int(best['epochs']),
    lr=float(best['lr']),
)

# Save model and tokenizer side by side so the directory can be reloaded as a unit.
SAVE_DIR = './distilbert_best_cv'
os.makedirs(SAVE_DIR, exist_ok=True)
for artifact in (final_model, tokenizer):
    artifact.save_pretrained(SAVE_DIR)
print(f"\n‚úÖ Saved best model + tokenizer to: {SAVE_DIR}")

# =========================
# FINAL EVALUATION ON TEST SET (15%)
# =========================
test_acc, test_loss, y_true_test, y_pred_test = evaluate_on_loader(final_model, test_loader_final)
print(f"\nüü© FINAL TEST ACCURACY (Held-Out, Natural Distribution): {test_acc:.4f}")

# Pin the label set explicitly. Without `labels=`, scikit-learn derives it from
# the values present in y_true/y_pred, so a class missing from both would make
# `target_names` mismatch and shift the heatmap tick labels.
label_ids = list(range(n_classes))

print("\n=== FINAL CLASSIFICATION REPORT (TEST SET) ===")
print(classification_report(
    y_true_test, y_pred_test,
    labels=label_ids, target_names=class_labels, digits=4
))

# Confusion Matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_true_test, y_pred_test, labels=label_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('DistilBERT (10-Class) Confusion Matrix ‚Äî Final Test')
plt.xlabel('Predicted'); plt.ylabel('True')
plt.tight_layout()
plt.savefig('DistilBERT_10class_ConfusionMatrix_Test.png', dpi=300, bbox_inches='tight')
plt.show()

# Learning Curves: per-epoch loss and accuracy recorded by the final training run.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(13, 5))
epochs = range(1, len(final_history['train_loss']) + 1)

# (axes, title, train key, val key, train legend label, val legend label)
panels = (
    (ax_loss, 'Loss', 'train_loss', 'val_loss', 'Train Loss', 'Val Loss'),
    (ax_acc, 'Accuracy', 'train_acc', 'val_acc', 'Train Acc', 'Val Acc'),
)
for ax, panel_title, train_key, val_key, train_label, val_label in panels:
    ax.plot(epochs, final_history[train_key], 'b-o', label=train_label)
    ax.plot(epochs, final_history[val_key], 'r-s', label=val_label)
    ax.set_title(panel_title)
    ax.set_xlabel('Epoch')
    ax.legend()
    ax.grid(True)

fig.suptitle('DistilBERT Final Training')
fig.tight_layout()
fig.savefig('DistilBERT_10class_LearningCurves.png', dpi=300, bbox_inches='tight')
plt.show()

Classes (fixed order):
0 ‚Üí self direction
1 ‚Üí stimulation
2 ‚Üí hedonism
3 ‚Üí achievement
4 ‚Üí power
5 ‚Üí security
6 ‚Üí conformity
7 ‚Üí tradition
8 ‚Üí benevolence
9 ‚Üí universalism

‚úÖ Final Splits: Train (85%): 9469, Test (15%): 1671

üîç Grid Search + 3-Fold CV on TRAIN (85%)...
  ‚Üí Trying: lr=1e-05, bs=8, ep=6


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-strea

    ‚Üí CV Acc: 0.7356 ¬± 0.0015
  ‚Üí Trying: lr=1e-05, bs=8, ep=8


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-strea

    ‚Üí CV Acc: 0.7375 ¬± 0.0004
  ‚Üí Trying: lr=1e-05, bs=16, ep=6


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-strea

    ‚Üí CV Acc: 0.7261 ¬± 0.0075
  ‚Üí Trying: lr=1e-05, bs=16, ep=8


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-strea

    ‚Üí CV Acc: 0.7288 ¬± 0.0022
  ‚Üí Trying: lr=2e-05, bs=8, ep=6


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-strea

    ‚Üí CV Acc: 0.7434 ¬± 0.0053
  ‚Üí Trying: lr=2e-05, bs=8, ep=8


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-strea

    ‚Üí CV Acc: 0.7466 ¬± 0.0042
  ‚Üí Trying: lr=2e-05, bs=16, ep=6


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-strea

    ‚Üí CV Acc: 0.7355 ¬± 0.0050
  ‚Üí Trying: lr=2e-05, bs=16, ep=8


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-strea

    ‚Üí CV Acc: 0.7473 ¬± 0.0035
  ‚Üí Trying: lr=3e-05, bs=8, ep=6


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-strea

    ‚Üí CV Acc: 0.7477 ¬± 0.0025
  ‚Üí Trying: lr=3e-05, bs=8, ep=8


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# ============================================================
# BERT (base-uncased) — Leakage-Free Pipeline (Aligned with BiGRU/CNN/DistilBERT)
# 1. Split: Train (85%) + Test (15%) ← test frozen
# 2. Grid search via 3-Fold CV on Train (85%)
# 3. Final retrain on full oversampled Train (85%)
# 4. Evaluate once on Test (15%)
# ============================================================

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler  # ‚úÖ Added for consistency

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup
)

# --- Reproducibility ---
# Seed every RNG used in this cell (Python, NumPy, PyTorch) with one value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# --- Device selection ---
# Prefer CUDA, then Apple MPS, then CPU. CUDA was seeded above but previously
# never selected, so a CUDA machine silently trained on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# --- Load & Map Labels ---
df = pd.read_csv('newextendeddataset.csv', encoding='utf-8-sig')

# Category name -> integer label id. Insertion order matches the ids 0..9, so
# `class_labels[i]` is the name of label id `i`.
y_dict = {
    'self direction': 0, 'stimulation': 1, 'hedonism': 2, 'achievement': 3, 'power': 4,
    'security': 5, 'conformity': 6, 'tradition': 7, 'benevolence': 8, 'universalism': 9
}
class_labels = list(y_dict)
n_classes = len(class_labels)

# Normalise category spelling, then keep only rows whose category is a known class.
df['category'] = df['category'].str.strip().str.lower()
# `.copy()` detaches the filtered rows from the original frame so that adding
# 'label_id' below is a plain column assignment (no SettingWithCopyWarning).
df = df[df['category'].isin(y_dict)].copy()
df['label_id'] = df['category'].map(y_dict).astype(int)

# --- Split: 85% Train / 15% Test (no separate validation set) ---
texts = df['Base_Reviews'].values
y_int = df['label_id'].values

# Stratified so both parts keep the class proportions of the full data.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    texts, y_int, test_size=0.15, random_state=SEED, stratify=y_int
)

print(f"\n‚úÖ Final Splits: Train (85%): {len(X_train_raw)}, Test (15%): {len(X_test_raw)}")

# --- Tokenizer: pretrained vocabulary, loaded once and reused for every split ---
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize_texts(texts, max_len):
    """Encode an iterable of texts as fixed-length PyTorch tensors.

    Every item is converted with ``str()`` before encoding. Sequences are
    truncated and padded to exactly ``max_len`` tokens.

    Args:
        texts: iterable of items to encode.
        max_len: target sequence length.

    Returns:
        The tokenizer's output; callers read 'input_ids' and 'attention_mask'
        from it.
    """
    as_strings = list(map(str, texts))
    encode_kwargs = dict(
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt',
        return_attention_mask=True,
    )
    return tokenizer(as_strings, **encode_kwargs)

# --- Dataset wrapper ---
class TextDS(Dataset):
    """Map-style dataset pairing pre-tokenized inputs with integer labels."""

    def __init__(self, toks, labels):
        # toks: mapping that exposes 'input_ids' and 'attention_mask'.
        self.ids, self.msk = toks['input_ids'], toks['attention_mask']
        # Labels are stored as a long tensor.
        self.y = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        # The dataset length is the number of labels.
        return len(self.y)

    def __getitem__(self, i):
        # One sample, keyed the way the training/eval loops read a batch.
        sample = dict(
            input_ids=self.ids[i],
            attention_mask=self.msk[i],
            labels=self.y[i],
        )
        return sample

# --- Train/Eval utilities ---
def train_one_run(train_loader, val_loader, epochs, lr, weight_decay=0.01):
    """Fine-tune a fresh BERT classifier and keep its best-validation weights.

    A new ``bert-base-uncased`` sequence classifier with ``n_classes`` labels
    is created on every call and moved to the module-level ``device``. It is
    optimised with AdamW under a linear schedule with zero warmup steps and
    evaluated on ``val_loader`` after every epoch.

    Args:
        train_loader: iterable of batches (dicts with 'input_ids',
            'attention_mask' and 'labels' tensors) used for optimisation.
        val_loader: iterable of batches in the same format, evaluated once per
            epoch without gradients.
        epochs: number of passes over ``train_loader``.
        lr: learning rate handed to AdamW.
        weight_decay: weight decay handed to AdamW.

    Returns:
        tuple ``(model, best_val_acc, history)`` where ``model`` carries the
        weights of the epoch with the highest validation accuracy,
        ``best_val_acc`` is that accuracy (``-1.0`` if no epoch ran) and
        ``history`` maps 'train_loss', 'val_loss', 'train_acc', 'val_acc' to
        per-epoch lists.
    """
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=n_classes
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # The scheduler is stepped once per batch, so it needs the total batch count.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    best_val_acc = -1.0
    best_state = None
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

    for _ in range(epochs):
        # ---- Train ----
        model.train()
        tr_loss, tr_correct, tr_seen = 0.0, 0, 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = batch['input_ids'].to(device)
            masks = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            tr_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            tr_correct += (preds == labels).sum().item()
            tr_seen += labels.size(0)

        # Loss is averaged per batch, accuracy per sample. Both denominators
        # are guarded so an empty loader yields 0.0 instead of ZeroDivisionError.
        avg_tr_loss = tr_loss / max(1, len(train_loader))
        avg_tr_acc = tr_correct / max(1, tr_seen)
        history['train_loss'].append(avg_tr_loss)
        history['train_acc'].append(avg_tr_acc)

        # ---- Validate ----
        model.eval()
        val_loss, val_correct, val_seen = 0.0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['input_ids'].to(device)
                masks = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(inputs, attention_mask=masks, labels=labels)
                val_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                val_correct += (preds == labels).sum().item()
                val_seen += labels.size(0)

        avg_val_loss = val_loss / max(1, len(val_loader))
        avg_val_acc = val_correct / max(1, val_seen)
        history['val_loss'].append(avg_val_loss)
        history['val_acc'].append(avg_val_acc)

        # Snapshot on strict improvement only (ties keep the earlier epoch);
        # the copy lives on the CPU so it does not hold device memory.
        if avg_val_acc > best_val_acc + 1e-6:
            best_val_acc = avg_val_acc
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    # Restore the best snapshot so the returned model matches best_val_acc.
    if best_state is not None:
        model.load_state_dict({k: v.to(device) for k, v in best_state.items()})
    return model, best_val_acc, history

def evaluate_on_loader(model, data_loader):
    """Run `model` over `data_loader` in eval mode and collect metrics.

    Each batch must be a mapping with 'input_ids', 'attention_mask' and
    'labels' tensors. The model is called as
    ``model(input_ids, attention_mask=..., labels=...)`` and must return an
    object exposing ``.loss`` and ``.logits``; predictions are the argmax of
    the logits along dim 1.

    Args:
        model: Module to evaluate. It is switched to eval mode and left there.
        data_loader: Any iterable of batches (a ``DataLoader``, a list, a
            generator, ...). It does not need to support ``len()``.

    Returns:
        Tuple ``(acc, avg_loss, y_true, y_pred)``:
          * acc      - fraction of samples whose prediction equals the label.
          * avg_loss - mean of the per-batch losses (each batch weighs the
                       same, matching the validation loss in the training loop).
          * y_true   - 1-D numpy array of labels in iteration order.
          * y_pred   - 1-D numpy array of predicted class ids.
        For an empty loader both metrics are NaN and both arrays are empty,
        instead of failing with ZeroDivisionError.
    """
    model.eval()

    # Send batches to wherever the model's weights actually live, so the call
    # also works for a model that is not on the notebook-wide `device`.
    # Objects without parameters fall back to the global `device`.
    param_iter = model.parameters() if hasattr(model, "parameters") else iter(())
    first_param = next(param_iter, None)
    target_device = first_param.device if first_param is not None else device

    true_chunks, pred_chunks = [], []
    total_loss = 0.0
    n_batches = 0  # counted explicitly so iterables without __len__ work too
    with torch.no_grad():
        for batch in data_loader:
            inputs = batch['input_ids'].to(target_device)
            masks = batch['attention_mask'].to(target_device)
            labels = batch['labels'].to(target_device)
            outputs = model(inputs, attention_mask=masks, labels=labels)
            total_loss += outputs.loss.item()
            n_batches += 1
            preds = torch.argmax(outputs.logits, dim=1)
            true_chunks.append(labels.cpu().numpy())
            pred_chunks.append(preds.cpu().numpy())

    if n_batches == 0:
        # Nothing was evaluated: report "undefined" rather than crashing.
        return (float('nan'), float('nan'),
                np.array([], dtype=np.int64), np.array([], dtype=np.int64))

    y_true = np.concatenate(true_chunks)
    y_pred = np.concatenate(pred_chunks)
    acc = (y_true == y_pred).mean()
    avg_loss = total_loss / n_batches
    return acc, avg_loss, y_true, y_pred

# =========================
# GRID SEARCH + 3-FOLD CV ON TRAIN (85%)
# =========================
# Each (lr, batch_size, epochs) combination is trained on three stratified
# folds of the training split and ranked by the mean of the per-fold scores.
print(f"\nüîç Grid Search + 3-Fold CV on TRAIN (85%)...")

# Candidate values; MAXLEN is the length argument handed to tokenize_texts.
LR_LIST = [2e-5, 3e-5, 5e-5]
BATCH_LIST = [8, 16]
EPOCHS_LIST = [6]
MAXLEN = 128

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
grid_results = []

# Same visiting order as three nested loops: lr outermost, epochs innermost.
hp_combos = [
    (cand_lr, cand_bs, cand_ep)
    for cand_lr in LR_LIST
    for cand_bs in BATCH_LIST
    for cand_ep in EPOCHS_LIST
]

for lr, batch_size, epochs in hp_combos:
    print(f"  ‚Üí Trying: lr={lr}, bs={batch_size}, ep={epochs}")
    cv_scores = []

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train_raw, y_train_raw), start=1):
        # Raw texts / labels of this fold.
        X_tr_raw, X_v_raw = X_train_raw[tr_idx], X_train_raw[val_idx]
        y_tr_raw, y_v_raw = y_train_raw[tr_idx], y_train_raw[val_idx]

        # Oversample the fold's training part only; the validation part keeps
        # its original class distribution.
        ros = RandomOverSampler(random_state=SEED)
        X_tr_res, y_tr_res = ros.fit_resample(X_tr_raw.reshape(-1, 1), y_tr_raw)
        X_tr_res = X_tr_res.flatten()

        # Tokenise the oversampled training texts and the untouched validation texts.
        tok_tr = tokenize_texts(X_tr_res, MAXLEN)
        tok_v = tokenize_texts(X_v_raw, MAXLEN)

        train_ds = TextDS(tok_tr, y_tr_res)
        val_ds = TextDS(tok_v, y_v_raw)
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=batch_size)

        # The second return value is the best validation accuracy reached
        # during the run; that is the score recorded for this fold.
        model, val_acc, _ = train_one_run(train_loader, val_loader, epochs=epochs, lr=lr)
        cv_scores.append(val_acc)

    mean_cv, std_cv = np.mean(cv_scores), np.std(cv_scores)
    print(f"    ‚Üí CV Acc: {mean_cv:.4f} ¬± {std_cv:.4f}")

    grid_results.append(dict(
        lr=lr, batch_size=batch_size, epochs=epochs,
        cv_mean_acc=mean_cv, cv_std_acc=std_cv,
    ))

# Rank all combinations by mean CV accuracy and keep the top row.
grid_df = pd.DataFrame(grid_results).sort_values('cv_mean_acc', ascending=False)
best = grid_df.iloc[0]
print(f"\n‚úÖ Best HPs by 3-Fold CV Acc ({best['cv_mean_acc']:.4f} ¬± {best['cv_std_acc']:.4f}):")
metric_cols = ['cv_mean_acc', 'cv_std_acc']
print({k: v for k, v in best.items() if k not in metric_cols})

# =========================
# 5-FOLD CV ON TRAIN (85%) WITH BEST HPs (Final Estimate)
# =========================
# Re-scores the selected hyper-parameters on five stratified folds of the
# training split. The per-fold score is the second value returned by
# train_one_run, i.e. the best validation accuracy seen during that run.
print(f"\nüöÄ 5-Fold CV on TRAIN (85%) with best HPs...")

skf_final = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cv_results = []

# `best` is a row of the grid-search table; cast its entries once up front.
cv_batch_size = int(best['batch_size'])
cv_epochs = int(best['epochs'])
cv_lr = float(best['lr'])

for fold, (tr_idx, val_idx) in enumerate(skf_final.split(X_train_raw, y_train_raw), start=1):
    print(f"\n--- Fold {fold}/5 ---")

    X_tr_raw, X_v_raw = X_train_raw[tr_idx], X_train_raw[val_idx]
    y_tr_raw, y_v_raw = y_train_raw[tr_idx], y_train_raw[val_idx]

    # Only the fold's training part is oversampled.
    ros = RandomOverSampler(random_state=SEED)
    X_tr_res, y_tr_res = ros.fit_resample(X_tr_raw.reshape(-1, 1), y_tr_raw)
    X_tr_res = X_tr_res.flatten()

    tok_tr = tokenize_texts(X_tr_res, MAXLEN)
    tok_v = tokenize_texts(X_v_raw, MAXLEN)

    train_ds = TextDS(tok_tr, y_tr_res)
    val_ds = TextDS(tok_v, y_v_raw)
    train_loader = DataLoader(train_ds, batch_size=cv_batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=cv_batch_size)

    model, val_acc, _ = train_one_run(train_loader, val_loader, epochs=cv_epochs, lr=cv_lr)
    print(f"  ‚Üí Fold {fold} Val Acc: {val_acc:.4f}")
    cv_results.append(dict(fold=fold, val_acc=val_acc))

# Summarize CV
# NOTE(review): Series.std() is the sample standard deviation (ddof=1), while
# the grid search reports np.std (ddof=0), so the two "±" values are not
# computed the same way - confirm which convention is intended.
cv_df = pd.DataFrame(cv_results)
fold_scores = cv_df['val_acc']
mean_cv = fold_scores.mean()
std_cv = fold_scores.std()
print(f"\nüìä 5-Fold CV on TRAIN (85%): {mean_cv:.4f} ¬± {std_cv:.4f}")

# =========================
# FINAL MODEL: Retrain on FULL OVERSAMPLED TRAIN (85%)
# =========================
# Trains one model with the selected hyper-parameters on the whole
# (oversampled) training split, then writes model and tokenizer to disk.
print(f"\nüéØ Retraining final BERT on FULL TRAIN (85%)...")

# Oversampling is fitted on the training split only.
ros_final = RandomOverSampler(random_state=SEED)
X_train_res, y_train_res = ros_final.fit_resample(X_train_raw.reshape(-1, 1), y_train_raw)
X_train_res = X_train_res.flatten()

# Tokenize
tok_train_final = tokenize_texts(X_train_res, MAXLEN)
tok_test_final = tokenize_texts(X_test_raw, MAXLEN)

train_ds_final = TextDS(tok_train_final, y_train_res)
# NOTE(review): test texts come from `X_test_raw` but labels from `y_test`;
# confirm both originate from the same split.
test_ds_final = TextDS(tok_test_final, y_test)

final_batch_size = int(best['batch_size'])
train_loader_final = DataLoader(train_ds_final, batch_size=final_batch_size, shuffle=True)
test_loader_final = DataLoader(test_ds_final, batch_size=final_batch_size)

# train_one_run needs a second loader for its per-epoch evaluation and
# best-checkpoint selection. Here it is fed the first 500 rows of the
# training split itself (not oversampled).
# NOTE(review): these rows are part of the training data, so the 'val_*'
# entries of final_history are training-subset metrics, and the restored
# checkpoint is the epoch that fits those rows best - not a held-out
# validation result. Decide whether a real hold-out is wanted here.
monitor_ds = TextDS(tokenize_texts(X_train_raw[:500], MAXLEN), y_train_raw[:500])
monitor_loader = DataLoader(monitor_ds, batch_size=final_batch_size)

# Train final model
final_model, _, final_history = train_one_run(
    train_loader_final,
    monitor_loader,
    epochs=int(best['epochs']),
    lr=float(best['lr']),
)

# Save model weights/config and tokenizer files side by side.
SAVE_DIR = './bert_best_cv'
os.makedirs(SAVE_DIR, exist_ok=True)
final_model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"\n‚úÖ Saved best BERT model + tokenizer to: {SAVE_DIR}")

# =========================
# FINAL EVALUATION ON TEST SET (15%)
# =========================
# Scores the final model once on the held-out test loader, prints a per-class
# report and saves a confusion-matrix heatmap. `test_loss` is returned by
# evaluate_on_loader but not used further in this section.
test_acc, test_loss, y_true_test, y_pred_test = evaluate_on_loader(final_model, test_loader_final)
print(f"\nüü© FINAL TEST ACCURACY (Held-Out, Natural Distribution): {test_acc:.4f}")

# Pin the label set to every class id (0 .. len(class_labels) - 1, the ids
# assigned by y_dict). Without it scikit-learn infers the labels from the
# data: if a class is missing from both y_true and y_pred,
# classification_report raises ValueError because target_names no longer
# matches, and confusion_matrix shrinks so the heatmap tick labels would be
# attached to the wrong rows/columns.
label_ids = list(range(len(class_labels)))

print("\n=== FINAL CLASSIFICATION REPORT (TEST SET) ===")
print(classification_report(
    y_true_test, y_pred_test,
    labels=label_ids, target_names=class_labels, digits=4,
))

# Confusion Matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_true_test, y_pred_test, labels=label_ids)
fig_cm, ax_cm = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax_cm)
ax_cm.set_title('BERT (10-Class) Confusion Matrix ‚Äî Final Test')
ax_cm.set_xlabel('Predicted'); ax_cm.set_ylabel('True')
fig_cm.tight_layout()
fig_cm.savefig('BERT_10class_ConfusionMatrix_Test.png', dpi=300, bbox_inches='tight')
plt.show()

# Learning Curves
# Plots the per-epoch history of the final training run: loss on the left,
# accuracy on the right, then saves the figure as a PNG.
#
# The 'val_loss' / 'val_acc' series of final_history were measured on the
# loader passed as val_loader to the final train_one_run call, which is built
# from X_train_raw[:500] - rows of the training split. They are therefore
# labelled as a training-subset monitor, not as validation curves.
fig_lc, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(13, 5))
epochs = range(1, len(final_history['train_loss']) + 1)

ax_loss.plot(epochs, final_history['train_loss'], 'b-o', label='Train Loss')
ax_loss.plot(epochs, final_history['val_loss'], 'r-s', label='Monitor Loss (train subset)')
ax_loss.set_title('Loss'); ax_loss.set_xlabel('Epoch'); ax_loss.legend(); ax_loss.grid(True)

ax_acc.plot(epochs, final_history['train_acc'], 'b-o', label='Train Acc')
ax_acc.plot(epochs, final_history['val_acc'], 'r-s', label='Monitor Acc (train subset)')
ax_acc.set_title('Accuracy'); ax_acc.set_xlabel('Epoch'); ax_acc.legend(); ax_acc.grid(True)

fig_lc.suptitle('BERT Final Training')
fig_lc.tight_layout()
fig_lc.savefig('BERT_10class_LearningCurves.png', dpi=300, bbox_inches='tight')
plt.show()