<a href="https://colab.research.google.com/github/REZAKAZAZI00/nbert/blob/main/nbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers datasets torch accelerate evaluate scikit-learn nltk


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import os
import re
import json
import math
from pathlib import Path
from typing import Dict, List, Any, Optional

import torch
from torch import nn
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    AutoConfig,
    set_seed,
)
from datasets import load_dataset, DatasetDict, load_from_disk
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score
import evaluate

import nltk
# Tokenizer and lemmatizer resources used by `word_tokenize` / `WordNetLemmatizer`.
nltk.download("punkt")
# Recent NLTK releases load the "punkt_tab" resource for `word_tokenize`
# instead of the pickled "punkt" models; fetch both so tokenization works
# regardless of the installed NLTK version.
nltk.download("punkt_tab")
nltk.download("wordnet")
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Config: run-wide constants read by the cells below.
MODEL_NAME = "bert-base-uncased"  # checkpoint name passed to AutoTokenizer / the model wrapper
DATASET_NAME = "nbertagnolli/counsel-chat"  # Hugging Face dataset id passed to load_dataset
NRC_PATH = "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"  # must exist locally
OUTPUT_DIR = "./nbert_counsel_run"  # Trainer output directory; the final model is saved here too
MAX_LEN = 256  # max_length used when tokenizing (longer inputs are truncated)
BATCH_SIZE = 16  # per-device batch size for both training and evaluation
LR = 3e-5  # learning rate handed to TrainingArguments
EPOCHS = 10  # number of training epochs
SEED = 42  # seed for set_seed and for the train/test split
set_seed(SEED)

In [None]:
# Emotion/sentiment categories kept from the NRC lexicon (common set).
# The order matters: a category's position in this list is its index in the
# vectors produced by sentence_to_nrc_vector, and the list length is the
# NRC feature dimension given to the model.
NRC_EMOTIONS = ["anger","anticipation","disgust","fear","joy","sadness","surprise","trust","positive","negative"]

# -----------------------------
# Helper: load NRC lexicon
# -----------------------------
def load_nrc_lexicon(path: str) -> Dict[str, List[str]]:
    """
    Read the NRC word-level lexicon and return a mapping word -> emotions.

    Each usable line has exactly three tab-separated fields:
    word, emotion, association flag. Only rows whose flag is the string '1'
    contribute; rows with any other field count are ignored. Emotions are
    kept in file order for each word.

    Raises FileNotFoundError when `path` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"NRC lexicon not found at {path}. Please download and place it there.")

    word_to_emotions: Dict[str, List[str]] = {}
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            # Keep only well-formed rows that flag a positive association.
            if len(fields) == 3 and fields[2] == '1':
                term, emotion_name = fields[0], fields[1]
                if term not in word_to_emotions:
                    word_to_emotions[term] = []
                word_to_emotions[term].append(emotion_name)
    return word_to_emotions

# Load the lexicon once at module level; this raises FileNotFoundError
# immediately if NRC_PATH is missing.
nrc_lex = load_nrc_lexicon(NRC_PATH)
# Shared lemmatizer instance reused by sentence_to_nrc_vector for every token.
lemmatizer = WordNetLemmatizer()

def sentence_to_nrc_vector(text: str, lex: Dict[str,List[str]]) -> np.ndarray:
    """
    Build a normalized emotion-frequency vector for `text`.

    The text is lower-cased and tokenized; every token is stripped of
    non-word characters and looked up in `lex`, first by its lemma and then
    by its cleaned surface form. Each matching emotion that appears in
    NRC_EMOTIONS adds one to the corresponding slot. The result has length
    len(NRC_EMOTIONS) and sums to 1 when at least one emotion was counted;
    otherwise it is all zeros.
    """
    counts = np.zeros(len(NRC_EMOTIONS), dtype=float)
    for raw_token in word_tokenize(text.lower()):
        cleaned = re.sub(r'\W+', '', raw_token)
        if not cleaned:
            # Token was pure punctuation.
            continue
        lemma = lemmatizer.lemmatize(cleaned)
        matched = lex.get(lemma) or lex.get(cleaned)
        if not matched:
            continue
        for emotion in matched:
            if emotion in NRC_EMOTIONS:
                counts[NRC_EMOTIONS.index(emotion)] += 1.0

    # Convert raw counts to proportions unless nothing matched.
    total = counts.sum()
    return counts / total if total > 0 else counts

In [None]:
class CounselChatEmotionDataset(Dataset):
    """
    Torch dataset that turns rows of a Hugging Face dataset into model inputs.

    Each item is a dict with:
      - "input_ids" / "attention_mask": unpadded tokenizer output (lists),
      - "nrc_vec": float32 NRC emotion vector for the row's text,
      - "label": the row's label, or -1 when no label column is present.

    Rows are expected to be mapping-like (support `in`, indexing and
    `.items()`), as returned by indexing the wrapped dataset.
    """

    # Column names probed, in priority order, for the input text and the label.
    TEXT_KEYS = ("text", "utterance", "content", "transcript", "dialogue", "message")
    LABEL_KEYS = ("label", "emotion", "emotion_label", "emo")

    def __init__(self, hf_dataset, tokenizer, max_len=256, nrc_lexicon=None):
        """
        hf_dataset: indexable collection of row mappings.
        tokenizer: callable tokenizer returning "input_ids" and "attention_mask".
        max_len: truncation length passed to the tokenizer.
        nrc_lexicon: word -> emotions mapping; when falsy, NRC vectors are all zeros.
        """
        self.ds = hf_dataset
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.nrc = nrc_lexicon

    def __len__(self):
        return len(self.ds)

    def _extract_text(self, row):
        """Return the row's text from the first populated candidate column, else a fallback."""
        for key in self.TEXT_KEYS:
            # Skip columns that exist but hold no value so a later candidate
            # can still be used.
            if key in row and row[key] is not None:
                return row[key]
        # Fallback: stringify the row. Label columns are left out so the target
        # never leaks into the model input, and missing values are dropped
        # instead of contributing the literal string "None".
        return " ".join(
            str(value)
            for key, value in row.items()
            if key not in self.LABEL_KEYS and value is not None
        )

    def _extract_label(self, row):
        """Return the value of the first label column present, or -1 when unlabeled."""
        label = None
        for lab_key in self.LABEL_KEYS:
            if lab_key in row:
                label = row[lab_key]
                break
        # -1 marks an unlabeled row.
        return -1 if label is None else label

    def __getitem__(self, idx):
        row = self.ds[idx]
        text = self._extract_text(row)

        # Lexicon-based emotion features; zeros when no lexicon was supplied.
        nrc_vec = sentence_to_nrc_vector(text, self.nrc) if self.nrc else np.zeros(len(NRC_EMOTIONS), dtype=float)

        # Padding is deferred to the collate function, so only truncate here.
        tok = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_len,
            return_tensors=None,
        )
        label = self._extract_label(row)

        return {
            "input_ids": tok["input_ids"],
            "attention_mask": tok["attention_mask"],
            "nrc_vec": nrc_vec.astype(np.float32),
            # Integer-like labels are normalized to plain int; others pass through.
            "label": int(label) if isinstance(label, (int, np.integer)) else label
        }

In [None]:
class NbertLikeModel(nn.Module):
    """
    Encoder + NRC-feature classifier.

    The first-token embedding from the pretrained encoder is concatenated
    with a linear projection of the NRC emotion vector and passed through a
    two-layer classification head.
    """

    def __init__(self, model_name: str, nrc_dim: int, num_labels: int):
        """
        model_name: checkpoint name handed to AutoModel.from_pretrained.
        nrc_dim: length of the NRC emotion vector.
        num_labels: number of output classes.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size  # typically 768
        self.nrc_dim = nrc_dim
        self.num_labels = num_labels

        # Lift the NRC vector to the encoder width so both halves of the
        # fused representation have the same size.
        self.nrc_proj = nn.Linear(nrc_dim, hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, num_labels)
        )

    def forward(self, input_ids=None, attention_mask=None, nrc_vec=None, labels=None):
        """
        Return {"loss": ..., "logits": ...}.

        `loss` is a cross-entropy loss when `labels` is a 1-D tensor and
        None otherwise. A missing `nrc_vec` is treated as an all-zero vector.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        cls_embedding = encoded.last_hidden_state[:, 0, :]  # [B, hidden]

        if nrc_vec is not None:
            emotion_features = nrc_vec.to(cls_embedding.device).float()
        else:
            emotion_features = torch.zeros(
                (cls_embedding.size(0), self.nrc_dim), device=cls_embedding.device
            )

        # [B, hidden] + [B, hidden] -> [B, hidden*2]
        fused = torch.cat([cls_embedding, self.nrc_proj(emotion_features)], dim=-1)
        logits = self.classifier(fused)

        has_class_indices = labels is not None and labels.dim() == 1
        loss = nn.CrossEntropyLoss()(logits, labels) if has_class_indices else None

        return {"loss": loss, "logits": logits}


In [None]:
pip install -U "transformers>=4.30.0"


Collecting transformers>=4.30.0
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.2
    Uninstalling transformers-4.57.2:
      Successfully uninstalled transformers-4.57.2
Successfully installed transformers-4.57.3


In [None]:
def compute_metrics(pred):
    """
    Compute accuracy and weighted F1 from a prediction object.

    `pred` must expose `label_ids` (gold labels) and `predictions`
    (per-class scores); the predicted class is the argmax over axis 1.
    Returns a dict with the keys "accuracy" and "f1".
    """
    gold = pred.label_ids
    predicted = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="weighted"),
    }

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Loading dataset...", DATASET_NAME)
raw = load_dataset(DATASET_NAME)

# When the dataset ships only a single "train" split, build our own
# train/test split from it.
if "train" in raw and len(raw) == 1:
    print("Dataset has only 'train' split. Creating train/test split...")
    ds = raw["train"].train_test_split(test_size=0.2, seed=SEED)
else:
    ds = raw

# Build the train and evaluation datasets. Prefer a "validation" split when
# the dataset has one and fall back to "test" otherwise; picking the split
# name first avoids indexing a "test" split that may not exist.
train_ds = CounselChatEmotionDataset(ds["train"], tokenizer, max_len=MAX_LEN, nrc_lexicon=nrc_lex)
eval_split = "validation" if "validation" in ds else "test"
val_ds   = CounselChatEmotionDataset(ds[eval_split], tokenizer, max_len=MAX_LEN, nrc_lexicon=nrc_lex)

# ---------------------------
# Determine the number of labels
# ---------------------------
num_labels = 2  # default when nothing can be inferred
try:
    hf_features = ds["train"].features
    if "label" in hf_features:
        lab = hf_features["label"]
        if hasattr(lab, "num_classes"):
            num_labels = int(lab.num_classes)
        else:
            # No class metadata on the feature: count distinct label values.
            labels = ds["train"]["label"]
            num_labels = len(set(labels))
except Exception as exc:
    # Best-effort inference: keep the default, but say why it was used
    # instead of failing silently.
    print("Could not infer number of labels, keeping default", num_labels, "-", repr(exc))

print("Num labels inferred:", num_labels)

# ---------------------------
# Model
# ---------------------------
model = NbertLikeModel(MODEL_NAME, nrc_dim=len(NRC_EMOTIONS), num_labels=num_labels)

# ---------------------------
# Collate Function
# ---------------------------
def collate_fn(batch):
    """
    Collate a list of dataset items into one padded batch.

    Token ids and attention masks are padded via the module-level
    `tokenizer`; the per-item NRC vectors are stacked into a float32 tensor
    under "nrc_vec" and the labels become a long tensor under "labels".
    """
    token_fields = {
        "input_ids": [item["input_ids"] for item in batch],
        "attention_mask": [item["attention_mask"] for item in batch],
    }
    stacked_nrc = np.stack([item["nrc_vec"] for item in batch])
    targets = [item["label"] for item in batch]

    padded = tokenizer.pad(token_fields, return_tensors="pt")
    padded["nrc_vec"] = torch.tensor(stacked_nrc, dtype=torch.float32)
    padded["labels"] = torch.tensor(targets, dtype=torch.long)
    return padded

# ---------------------------
# Dataset Wrapper
# ---------------------------
class HFDatasetWrapper(torch.utils.data.Dataset):
    """Thin torch Dataset that delegates length and indexing to `ds_obj`."""

    def __init__(self, ds_obj):
        # Any object supporting len() and integer indexing.
        self.ds_obj = ds_obj

    def __len__(self):
        wrapped = self.ds_obj
        return len(wrapped)

    def __getitem__(self, idx):
        wrapped = self.ds_obj
        return wrapped[idx]

import inspect

train_wrapper = HFDatasetWrapper(train_ds)
val_wrapper   = HFDatasetWrapper(val_ds)

# ---------------------------
# Training Arguments
# ---------------------------
# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and recent versions reject
# the old keyword. Pick whichever name the installed version accepts so this
# cell works across versions.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluation and checkpointing both run once per epoch; they must match
    # for load_best_model_at_end to work.
    save_strategy="epoch",
    learning_rate=LR,
    weight_decay=0.04,
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    **{_eval_strategy_arg: "epoch"},
)

# ---------------------------
# Custom Trainer
# ---------------------------
class SimpleTrainer(Trainer):
    """Trainer that forwards the extra `nrc_vec` input and reads the loss from the model's output dict."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Run the model on one batch and return its loss.

        `num_items_in_batch` is accepted (and ignored) because newer
        transformers Trainer versions pass it as a keyword argument; without
        it the override fails with a TypeError on those versions.

        Returns `(loss, outputs)` when `return_outputs` is true, else `loss`.
        """
        # Note: this removes "labels" and "nrc_vec" from `inputs` in place.
        labels = inputs.pop("labels")
        nrc_vec = inputs.pop("nrc_vec")
        outputs = model(**inputs, nrc_vec=nrc_vec, labels=labels)
        loss = outputs["loss"]
        return (loss, outputs) if return_outputs else loss

trainer = SimpleTrainer(
    model=model,
    args=training_args,
    train_dataset=train_wrapper,
    eval_dataset=val_wrapper,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    # Required: the training arguments select the best checkpoint by "f1",
    # which only exists in the evaluation metrics if compute_metrics
    # reports it.
    compute_metrics=compute_metrics,
)

# ---------------------------
# Train
# ---------------------------
trainer.train()

# ---------------------------
# Evaluation
# ---------------------------
print("Evaluating...")
preds_output = trainer.predict(val_wrapper)
logits = preds_output.predictions
y_pred = np.argmax(logits, axis=1)
y_true = preds_output.label_ids

print("Classification report:")
try:
    print(classification_report(y_true, y_pred, digits=4))
except Exception as e:
    # The report is informational only; do not let it block saving the model.
    print("Could not print classification report:", e)

# ---------------------------
# Save
# ---------------------------
trainer.save_model(OUTPUT_DIR)
print("Done. Model saved to", OUTPUT_DIR)
