<a href="https://colab.research.google.com/github/REZAKAZAZI00/Assessment_Backend/blob/master/nbert12_06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade transformers accelerate



In [2]:
!pip install -q datasets transformers accelerate evaluate nrclex nltk


In [3]:
import numpy as np
import pandas as pd
from datasets import load_dataset

import torch
from torch import nn

from transformers import (
    AutoTokenizer,
    BertModel,
    BertPreTrainedModel,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
import evaluate

from nrclex import NRCLex
import nltk

# Fetch the NLTK resources this notebook relies on for text cleaning:
# stop words (stopwords.words) and WordNet (WordNetLemmatizer).
# NOTE(review): "punkt" / "punkt_tab" are presumably the tokenizer data behind
# word_tokenize; which of the two is required depends on the NLTK version.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Load the CounselChat dataset from the Hugging Face Hub.
dataset = load_dataset("nbertagnolli/counsel-chat")

# Only the "train" split is read here; it is re-split into train/val/test
# further down. (The former bare `dataset` line was removed: an expression in
# the middle of a cell is not displayed, so it had no effect.)
full_ds = dataset["train"]
len(full_ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Repo card metadata block was not found. Setting CardData to empty.


2775

In [5]:
# Two-stage split: 70% train, and a temporary 30% hold-out that is then cut
# in half into validation (15%) and test (15%).
SPLIT_SEED = 42

temp = full_ds.train_test_split(test_size=0.3, seed=SPLIT_SEED)
temp_2 = temp["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

train_ds, val_ds, test_ds = temp["train"], temp_2["train"], temp_2["test"]

len(train_ds), len(val_ds), len(test_ds)


(1942, 416, 417)

In [6]:
# Class names used throughout the notebook; a label's position in this list is
# its integer class id.
EMOTION_LABELS = [
    # emotion categories
    "anger", "fear", "anticipation", "trust",
    "surprise", "sadness", "joy", "disgust",
    # sentiment polarities
    "negative", "positive",
]

# id -> name and name -> id lookups, both in EMOTION_LABELS order.
id2label = dict(enumerate(EMOTION_LABELS))
label2id = {name: idx for idx, name in id2label.items()}


In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop words, held in a set so membership tests in clean_text are cheap.
stop_words = {*stopwords.words("english")}
# One lemmatizer instance shared by every clean_text call.
lemmatizer = WordNetLemmatizer()

def clean_text(text: str) -> str:
    """Normalize raw text into a space-separated string of lemmas.

    The text is stripped, lower-cased and tokenized; tokens that are not purely
    alphabetic or that are English stop words are dropped, and the remaining
    tokens are lemmatized.

    Args:
        text: Raw input text. ``None`` is accepted; other values are passed
            through ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text`` is
        ``None`` or nothing survives the filtering.
    """
    if text is None:
        return ""

    lowered = str(text).strip().lower()

    lemmas = []
    for token in word_tokenize(lowered):
        # Skip punctuation/numbers/mixed tokens and stop words.
        if not token.isalpha() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)


def nrc_features_and_label(text: str):
    """Compute NRC lexicon features and the dominant emotion for a text.

    The text is cleaned with ``clean_text`` and scored with ``NRCLex``; the raw
    per-category counts are read in ``EMOTION_LABELS`` order.

    Args:
        text: Raw input text.

    Returns:
        A ``(vec, label)`` tuple where

        * ``vec`` is a list with one float per entry of ``EMOTION_LABELS``: the
          category counts divided by their sum, or all zeros when nothing
          matched;
        * ``label`` is the name of the category with the highest count (the
          first one in ``EMOTION_LABELS`` order on ties), or ``None`` when the
          cleaned text is empty or no category was matched.
    """
    n_labels = len(EMOTION_LABELS)

    cleaned = clean_text(text)
    if not cleaned:
        return [0.0] * n_labels, None

    # dict: emotion name -> number of matching words
    scores = NRCLex(cleaned).raw_emotion_scores
    counts = [scores.get(name, 0) for name in EMOTION_LABELS]

    total = sum(counts)
    if total == 0:
        return [0.0] * n_labels, None

    # Normalize the counts so the feature vector sums to 1.
    vec = [count / total for count in counts]
    dominant_label = EMOTION_LABELS[int(np.argmax(counts))]
    return vec, dominant_label

In [8]:
def add_lexicon_and_label(example):
    """Attach lexicon features and an emotion label to one dataset row.

    Reads ``example["answerText"]`` and writes two new fields on the same
    mapping: ``"lexicon_feats"`` (the normalized lexicon vector) and
    ``"emotion_label"`` (the dominant emotion name, or the placeholder
    ``"no_emotion"`` when no emotion was detected).

    Returns:
        The same ``example`` object, updated in place.
    """
    feats, dominant = nrc_features_and_label(example["answerText"])
    example["lexicon_feats"] = feats
    example["emotion_label"] = "no_emotion" if dominant is None else dominant
    return example

# Annotate every split with lexicon features and an emotion label.
train_labeled, val_labeled, test_labeled = (
    split.map(add_lexicon_and_label)
    for split in (train_ds, val_ds, test_ds)
)


In [9]:
def filter_no_emotion(example):
    """Return True for rows whose ``"emotion_label"`` is a real emotion.

    Rows tagged with the ``"no_emotion"`` placeholder yield False so that
    ``Dataset.filter`` drops them.
    """
    label = example["emotion_label"]
    return not (label == "no_emotion")

# Drop the rows for which no emotion could be assigned, in every split.
train_labeled, val_labeled, test_labeled = (
    split.filter(filter_no_emotion)
    for split in (train_labeled, val_labeled, test_labeled)
)

len(train_labeled), len(val_labeled), len(test_labeled)


(1916, 414, 410)

In [10]:
# Checkpoint used for both the tokenizer and the model.
MODEL_NAME = "bert-base-uncased"
# Sequence length every example is padded/truncated to (can be tuned later).
MAX_LENGTH = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [11]:
def preprocess_function(examples):
    """Tokenize a batch of rows and attach labels and lexicon features.

    Args:
        examples: Batched mapping with the columns ``"answerText"``,
            ``"emotion_label"`` and ``"lexicon_feats"``.

    Returns:
        The tokenizer output (padded/truncated to ``MAX_LENGTH``) extended
        with ``"labels"`` (integer class ids looked up in ``label2id``) and
        ``"lexicon_feats"`` (copied through unchanged).
    """
    # Only the answer text is encoded. To feed question and answer together,
    # build the inputs like this instead:
    # texts = [q + " [SEP] " + a for q, a in zip(examples["questionText"], examples["answerText"])]
    encoded = tokenizer(
        examples["answerText"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

    # Emotion names -> integer class ids.
    encoded["labels"] = list(map(label2id.__getitem__, examples["emotion_label"]))

    # Lexicon feature vectors are passed through as an extra model input.
    encoded["lexicon_feats"] = examples["lexicon_feats"]

    return encoded

# Tokenize each split in batches, keeping only the columns produced by
# preprocess_function (all original columns are removed).
train_proc, val_proc, test_proc = (
    split.map(preprocess_function, batched=True, remove_columns=split.column_names)
    for split in (train_labeled, val_labeled, test_labeled)
)


Map:   0%|          | 0/410 [00:00<?, ? examples/s]

In [12]:
# Columns exposed to the model as torch tensors.
TORCH_COLUMNS = ["input_ids", "attention_mask", "labels", "lexicon_feats"]

for split in (train_proc, val_proc, test_proc):
    split.set_format(type="torch", columns=list(TORCH_COLUMNS))


In [22]:
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.modeling_outputs import SequenceClassifierOutput

class NBertForSequenceClassification(BertPreTrainedModel):
    """BERT sequence classifier that also consumes a lexicon feature vector.

    The lexicon vector (one value per entry of ``EMOTION_LABELS``) is linearly
    projected to the hidden size and added to BERT's pooled representation
    before dropout and the classification layer.

    NOTE(review): in this notebook the training label is the argmax of the same
    lexicon counts that ``lexicon_feats`` is derived from, so the label is
    recoverable from this input alone — confirm this is intended.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder and dropout.
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Lexicon vector -> hidden size, then hidden size -> class logits.
        self.lexicon_proj = nn.Linear(len(EMOTION_LABELS), config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        lexicon_feats=None,
        labels=None,
        num_items_in_batch=None,   # accepted and ignored; the original author notes adding it fixed a training error
        **kwargs                   # any other extra arguments are swallowed for forward compatibility
    ):
        """Run the encoder, fuse the lexicon features and classify.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded to the BERT
                encoder.
            lexicon_feats: Optional lexicon feature tensor; when given it is
                cast to the encoder output dtype, projected and added to the
                pooled representation.
            labels: Optional class ids; when given a cross-entropy loss is
                computed.
            num_items_in_batch, **kwargs: Ignored; never passed to the encoder.

        Returns:
            ``SequenceClassifierOutput`` with ``logits`` and, if ``labels`` was
            provided, ``loss`` (otherwise ``None``).
        """
        # Only the known arguments reach BertModel, never the extra kwargs.
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Prefer the pooler output; fall back to the first ([CLS]) position.
        sentence_repr = encoder_out.pooler_output
        if sentence_repr is None:
            sentence_repr = encoder_out.last_hidden_state[:, 0]

        # Fuse the lexicon features by addition in hidden space.
        if lexicon_feats is not None:
            projected = self.lexicon_proj(lexicon_feats.to(sentence_repr.dtype))
            sentence_repr = sentence_repr + projected

        logits = self.classifier(self.dropout(sentence_repr))

        if labels is None:
            loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits)


In [14]:
# Load the four evaluation metrics used by compute_metrics.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Compute accuracy and support-weighted precision/recall/F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair as passed by ``Trainer``;
            predictions are the argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]

    # zero_division=0 yields the same value as the default ("warn" also scores
    # an undefined class as 0) but without emitting an "ill-defined" warning on
    # every evaluation when some class is never predicted.
    prec = precision_metric.compute(
        predictions=preds, references=labels, average="weighted", zero_division=0
    )["precision"]
    # Note: support-weighted recall is mathematically equal to accuracy.
    rec = recall_metric.compute(
        predictions=preds, references=labels, average="weighted", zero_division=0
    )["recall"]
    f1 = f1_metric.compute(predictions=preds, references=labels, average="weighted")["f1"]

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
    }


In [20]:
# When to evaluate, checkpoint and log.
schedule_kwargs = dict(
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
)

# Optimisation hyper-parameters.
optim_kwargs = dict(
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.04,
)

# After training, reload the checkpoint with the highest "f1".
selection_kwargs = dict(
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

training_args = TrainingArguments(
    output_dir="nbert-counsel-chat",
    report_to="none",  # no external experiment trackers (avoids the WandB prompt)
    **schedule_kwargs,
    **optim_kwargs,
    **selection_kwargs,
)


In [18]:
import os
# Disable Weights & Biases through the environment as well; the
# TrainingArguments in this notebook already pass report_to="none".
os.environ.update({"WANDB_DISABLED": "true"})


In [23]:
# Seed torch before building the model: the classifier and lexicon_proj weights
# are not in the checkpoint and are randomly initialised here, while Trainer
# only applies training_args.seed when it is constructed (i.e. afterwards).
torch.manual_seed(training_args.seed)

# Load pretrained BERT weights into the custom model; the label maps are stored
# in the config so saved checkpoints carry readable class names.
model = NBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(EMOTION_LABELS),
    id2label=id2label,
    label2id=label2id,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_proc,
    eval_dataset=val_proc,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    # Inputs are already padded to MAX_LENGTH, so the default collator suffices.
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of NBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'lexicon_proj.bias', 'lexicon_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1237,1.06767,0.671498,0.528804,0.671498,0.558414
2,0.8925,0.991161,0.683575,0.602365,0.683575,0.627136
3,0.6895,0.941775,0.710145,0.623745,0.710145,0.649726
4,0.4885,0.997409,0.71256,0.637064,0.71256,0.661557
5,0.3818,1.02659,0.698068,0.625219,0.698068,0.650924


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=600, training_loss=0.7554169686635336, metrics={'train_runtime': 536.9018, 'train_samples_per_second': 17.843, 'train_steps_per_second': 1.118, 'total_flos': 1260516792422400.0, 'train_loss': 0.7554169686635336, 'epoch': 5.0})

In [24]:
# Score the trained model on the held-out test split and display the metrics.
test_results = trainer.evaluate(eval_dataset=test_proc)
test_results

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.022717833518982,
 'eval_accuracy': 0.7048780487804878,
 'eval_precision': 0.6557932494505563,
 'eval_recall': 0.7048780487804878,
 'eval_f1': 0.6603972066235022,
 'eval_runtime': 5.6518,
 'eval_samples_per_second': 72.543,
 'eval_steps_per_second': 2.3,
 'epoch': 5.0}