In [4]:
# !pip install torch transformers datasets evaluate scikit-learn
!pip install onnx onnxruntime optimum evaluate



In [5]:
CORE_LABELS = ["positive", "negative", "surprise", "neutral"]

In [6]:
LABELS = ["positive", "negative", "surprise", "neutral"]
LABEL2ID = {k: v for v, k in enumerate(LABELS)}
ID2LABEL = {i: l for l, i in LABEL2ID.items()}

In [7]:
LABEL2ID = {
    "positive": 0,
    "negative": 1,
    "surprise": 2,
    "neutral": 3
}

GO_MAP = {
    "joy": "positive",
    "love": "positive",
    "gratitude": "positive",
    "admiration": "positive",
    "approval": "positive",
    "optimism": "positive",
    "pride": "positive",
    "relief": "positive",
    "excitement": "positive",
    "amusement": "positive",
    "caring": "positive",
    "desire": "positive",

    "anger": "negative",
    "annoyance": "negative",
    "disappointment": "negative",
    "disapproval": "negative",
    "disgust": "negative",
    "embarrassment": "negative",
    "fear": "negative",
    "grief": "negative",
    "nervousness": "negative",
    "remorse": "negative",
    "sadness": "negative",
    "confusion": "negative",

    "surprise": "surprise",
    "curiosity": "surprise",
    "realization": "surprise",

    "neutral": "neutral"
}

In [8]:
VIGO_MAP = {
    # positive
    0:"positive", 1:"positive", 2:"positive", 3:"positive",
    4:"positive", 5:"positive", 6:"positive", 7:"positive",
    8:"positive", 9:"positive", 10:"positive", 11:"positive",

    # surprise group
    12:"surprise", 13:"surprise", 14:"surprise",

    # negative
    15:"negative", 16:"negative", 17:"negative", 18:"negative",
    19:"negative", 20:"negative", 21:"negative", 22:"negative",
    23:"negative", 24:"negative", 25:"negative", 26:"negative",

    # neutral
    27:"neutral"
}

In [9]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

In [10]:
from huggingface_hub import login
login(HF_TOKEN)

In [11]:
from datasets import load_dataset

go = load_dataset("mrm8488/goemotions")
vi = load_dataset("sonlam1102/vigoemotions", token=True)

README.md: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


goemotions.csv:   0%|          | 0.00/42.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211225 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/922 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/1.75M [00:00<?, ?B/s]

val.csv:   0%|          | 0.00/219k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/227k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16531 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2067 [00:00<?, ? examples/s]

In [12]:
GO_EMOTION_COLUMNS = [
    'admiration','amusement','anger','annoyance','approval','caring',
    'confusion','curiosity','desire','disappointment','disapproval',
    'disgust','embarrassment','excitement','fear','gratitude','grief',
    'joy','love','nervousness','optimism','pride','realization','relief',
    'remorse','sadness','surprise','neutral'
]

def convert_go(example):
    mapped = []

    for emotion in GO_EMOTION_COLUMNS:
        if example[emotion] == 1:
            mapped.append(GO_MAP[emotion])

    # Collapse multi-label
    if "negative" in mapped:
        final = "negative"
    elif "positive" in mapped:
        final = "positive"
    elif "surprise" in mapped:
        final = "surprise"
    else:
        final = "neutral"

    return {
        "text": example["text"],
        "label": LABEL2ID[final]
    }

In [13]:
go_columns = go["train"].column_names

for split in go.keys():
    go[split] = go[split].map(
        convert_go,
        remove_columns=go_columns
    )
    go[split] = go[split].filter(lambda x: x is not None)

Map:   0%|          | 0/211225 [00:00<?, ? examples/s]

Filter:   0%|          | 0/211225 [00:00<?, ? examples/s]

In [14]:
# for split in go.keys():
#     go[split] = go[split].map(
#         convert_go,
#         remove_columns=go[split].column_names
#     )

In [15]:
print(go["train"].column_names)
print(go["train"][0])

['text', 'label']
{'text': 'That game hurt.', 'label': 1}


In [16]:
import ast

def convert_vi(example):
    label_list = ast.literal_eval(example["labels"])  # convert "[12]" → [12]

    mapped = []

    for idx in label_list:
        mapped.append(VIGO_MAP[idx])

    # collapse multi-label
    if "negative" in mapped:
        final = "negative"
    elif "positive" in mapped:
        final = "positive"
    elif "surprise" in mapped:
        final = "surprise"
    else:
        final = "neutral"

    return {
        "text": example["text"],
        "label": LABEL2ID[final]
    }

In [17]:
for split in vi.keys():
    vi[split] = vi[split].map(
        convert_vi,
        remove_columns=vi[split].column_names
    )

Map:   0%|          | 0/16531 [00:00<?, ? examples/s]

Map:   0%|          | 0/2066 [00:00<?, ? examples/s]

Map:   0%|          | 0/2067 [00:00<?, ? examples/s]

In [18]:
print("Go: ", go["train"].features)
print("Vi: ", vi["train"].features)

Go:  {'text': Value('string'), 'label': Value('int64')}
Vi:  {'text': Value('string'), 'label': Value('int64')}


In [19]:
from collections import Counter

print("Go:", Counter(go["train"]["label"]))
print("Vi:", Counter(vi["train"]["label"]))

Go: Counter({0: 76245, 1: 61448, 3: 58709, 2: 14823})
Vi: Counter({1: 8768, 0: 6505, 2: 688, 3: 570})


In [20]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "microsoft/Multilingual-MiniLM-L12-H384"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(CORE_LABELS),
    id2label=ID2LABEL,
    label2id=LABEL2ID
)

config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: microsoft/Multilingual-MiniLM-L12-H384
Key               | Status  | 
------------------+---------+-
classifier.weight | MISSING | 
classifier.bias   | MISSING | 

[3mNotes:
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [21]:
def tokenize(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=64
    )

go = go.map(tokenize, batched=True)
vi = vi.map(tokenize, batched=True)

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

Map:   0%|          | 0/211225 [00:00<?, ? examples/s]

Map:   0%|          | 0/16531 [00:00<?, ? examples/s]

Map:   0%|          | 0/2066 [00:00<?, ? examples/s]

Map:   0%|          | 0/2067 [00:00<?, ? examples/s]

In [22]:
import transformers
print(transformers.__version__)

5.2.0


In [23]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./emotion_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True
)

In [24]:
print(go.keys())

dict_keys(['train'])


In [25]:
go = go["train"].train_test_split(test_size=0.1, seed=42)

print(go.keys())

dict_keys(['train', 'test'])


Phase 1: GoEmotions

In [26]:
from sklearn.metrics import f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    return {
        "accuracy": (preds == labels).mean(),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }

In [None]:
from transformers import Trainer
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": metric.compute(predictions=preds, references=labels)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=go["train"],
    eval_dataset=go["test"],
    compute_metrics=compute_metrics
)

trainer.train()

Downloading builder script: 0.00B [00:00, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.945507,0.923651,{'accuracy': 0.6125550347961938}


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Phase 2: ViGoEmotions

In [None]:
trainer.train_dataset = vi["train"]
trainer.eval_dataset = vi["validation"]

trainer.train()

In [None]:
from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

model.save_pretrained("./emotion_model")
tokenizer.save_pretrained("./emotion_model")

ort_model = ORTModelForSequenceClassification.from_pretrained(
    "./emotion_model",
    export=True
)

ort_model.save_pretrained("./emotion_model_onnx")

quantizer = ORTQuantizer.from_pretrained("./emotion_model_onnx")

qconfig = AutoQuantizationConfig.avx512_vnni(
    is_static=False,
    per_channel=False
)

quantizer.quantize(
    save_dir="./emotion_model_int8",
    quantization_config=qconfig
)

In [None]:
from optimum.onnxruntime import ORTModelForSequenceClassification
import torch

MODEL_PATH = "./emotion_model_int8"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = ORTModelForSequenceClassification.from_pretrained(MODEL_PATH)

def predict(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=64
    )

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    pred = torch.argmax(logits, dim=-1).item()

    return ID2LABEL[pred]


# TEST
samples = [
    "Cảm ơn bạn nhiều nhé!",
    "Mình thấy buồn quá.",
    "Sao mãi không được vậy?",
    "Ủa cái này là sao ta?",
    "Ok vậy cũng được."
]

for s in samples:
    print(s, "→", predict(s))