# Emotion Classification

In [1]:
!pip install -q datasets wandb

In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer, TrainingArguments
    )
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import wandb
import torch
from huggingface_hub import login

In [3]:
# Authenticate the Weights & Biases CLI so the training run can be logged.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33m6633026621[0m ([33m6633026621-chulalongkorn-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Authenticate with the Hugging Face Hub (required before pushing the model).
# The bare name `login` only echoed the function object; it has to be called.
login()

In [5]:
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
# Fall back to CPU when no GPU is present instead of failing on a hard-coded 'cuda'.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=6: one output per emotion class; the classification head is newly initialised.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Emotion dataset hosted on the Hugging Face Hub; it ships 'train' and 'test' splits.
DATASET_ID = 'KittiphopKhankaew/aina-emotion-classification'
dataset = load_dataset(path=DATASET_ID)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 1040
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 260
    })
})

In [7]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens so that all
    rows end up with the same length.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)

# Run the tokenizer over every split in batches; the encoded columns are
# added next to the original ones.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/260 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1040
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 260
    })
})

In [8]:
# The dataset's 'test' split doubles as the validation set during training.
train_dataset, val_dataset = tokenized_dataset['train'], tokenized_dataset['test']

In [9]:
# Start a Weights & Biases run; the Trainer reports to it via report_to="wandb".
wandb.init(project="aina")

# Evaluate, log and checkpoint on the same cadence. Previously evaluation ran
# every 50 steps while checkpoints were only written at the default save
# interval, so load_best_model_at_end could only restore from a small subset
# of the evaluated steps.
EVAL_EVERY_N_STEPS = 50

training_args = TrainingArguments(
    output_dir="./KittiphopKhankaew/Aina-emotion-classification-WangChanBERTa",
    eval_strategy="steps",
    eval_steps=EVAL_EVERY_N_STEPS,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50, # Number of epochs
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=EVAL_EVERY_N_STEPS,
    save_strategy="steps",
    save_steps=EVAL_EVERY_N_STEPS,  # every evaluated step gets a checkpoint
    save_total_limit=2,  # cap the number of checkpoints kept on disk
    report_to="wandb",
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # key returned by compute_metrics
    gradient_accumulation_steps=2,  # effective batch size: 16 * 2 per device
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    seed=42
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores (or a tuple whose first item does) and ``labels``
            the true class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``; the last three are weighted averages over the classes.
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple of outputs; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = predictions.argmax(axis=-1)
    # zero_division=0 yields the same score sklearn falls back to by default,
    # but without a warning whenever a class is never predicted.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, **weighted),
        "recall": recall_score(labels, preds, **weighted),
        "f1": f1_score(labels, preds, **weighted)
    }

# Wire the model, data splits and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in recent transformers releases;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune; returns a TrainOutput with the global step count and training loss.
trainer.train()


[34m[1mwandb[0m: Currently logged in as: [33m6633026621[0m ([33m6633026621-chulalongkorn-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,1.6886,1.452932,0.388462,0.464852,0.388462,0.304433
100,1.3569,0.934803,0.796154,0.815725,0.796154,0.790008
150,0.7753,0.358559,0.896154,0.900044,0.896154,0.89442
200,0.3148,0.140748,0.953846,0.953884,0.953846,0.953643
250,0.147,0.130564,0.965385,0.965872,0.965385,0.964937
300,0.0831,0.13349,0.965385,0.965762,0.965385,0.965049
350,0.0426,0.13593,0.965385,0.965407,0.965385,0.965095
400,0.0227,0.129597,0.969231,0.9692,0.969231,0.969006
450,0.0272,0.128039,0.973077,0.973012,0.973077,0.97282
500,0.0094,0.133311,0.973077,0.973012,0.973077,0.97282


TrainOutput(global_step=1600, training_loss=0.15462937990203499, metrics={'train_runtime': 413.8083, 'train_samples_per_second': 125.662, 'train_steps_per_second': 3.867, 'total_flos': 3317423324921856.0, 'train_loss': 0.15462937990203499, 'epoch': 48.49230769230769})

In [10]:
# Upload the trained model files to the Hub; the string is the commit message.
trainer.push_to_hub(commit_message="Update KittiphopKhankaew/Aina-emotion-classification-WangChanBERTa")

model.safetensors:   0%|          | 0.00/421M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/KittiphopKhankaew/Aina-emotion-classification-WangChanBERTa/commit/8a5ac9501c01f595d095f12d12e0e330403f8eb3', commit_message='Update KittiphopKhankaew/Aina-emotion-classification-WangChanBERTa', commit_description='', oid='8a5ac9501c01f595d095f12d12e0e330403f8eb3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KittiphopKhankaew/Aina-emotion-classification-WangChanBERTa', endpoint='https://huggingface.co', repo_type='model', repo_id='KittiphopKhankaew/Aina-emotion-classification-WangChanBERTa'), pr_revision=None, pr_num=None)

# Inference

In [11]:
# Reload the fine-tuned checkpoint and its tokenizer from the Hub for inference.
hub_repo = "KittiphopKhankaew/Aina-emotion-classification-WangChanBERTa"
tokenizer = AutoTokenizer.from_pretrained(hub_repo)
model = AutoModelForSequenceClassification.from_pretrained(hub_repo)

In [16]:
# Input text
text = "สวัสดีค่ะ ฉันชื่อว่าไอนะ ยินดีที่ได้รู้จักนะครับ"

# Tokenize input and put the tensors on the same device as the model
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)

# Perform inference: eval mode, and no autograd graph since nothing is trained here
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()

# Map class to emotion
label_mapping = {0: "Idle/Happy", 1: "Smirk", 2: "Shocked/Surprised", 3: "Sad", 4: "Disgust", 5: "Angry/Mad"}
predicted_emotion = label_mapping[predicted_class]

# Output prediction
print(f"Predicted class: {predicted_class}")
print(f"Predicted emotion: {predicted_emotion}")


Predicted class: 0
Predicted emotion: Idle/Happy
