# Dataset Card for "emotion"

## Dataset Summary

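Emotion is a dataset of English Twitter messages, each labeled with one of six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information, please refer to the paper (Saravia et al., 2018, "CARER: Contextualized Affect Representations for Emotion Recognition").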

### Install & Import

In [None]:
!pip install -q datasets bitsandbytes transformers[adapters]

[0m

In [None]:
from datasets import load_dataset

# Load the dair-ai/emotion dataset (train / validation / test splits)
dataset = load_dataset("dair-ai/emotion")

# Show the split layout first, then one raw training record
for view in (dataset, dataset["train"][0]):
    print(view)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})
{'text': 'i didnt feel humiliated', 'label': 0}


In [None]:
# Display the training split's summary (column names and row count)
dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

### Label Mapping

In [None]:
# The "label" feature exposes the class names; a name's list position is its integer label id
label_feature = dataset["train"].features["label"]
label_names = label_feature.names
print(label_names)

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


### Convert to Instruction-Style for Fine-Tuning

In [None]:
def convert_to_alpaca(example):
    """Build the Alpaca-style fields (instruction / input / output) for one record.

    `example` must provide "text" and an integer "label"; the label is looked up
    in the module-level `label_names` list to obtain the emotion name.
    """
    alpaca_fields = {"instruction": "Detect the emotion in the text"}
    alpaca_fields["input"] = example["text"]
    alpaca_fields["output"] = label_names[example["label"]]
    return alpaca_fields

In [None]:
# Apply the Alpaca conversion to every split, in train / validation / test order
train_ds, val_ds, test_ds = (
    dataset[split_name].map(convert_to_alpaca)
    for split_name in ("train", "validation", "test")
)


In [None]:
# Sanity check: the mapped record keeps the original columns and gains the three Alpaca fields
print(train_ds[0])

{'text': 'i didnt feel humiliated', 'label': 0, 'instruction': 'Detect the emotion in the text', 'input': 'i didnt feel humiliated', 'output': 'sadness'}


In [None]:
# Turn rows into a single training text field for feeding to llm


def format_row(ex):
    """Render one Alpaca-style record as a single prompt + answer training string.

    Returns a dict with only a "text" key, so mapping it over a dataset overwrites
    that dataset's "text" column. The option list comes from the module-level
    `label_names`.
    """
    options = ", ".join(label_names)
    rendered = (
        "<|user|>\n"
        f"{ex['instruction']}\n\n"
        f"Text: {ex['input']}\n"
        f"Options: {options}\n"
        "<|assistant|>\n"
        f"{ex['output']}"
    )
    return {"text": rendered}

# Produce the single-string "text" column for each split
train_text, val_text, test_text = (
    split_ds.map(format_row) for split_ds in (train_ds, val_ds, test_ds)
)

# Preview the first formatted training example (capped at 400 characters)
train_text[0]["text"][:400]


'<|user|>\nDetect the emotion in the text\n\nText: i didnt feel humiliated\nOptions: sadness, joy, love, anger, fear, surprise\n<|assistant|>\nsadness'

### Save to JSONL (Optional, for Colab file upload)

In [None]:
import json

# Write the Alpaca-style train and validation splits as JSON Lines (one record per line).
# The /content paths are the ones this Colab-oriented notebook uses.
for out_path, split_ds in (
    ("/content/train.jsonl", train_ds),
    ("/content/val.jsonl", val_ds),
):
    with open(out_path, "w") as f:
        f.writelines(json.dumps(row) + "\n" for row in split_ds)


### Import model and LoRA libraries

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig


### Config: base model

In [None]:
import os
# Hugging Face Hub id of the base checkpoint loaded by the tokenizer and model cells
BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"
# Token cap applied when the inference helper tokenizes a prompt
MAX_SEQ_LEN = 512


### Load 4-bit base model + tokenizer and attach LoRA

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# 4-bit quantization settings used when loading the base model.
# Compute in bfloat16 only where the GPU supports it and fall back to float16 otherwise,
# mirroring how the training arguments choose between bf16 and fp16.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,                     # 4-bit weights so the model fits a free Colab GPU
    bnb_4bit_use_double_quant=True,        # extra memory saving
    bnb_4bit_quant_type="nf4",             # recommended 4-bit format
    bnb_4bit_compute_dtype=compute_dtype,  # dtype used for computation on the quantized weights
)

In [None]:
# Load tokenizer

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Fall back to EOS as the pad token when the checkpoint defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the right unconditionally; previously this only ran when the pad token was missing
tokenizer.padding_side = "right"


### Train with TRL SFTTrainer (efficient instruction tuning)

In [None]:
!pip install trl



In [None]:
!pip install bitsandbytes



In [None]:
from trl import SFTTrainer, SFTConfig

In [None]:
# LoRA adapter configuration
lora_cfg = LoraConfig(
    r=16,                    # rank of the adapter matrices
    lora_alpha=32,           # scaling factor
    lora_dropout=0.05,       # dropout on the adapter path, to limit overfitting
    bias="none",             # no bias parameters are trained
    task_type="CAUSAL_LM",   # the base model is a causal language model
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # feed-forward projections
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [None]:
# Load the base checkpoint with the 4-bit quantization settings from `bnb_cfg`
load_kwargs = {
    "quantization_config": bnb_cfg,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **load_kwargs)


In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig

# Supervised fine-tuning on the pre-formatted "text" column; the LoRA settings are
# handed to SFTTrainer through `peft_config`.
use_bf16 = torch.cuda.is_bf16_supported()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_text,
    eval_dataset=val_text,
    peft_config=lora_cfg,
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,   # 2 x 4 = 8 examples per optimizer step per device
        warmup_steps=5,
        max_steps=50,                    # short demonstration run, far less than one epoch
        learning_rate=2e-4,
        fp16=not use_bf16,               # exactly one mixed-precision mode is enabled
        bf16=use_bf16,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        dataset_text_field="text",
        # eval_steps only takes effect with a step-based evaluation strategy; without it
        # the eval_dataset above is never evaluated. Each evaluation runs the whole
        # validation split, so expect noticeably longer training time.
        eval_strategy="steps",
        eval_steps=6,
    ),
)



In [None]:
# Run the fine-tuning loop configured on `trainer`
trainer.train()

# Save the trained model to the directory the inference section later reads the adapter from
trainer.save_model("./qlora-finetuned-emotion")

Step,Training Loss
1,3.9102
2,3.9091
3,3.855
4,3.1481
5,2.7644
6,2.4496
7,2.2979
8,1.3738
9,1.5851
10,1.317


# Interface for testing

In [None]:
from peft import PeftModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Define base model and max sequence length (already defined, but good to have here for clarity)
BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"
MAX_SEQ_LEN = 512

# Directory the adapter was saved to by trainer.save_model()
peft_model_id = "./qlora-finetuned-emotion"

# Reuse the fine-tuned PEFT model the trainer already holds. SFTTrainer injected the
# LoRA layers into `model` in place, so calling PeftModel.from_pretrained(model, ...)
# here would apply the adapter to an already-adapted model (and again on every re-run
# of this cell). In a fresh session, load the base model first and then call
# PeftModel.from_pretrained(base_model, peft_model_id).
model = trainer.model
model.eval()

def _first_label_in(response, labels):
    """Return the label that occurs earliest in `response` (case-insensitive), or "unknown"."""
    lowered = response.lower()
    best_pos, best_label = None, "unknown"
    for label in labels:
        pos = lowered.find(label.lower())
        if pos != -1 and (best_pos is None or pos < best_pos):
            best_pos, best_label = pos, label
    return best_label


def classify(text):
    """Predict the emotion label for `text` with the fine-tuned model.

    The prompt mirrors the training format. Decoding is greedy so the same input
    always yields the same label.

    Args:
        text: Free-form input text.

    Returns:
        One of `label_names`, or "unknown" when the input is empty/whitespace or
        no label name appears in the generated continuation.
    """
    # Nothing to classify: skip the model call entirely
    if not text or not text.strip():
        return "unknown"

    # Prepare the input in the same format as the training data
    instruction = "Detect the emotion in the text"
    options = ", ".join(label_names)
    prompt = f"<|user|>\n{instruction}\n\nText: {text}\nOptions: {options}\n<|assistant|>\n"

    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LEN
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,   # enough tokens for a single emotion word
            do_sample=False,     # greedy decoding: a classifier should be deterministic
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt
    prompt_len = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    # The model may keep generating after the answer, so take the label that appears
    # first in the response rather than the first label in `label_names` order.
    return _first_label_in(response, label_names)

In [None]:
import gradio as gr

def infer_ui(text):
    """Gradio callback: return the emotion that classify() predicts for `text`."""
    predicted = classify(text)
    return predicted

# Single-textbox UI: free text in, predicted emotion label out
text_input = gr.Textbox(lines=4, label="Enter text")
emotion_output = gr.Textbox(label="Predicted emotion")

demo = gr.Interface(
    fn=infer_ui,
    inputs=text_input,
    outputs=emotion_output,
    title="Emotion Classifier (QLoRA • Qwen2.5-3B)",
)
# share=True additionally serves the app through a temporary public gradio.live URL
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0a5f5057a7a4c383ca.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Here are a few examples of text and their corresponding emotions from the dataset:

* **Text:** "i didnt feel humiliated"
  **Emotion:** sadness

* **Text:** "i am feeling a little cranky"
  **Emotion:** anger

* **Text:** "i feel like i am already in heaven"
  **Emotion:** joy

* **Text:** "i feel romantic and desire you"
  **Emotion:** love

* **Text:** "i feel like a scared child who has been abandoned"
  **Emotion:** fear

* **Text:** "i feel like i have been very lucky"
  **Emotion:** surprise