In [1]:
!pip install datasets transformers torch -q

from datasets import load_dataset

dataset = load_dataset("Pranav8435/emotionDataset")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


finalized_emotion_train_3.csv: 0.00B [00:00, ?B/s]

finalized_emotion_validation_3.csv: 0.00B [00:00, ?B/s]

finalized_emotion_test_3.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/13513 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1843 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3839 [00:00<?, ? examples/s]

In [2]:
def rename_and_lower(example):
    return {
        "text": example["Situation"],
        "emotion": example["emotion"].lower()
    }

dataset = dataset.map(
    rename_and_lower,
    remove_columns=["Unnamed: 0", "Situation"]
)


Map:   0%|          | 0/13513 [00:00<?, ? examples/s]

Map:   0%|          | 0/1843 [00:00<?, ? examples/s]

Map:   0%|          | 0/3839 [00:00<?, ? examples/s]

In [3]:
emotion_map = {
    "sadness": "sadness",
    "surprise": "surprise",
    "anger/frustration": "anger",
    "fear/anxiety": "fear",
    "happiness/contentment": "happiness",
    "love/trust": "love",
    "embarrassment": "sadness",
    "jealousy": "anger",
    "sentimentality": "love",
    "preparedness": "surprise"
}

def normalize_emotion(example):
    example["emotion"] = emotion_map[example["emotion"]]
    return example

dataset = dataset.map(normalize_emotion)



Map:   0%|          | 0/13513 [00:00<?, ? examples/s]

Map:   0%|          | 0/1843 [00:00<?, ? examples/s]

Map:   0%|          | 0/3839 [00:00<?, ? examples/s]

In [4]:
labels = dataset["train"].unique("emotion")
labels = sorted(labels)

label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

num_labels = len(labels)

print(label2id)




{'anger': 0, 'fear': 1, 'happiness': 2, 'love': 3, 'sadness': 4, 'surprise': 5}


In [5]:
def encode_labels(example):
    example["label"] = label2id[example["emotion"]]
    return example

dataset = dataset.map(encode_labels)



Map:   0%|          | 0/13513 [00:00<?, ? examples/s]

Map:   0%|          | 0/1843 [00:00<?, ? examples/s]

Map:   0%|          | 0/3839 [00:00<?, ? examples/s]

In [6]:
import re

def clean_text(example):
    text = example["text"].lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    example["text"] = text
    return example

dataset = dataset.map(clean_text)



Map:   0%|          | 0/13513 [00:00<?, ? examples/s]

Map:   0%|          | 0/1843 [00:00<?, ? examples/s]

Map:   0%|          | 0/3839 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

tokenized_dataset = dataset.map(tokenize, batched=True)

tokenized_dataset = tokenized_dataset.remove_columns(["text", "emotion", "id"])
tokenized_dataset.set_format("torch")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/13513 [00:00<?, ? examples/s]

Map:   0%|          | 0/1843 [00:00<?, ? examples/s]

Map:   0%|          | 0/3839 [00:00<?, ? examples/s]

In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./emotion_model",
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    num_train_epochs=5,  # Early stopping will stop earlier
    weight_decay=0.02,

    load_best_model_at_end=True,          # REQUIRED
    metric_for_best_model="eval_loss",    # REQUIRED
    greater_is_better=False,              # REQUIRED

    logging_steps=100,
    report_to="none"
)



In [10]:
from transformers import Trainer, EarlyStoppingCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)



  trainer = Trainer(


In [11]:

trainer.train()


Epoch,Training Loss,Validation Loss
1,0.8468,0.800702
2,0.6202,0.762871
3,0.4576,0.778762


TrainOutput(global_step=2535, training_loss=0.6966623490613828, metrics={'train_runtime': 466.6368, 'train_samples_per_second': 144.791, 'train_steps_per_second': 9.054, 'total_flos': 1342619736565248.0, 'train_loss': 0.6966623490613828, 'epoch': 3.0})

In [12]:
print(model.config.num_labels)
print(model.config.label2id)
print(model.config.id2label)


6
{'anger': 0, 'fear': 1, 'happiness': 2, 'love': 3, 'sadness': 4, 'surprise': 5}
{0: 'anger', 1: 'fear', 2: 'happiness', 3: 'love', 4: 'sadness', 5: 'surprise'}


In [13]:
trainer.evaluate()


{'eval_loss': 0.7628709673881531,
 'eval_runtime': 6.1951,
 'eval_samples_per_second': 297.493,
 'eval_steps_per_second': 18.724,
 'epoch': 3.0}

In [15]:

from google.colab import drive
drive.mount('/content/drive')

trainer.save_model("/content/drive/MyDrive/final_emotion_model")
tokenizer.save_pretrained("/content/drive/MyDrive/final_emotion_model")


Mounted at /content/drive


('/content/drive/MyDrive/final_emotion_model/tokenizer_config.json',
 '/content/drive/MyDrive/final_emotion_model/special_tokens_map.json',
 '/content/drive/MyDrive/final_emotion_model/vocab.txt',
 '/content/drive/MyDrive/final_emotion_model/added_tokens.json',
 '/content/drive/MyDrive/final_emotion_model/tokenizer.json')

In [16]:
!ls /content/drive/MyDrive/final_emotion_model


config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
model.safetensors  tokenizer_config.json    training_args.bin


In [17]:
from transformers import pipeline

emotion_pipeline = pipeline(
    "text-classification",
    model="/content/drive/MyDrive/final_emotion_model",
    tokenizer="/content/drive/MyDrive/final_emotion_model",
    device=0,
    top_k=None
)


Device set to use cuda:0


In [18]:
emotion_pipeline("I am extremely happy today!")


[[{'label': 'happiness', 'score': 0.9574832916259766},
  {'label': 'surprise', 'score': 0.026300285011529922},
  {'label': 'fear', 'score': 0.006768739316612482},
  {'label': 'love', 'score': 0.0054257712326943874},
  {'label': 'sadness', 'score': 0.0023605111055076122},
  {'label': 'anger', 'score': 0.001661384361796081}]]