In [5]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, classification_report
import torch
from torch.utils.data import Dataset
import numpy as np

In [6]:
# Read the train / validation / test splits of the emotion corpus from the
# CSV files that sit next to the notebook.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './emotion_train.csv',
        './emotion_validation.csv',
        './emotion_test.csv',
    )
)

In [7]:
# Human-readable emotion names; the entry at position i is the name used for
# class id i (this list sizes the classifier head, labels the classification
# report and decodes predicted ids).
# NOTE(review): assumes the integer `label` column of the CSV files follows
# exactly this order — confirm against the dataset.
emotion_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [8]:
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per access.

    Args:
        texts: Sequence of raw strings, indexable by integer position
            (e.g. a list).
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors="pt")`` that returns a mapping containing
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            batch dimension of size 1.
        max_length: Length every sample is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Fail early instead of hitting an IndexError (or silently ignoring
        # surplus labels) in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            output with the batch dimension removed) and ``"labels"`` (a
            scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also drop the sequence dimension when it has size 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [9]:
# Tokenizer of the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Pull the raw sentences and their labels out of every split as plain lists
train_texts, train_labels = train_data['text'].tolist(), train_data['label'].tolist()
val_texts, val_labels = val_data['text'].tolist(), val_data['label'].tolist()
test_texts, test_labels = test_data['text'].tolist(), test_data['label'].tolist()

# Wrap each split in a dataset that pads / truncates every sample to 128 tokens
train_dataset = EmotionDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_length=128
)
val_dataset = EmotionDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_length=128
)
test_dataset = EmotionDataset(
    texts=test_texts, labels=test_labels, tokenizer=tokenizer, max_length=128
)

def compute_metrics(eval_pred):
    """Score a set of predictions with accuracy and weighted F1.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``; the predicted
            class of each row is the argmax of ``logits`` along axis 1.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    accuracy = accuracy_score(labels, predicted_ids)
    weighted_f1 = f1_score(labels, predicted_ids, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

In [10]:
# Build the sequence classifier on top of the pretrained encoder. The
# id <-> name mappings are written into the model config so that a saved or
# uploaded checkpoint carries its own label names rather than generic
# LABEL_<n> placeholders.
id2label = dict(enumerate(emotion_labels))
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(emotion_labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:

# Hyper-parameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing both happen per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Best-model selection uses the "accuracy" entry returned by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Trainer wiring: model, data splits, tokenizer and the metric callback
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [12]:
# Run the fine-tuning loop with the settings held by `trainer`; the returned
# value is displayed as the cell output.
trainer.train()

  0%|          | 0/3000 [00:00<?, ?it/s]

{'loss': 1.736, 'grad_norm': 3.422088861465454, 'learning_rate': 1.9933333333333334e-05, 'epoch': 0.01}
{'loss': 1.6838, 'grad_norm': 6.059967994689941, 'learning_rate': 1.9866666666666667e-05, 'epoch': 0.02}
{'loss': 1.7347, 'grad_norm': 9.450512886047363, 'learning_rate': 1.98e-05, 'epoch': 0.03}
{'loss': 1.6665, 'grad_norm': 4.085999488830566, 'learning_rate': 1.9733333333333336e-05, 'epoch': 0.04}
{'loss': 1.6177, 'grad_norm': 6.0765581130981445, 'learning_rate': 1.9666666666666666e-05, 'epoch': 0.05}
{'loss': 1.571, 'grad_norm': 6.820758819580078, 'learning_rate': 1.9600000000000002e-05, 'epoch': 0.06}
{'loss': 1.4835, 'grad_norm': 7.022929668426514, 'learning_rate': 1.9533333333333335e-05, 'epoch': 0.07}
{'loss': 1.455, 'grad_norm': 26.703659057617188, 'learning_rate': 1.9466666666666668e-05, 'epoch': 0.08}
{'loss': 1.3325, 'grad_norm': 68.47493743896484, 'learning_rate': 1.94e-05, 'epoch': 0.09}
{'loss': 1.3637, 'grad_norm': 16.150217056274414, 'learning_rate': 1.933333333333333

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.2692180871963501, 'eval_accuracy': 0.9125, 'eval_f1': 0.9133763201354372, 'eval_runtime': 13.563, 'eval_samples_per_second': 147.46, 'eval_steps_per_second': 9.216, 'epoch': 1.0}
{'loss': 0.2365, 'grad_norm': 49.20100021362305, 'learning_rate': 1.3266666666666668e-05, 'epoch': 1.01}
{'loss': 0.2225, 'grad_norm': 4.356011867523193, 'learning_rate': 1.3200000000000002e-05, 'epoch': 1.02}
{'loss': 0.1519, 'grad_norm': 27.05234146118164, 'learning_rate': 1.3133333333333334e-05, 'epoch': 1.03}
{'loss': 0.1193, 'grad_norm': 12.991595268249512, 'learning_rate': 1.3066666666666668e-05, 'epoch': 1.04}
{'loss': 0.2315, 'grad_norm': 13.576393127441406, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.05}
{'loss': 0.2829, 'grad_norm': 15.33315658569336, 'learning_rate': 1.2933333333333334e-05, 'epoch': 1.06}
{'loss': 0.1614, 'grad_norm': 6.408905029296875, 'learning_rate': 1.2866666666666667e-05, 'epoch': 1.07}
{'loss': 0.2038, 'grad_norm': 11.83803653717041, 'learning_rate': 1.

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.19843173027038574, 'eval_accuracy': 0.929, 'eval_f1': 0.9286921870177579, 'eval_runtime': 13.4479, 'eval_samples_per_second': 148.722, 'eval_steps_per_second': 9.295, 'epoch': 2.0}
{'loss': 0.2572, 'grad_norm': 37.28451919555664, 'learning_rate': 6.600000000000001e-06, 'epoch': 2.01}
{'loss': 0.1008, 'grad_norm': 6.705952167510986, 'learning_rate': 6.533333333333334e-06, 'epoch': 2.02}
{'loss': 0.2049, 'grad_norm': 1.7621207237243652, 'learning_rate': 6.466666666666667e-06, 'epoch': 2.03}
{'loss': 0.1511, 'grad_norm': 8.033997535705566, 'learning_rate': 6.4000000000000006e-06, 'epoch': 2.04}
{'loss': 0.1481, 'grad_norm': 40.760379791259766, 'learning_rate': 6.333333333333333e-06, 'epoch': 2.05}
{'loss': 0.0856, 'grad_norm': 5.293247222900391, 'learning_rate': 6.266666666666668e-06, 'epoch': 2.06}
{'loss': 0.2024, 'grad_norm': 30.550016403198242, 'learning_rate': 6.200000000000001e-06, 'epoch': 2.07}
{'loss': 0.1551, 'grad_norm': 0.1978943943977356, 'learning_rate': 6.13

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.16054922342300415, 'eval_accuracy': 0.9385, 'eval_f1': 0.938676513529829, 'eval_runtime': 13.5911, 'eval_samples_per_second': 147.155, 'eval_steps_per_second': 9.197, 'epoch': 3.0}
{'train_runtime': 9093.5713, 'train_samples_per_second': 5.278, 'train_steps_per_second': 0.33, 'train_loss': 0.33017677734295525, 'epoch': 3.0}


TrainOutput(global_step=3000, training_loss=0.33017677734295525, metrics={'train_runtime': 9093.5713, 'train_samples_per_second': 5.278, 'train_steps_per_second': 0.33, 'total_flos': 3157446057984000.0, 'train_loss': 0.33017677734295525, 'epoch': 3.0})

In [13]:
# Run the model over the held-out test set ONCE and reuse that single pass for
# both the aggregate metrics and the per-class report, instead of calling
# evaluate() and predict() separately (two full inference passes).
# metric_key_prefix="eval" keeps the metric names in the `eval_*` form.
test_output = trainer.predict(test_dataset, metric_key_prefix="eval")
results = test_output.metrics
print("Evaluation Results:", results)

test_logits = test_output.predictions
y_pred = np.argmax(test_logits, axis=1)
# Passing `labels` explicitly keeps target_names aligned with class ids even
# if some class does not occur in the test labels or predictions.
print(
    classification_report(
        test_labels,
        y_pred,
        labels=list(range(len(emotion_labels))),
        target_names=emotion_labels,
    )
)

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be reloaded with from_pretrained().
model.save_pretrained("./emotion_model")
tokenizer.save_pretrained("./emotion_model")

  0%|          | 0/125 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.16807608306407928, 'eval_accuracy': 0.929, 'eval_f1': 0.9285057334865384, 'eval_runtime': 13.6803, 'eval_samples_per_second': 146.195, 'eval_steps_per_second': 9.137, 'epoch': 3.0}


  0%|          | 0/125 [00:00<?, ?it/s]

              precision    recall  f1-score   support

     sadness       0.96      0.97      0.97       581
         joy       0.95      0.95      0.95       695
        love       0.83      0.81      0.82       159
       anger       0.94      0.92      0.93       275
        fear       0.86      0.94      0.90       224
    surprise       0.80      0.68      0.74        66

    accuracy                           0.93      2000
   macro avg       0.89      0.88      0.88      2000
weighted avg       0.93      0.93      0.93      2000



('./emotion_model\\tokenizer_config.json',
 './emotion_model\\special_tokens_map.json',
 './emotion_model\\sentencepiece.bpe.model',
 './emotion_model\\added_tokens.json',
 './emotion_model\\tokenizer.json')

In [14]:
# Reload the fine-tuned classifier and its tokenizer from the local directory
MODEL_DIR = "./emotion_model"
loaded_model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

In [15]:
def predict_emotion(texts):
    """Predict an emotion name for each input text with the reloaded model.

    Args:
        texts: A string or a list/tuple of strings to classify.

    Returns:
        A list with one entry of ``emotion_labels`` per input text; an empty
        list when ``texts`` is an empty list or tuple.
    """
    # Nothing to classify: do not run the tokenizer / model on an empty batch.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    encodings = loaded_tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = loaded_model(**encodings)
    # tolist() yields plain Python ints and, unlike .numpy(), does not require
    # the logits to live on the CPU.
    predictions = torch.argmax(outputs.logits, dim=-1).tolist()
    return [emotion_labels[pred] for pred in predictions]

# A few hand-written sentences to sanity-check the reloaded model
new_texts = [
    "I feel really happy today!",
    "I'm so angry right now!",
    "This is the worst day ever.",
]
predictions = predict_emotion(new_texts)

# Show every sentence next to the label predicted for it
for idx, text in enumerate(new_texts):
    emotion = predictions[idx]
    print(f"Text: {text} => Predicted Emotion: {emotion}")

Text: I feel really happy today! => Predicted Emotion: joy
Text: I'm so angry right now! => Predicted Emotion: anger
Text: This is the worst day ever. => Predicted Emotion: sadness


In [1]:
# Load the fine-tuned model from the Hugging Face Hub and run inference with it
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Emotion names; the entry at position i decodes predicted class id i
emotion_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

# Hub repository that provides both the tokenizer and the classifier weights
HUB_REPO_ID = "Rajkumar57/englishsentiment-model"
tokenizer = AutoTokenizer.from_pretrained(HUB_REPO_ID)
model = AutoModelForSequenceClassification.from_pretrained(HUB_REPO_ID)

def predict_emotion(texts):
    """Predict an emotion name for each input text with the Hub model.

    Args:
        texts: A string or a list/tuple of strings to classify.

    Returns:
        A list with one entry of ``emotion_labels`` per input text; an empty
        list when ``texts`` is an empty list or tuple.
    """
    # Nothing to classify: do not run the tokenizer / model on an empty batch.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**encodings)

    # tolist() yields plain Python ints and, unlike .numpy(), does not require
    # the logits to live on the CPU.
    predictions = torch.argmax(outputs.logits, dim=-1).tolist()

    return [emotion_labels[pred] for pred in predictions]

# A few hand-written sentences to sanity-check the downloaded model
new_texts = [
    "I feel really happy today!",
    "I'm so angry right now!",
    "This is the worst day ever.",
]
predictions = predict_emotion(new_texts)

# Show every sentence next to the label predicted for it
for idx, text in enumerate(new_texts):
    emotion = predictions[idx]
    print(f"Text: {text} => Predicted Emotion: {emotion}")

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Text: I feel really happy today! => Predicted Emotion: joy
Text: I'm so angry right now! => Predicted Emotion: anger
Text: This is the worst day ever. => Predicted Emotion: sadness
