In [1]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv("../data/rich/emotion_rich_cleaned.csv")

# Show the first few rows
df.head()


Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [2]:
df.columns

Index(['Unnamed: 0', 'text', 'label'], dtype='object')

In [3]:
df["label"].value_counts()


label
1    141067
0    121187
3     57317
4     47712
2     34554
Name: count, dtype: int64

In [5]:
import pandas as pd

# Load all three parts of the simple dataset
df_train = pd.read_csv("../data/simple/training.csv")
df_test = pd.read_csv("../data/simple/test.csv")
df_val = pd.read_csv("../data/simple/validation.csv")

# Combine them into one
df_simple = pd.concat([df_train, df_test, df_val], ignore_index=True)

# Check the shape
df_simple.shape


(20000, 2)

In [6]:
df_simple["label"].value_counts()


label
1    6761
0    5797
3    2709
4    2373
2    1641
5     719
Name: count, dtype: int64

In [1]:
import pandas as pd
df_rich = pd.read_csv("../data/rich/emotion_rich.csv")

In [8]:
# Get label ratio as percentages
rich_ratio = df_rich["label"].value_counts(normalize=True).sort_index() * 100
simple_ratio = df_simple["label"].value_counts(normalize=True).sort_index() * 100

# Combine for comparison
ratio_df = pd.DataFrame({
    "Simple Dataset (%)": simple_ratio.round(2),
    "Rich Dataset (%)": rich_ratio.round(2)
})

ratio_df


Unnamed: 0_level_0,Simple Dataset (%),Rich Dataset (%)
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,28.98,29.07
1,33.8,33.84
2,8.2,8.29
3,13.54,13.75
4,11.86,11.45
5,3.6,3.59


In [9]:
df_rich.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  416809 non-null  int64 
 1   text        416809 non-null  object
 2   label       416809 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 9.5+ MB


In [2]:
# Drop the extra index column
df_rich = df_rich.drop(columns=["Unnamed: 0"])

# Confirm it's gone
df_rich.head()


Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [11]:
df_rich["label"].value_counts()


label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [3]:
from datasets import Dataset

# Convert pandas DataFrame to HuggingFace Dataset
dataset = Dataset.from_pandas(df_rich)

# Check one example
dataset[0]


  from .autonotebook import tqdm as notebook_tqdm


{'text': 'i just feel really helpless and heavy hearted', 'label': 4}

In [4]:
from transformers import AutoTokenizer

# Load DistilBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    
# Apply tokenization to entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Preview one tokenized example
tokenized_dataset[0]

Map: 100%|██████████| 416809/416809 [00:36<00:00, 11563.31 examples/s]


{'text': 'i just feel really helpless and heavy hearted',
 'label': 4,
 'input_ids': [101,
  1045,
  2074,
  2514,
  2428,
  13346,
  1998,
  3082,
  18627,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [5]:
# Split the tokenized dataset
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]


In [6]:
print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))


Train size: 375128
Validation size: 41681


In [6]:
from transformers import AutoModelForSequenceClassification

# Load DistilBERT for classification with 6 emotion labels
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=6
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import mlflow

# Explicitly tell MLflow where to save logs
mlflow.set_tracking_uri("file:///C:/Users/rabia/Documents/Pet_Projects/moodflow/mlruns")

# Start a new test run
with mlflow.start_run(run_name="distilbert_emotion_test"):
    mlflow.log_param("model", "distilbert-base-uncased")
    mlflow.log_metric("accuracy", 0.5)
    print("Run logged.")


Run logged.


In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./models/distilbert_emotion",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=1
)


In [9]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted")
    }


In [10]:
from transformers import Trainer

import mlflow

# Start MLflow run
with mlflow.start_run(run_name="distilbert_emotion_full"):

    # Log training config
    mlflow.log_param("model", "distilbert-base-uncased")
    mlflow.log_param("epochs", training_args.num_train_epochs)
    mlflow.log_param("batch_size", training_args.per_device_train_batch_size)
    mlflow.log_param("learning_rate", training_args.learning_rate)

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    mlflow.log_metrics({
        "val_accuracy": eval_results["eval_accuracy"],
        "val_f1": eval_results["eval_f1"]
    })

    print("Training + evaluation completed and logged.")


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1261,0.088991,0.941652,0.941708
2,0.0859,0.087962,0.942684,0.943786


Training + evaluation completed and logged.


In [13]:
import mlflow

# Forcefully end any active run
mlflow.end_run()


🏃 View run capable-auk-759 at: http://127.0.0.1:5000/#/experiments/0/runs/3796da21982548d89c7800a0c75accce
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


In [14]:
# Set MLflow to connect to the running server
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Save the model locally first
model_save_path = "./models/distilbert_emotion_final"
trainer.save_model(model_save_path)

# Start a new run and log the model
with mlflow.start_run(run_name="distilbert_emotion_model_save"):
    mlflow.log_artifacts(model_save_path, artifact_path="distilbert_model")
    print("Model logged to MLflow!")


Model logged to MLflow!
🏃 View run distilbert_emotion_model_save at: http://127.0.0.1:5000/#/experiments/0/runs/34cf666cf3a04e0ea03fce6d3318ef68
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

# Load tokenizer and model
model_path = "./models/distilbert_emotion_final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Set model to evaluation mode
model.eval()

# Emotion labels (adjust if needed)
label_map = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

# Function to predict
def predict_emotion(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = int(torch.argmax(logits, dim=1))
    return label_map[predicted_class_id]

# Example usage
text = "I'm so excited for the party!"
predicted_emotion = predict_emotion(text)
print(f"Predicted Emotion: {predicted_emotion}")


Predicted Emotion: joy
