## Imports

In [1]:
import os
import re
import numpy as np

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

import gensim.downloader
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

  from .autonotebook import tqdm as notebook_tqdm


## Tokenizer

In [2]:
word_tokenize_pattern = re.compile(r"(?u)\b\w\w+\b")


def word_tokenize(s: str):
    return [x.lower() for x in word_tokenize_pattern.findall(s)]

## Metrics

In [3]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

EMOTIONS = ["Angry", "Happy", "Relaxed", "Sad"]

def print_results(gold_labels, predicted_labels):
    # overall
    p, r, f, _ = precision_recall_fscore_support(
        gold_labels, predicted_labels, average="macro", zero_division=0
    )
    acc = accuracy_score(gold_labels, predicted_labels)

    print("=== Overall (Macro Avg) ===")
    print("Precision:", p)
    print("Recall:", r)
    print("F1:", f)
    print("Accuracy:", acc)
    print()

    # Per-emotion metrics
    p_i, r_i, f_i, _ = precision_recall_fscore_support(
        gold_labels, predicted_labels, average=None, zero_division=0
    )

    print("=== Per Emotion (Class) Metrics ===")
    for i, emotion in enumerate(EMOTIONS):
        print(f"{emotion}:")
        print("  Precision:", p_i[i])
        print("  Recall:   ", r_i[i])
        print("  F1:       ", f_i[i])
    print()



def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    p, r, f, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"precision": p, "recall": r, "f1": f, "accuracy": acc}




## Dataset

In [4]:
DATASET_DIR = "NJU_MusicMood_v1.0"

label2id = {label: i for i, label in enumerate(EMOTIONS)}
id2label = {i: label for label, i in label2id.items()}

timestamp_pattern = re.compile(r"\[\d{2}:\d{2}(?:\.\d{2})?\]")


def clean_lyrics(text: str) -> str:
    # 1. Remove timestamps
    text = timestamp_pattern.sub("", text)

    # 2. Lowercase
    text = text.lower()

    # 3. Normalize quotes
    text = text.replace("’", "'").replace("“", '"').replace("”", '"')

    # 4. Strip whitespace from each line
    lines = [line.strip() for line in text.splitlines()]

    # 5. Remove empty lines
    lines = [line for line in lines if line]

    # 6. Collapse extra spaces inside lines
    lines = [re.sub(r"\s+", " ", line) for line in lines]

    return "\n".join(lines)


def get_lyrics(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()
    return clean_lyrics(raw)


def get_lyrics_and_labels(split: str):
    texts, labels = [], []
    for emotion in EMOTIONS:
        folder = os.path.join(DATASET_DIR, emotion, split)
        if not os.path.isdir(folder):
            continue

        for fname in os.listdir(folder):
            if not fname.endswith(".txt"):
                continue
            if fname.lower() == "info.txt":
                continue

            path = os.path.join(folder, fname)
            txt = get_lyrics(path)
            if txt.strip():
                texts.append(txt)
                labels.append(emotion)
    return texts, labels


# Load data 
train_texts, train_labels = get_lyrics_and_labels("Train")
dev_texts, dev_labels = get_lyrics_and_labels("Test")

assert len(train_texts) == len(train_labels)
assert len(dev_texts) == len(dev_labels)

# Datasets
train_ds = Dataset.from_dict(
    {"text": train_texts, "label": [label2id[l] for l in train_labels]}
)
dev_ds = Dataset.from_dict(
    {"text": dev_texts, "label": [label2id[l] for l in dev_labels]}
)


## Baseline: Bag of Words & Logistic Regression

In [5]:
print("=== Baseline: Bag of Words & Logistic Regression ===")

count_vectorizer = CountVectorizer(analyzer=word_tokenize, max_features=30000 )
train_counts = count_vectorizer.fit_transform(train_texts)
dev_counts = count_vectorizer.transform(dev_texts)

lr_bow = LogisticRegression(max_iter=2000,
    class_weight="balanced",
    solver="lbfgs",
    random_state=0)
lr_bow.fit(train_counts, train_labels)

lr_bow_dev_predictions = lr_bow.predict(dev_counts)
print_results(dev_labels, lr_bow_dev_predictions)

=== Baseline: Bag of Words & Logistic Regression ===
=== Overall (Macro Avg) ===
Precision: 0.37987563509974537
Recall: 0.37369068092033764
F1: 0.37382782926716535
Accuracy: 0.363395225464191

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.5
  Recall:    0.5070422535211268
  F1:        0.5034965034965035
Happy:
  Precision: 0.45121951219512196
  Recall:    0.3490566037735849
  F1:        0.39361702127659576
Relaxed:
  Precision: 0.3023255813953488
  Recall:    0.38613861386138615
  F1:        0.3391304347826087
Sad:
  Precision: 0.26595744680851063
  Recall:    0.25252525252525254
  F1:        0.25906735751295334



## Baseline: Tf-IDF & Logistic Regression

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    analyzer=word_tokenize,
    max_features=30000,        
    ngram_range=(1, 2),  #unigram and bigram     
    sublinear_tf=True,   # log-scaled term frequency 
)

# Fit TF-IDF on training text
train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
dev_tfidf = tfidf_vectorizer.transform(dev_texts)

# Logistic Regression classifier
lr_tfidf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    # multi_class="multinomial",
    solver="lbfgs",
    random_state=0
)

print("Training TF-IDF Logistic Regression model...")
lr_tfidf.fit(train_tfidf, train_labels)

# Predict on the dev set
lr_tfidf_predictions = lr_tfidf.predict(dev_tfidf)

# Evaluate performance
print_results(dev_labels, lr_tfidf_predictions)

Training TF-IDF Logistic Regression model...
=== Overall (Macro Avg) ===
Precision: 0.46542427022911215
Recall: 0.4838117016591375
F1: 0.47157415532186603
Accuracy: 0.46419098143236076

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.6309523809523809
  Recall:    0.7464788732394366
  F1:        0.6838709677419355
Happy:
  Precision: 0.4628099173553719
  Recall:    0.5283018867924528
  F1:        0.4933920704845815
Relaxed:
  Precision: 0.3875
  Recall:    0.3069306930693069
  F1:        0.3425414364640884
Sad:
  Precision: 0.3804347826086957
  Recall:    0.35353535353535354
  F1:        0.36649214659685864





## Baseline 2: Word2Vec & Logistic Regression

In [10]:
print("=== Word2Vec & Logistic Regression ===")

w2v_model = gensim.downloader.load("word2vec-google-news-300")
VECTOR_SIZE = w2v_model.vector_size


def vec_for_doc(tokenized_doc):
    vectors = [w2v_model[word] for word in tokenized_doc if word in w2v_model.key_to_index]
    if not vectors:
        return np.zeros(VECTOR_SIZE, dtype="float32")
    return np.mean(vectors, axis=0)


train_vecs = [vec_for_doc(word_tokenize(x)) for x in train_texts]
dev_vecs = [vec_for_doc(word_tokenize(x)) for x in dev_texts]

lr_w2v = LogisticRegression(max_iter=500, random_state=0)
lr_w2v.fit(train_vecs, train_labels)

w2v_dev_predictions = lr_w2v.predict(dev_vecs)
print_results(dev_labels, w2v_dev_predictions)

=== Word2Vec & Logistic Regression ===
=== Overall (Macro Avg) ===
Precision: 0.4596175291565
Recall: 0.47568620468212114
F1: 0.45246239781207337
Accuracy: 0.4509283819628647

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.6063829787234043
  Recall:    0.8028169014084507
  F1:        0.6909090909090909
Happy:
  Precision: 0.47572815533980584
  Recall:    0.46226415094339623
  F1:        0.4688995215311005
Relaxed:
  Precision: 0.3308270676691729
  Recall:    0.43564356435643564
  F1:        0.37606837606837606
Sad:
  Precision: 0.425531914893617
  Recall:    0.20202020202020202
  F1:        0.273972602739726



## Most Frequent Class Baseline

In [7]:
import numpy as np
from collections import Counter

print("=== Baseline: Most Frequent Class (MFC) ===")

train_labels = train_ds["label"]
dev_labels = dev_ds["label"]

# Count frequency of each label
label_counts = Counter(train_labels)
most_frequent_label = label_counts.most_common(1)[0][0]

print("Most frequent class id:", most_frequent_label)

# Predict this label for all dev samples
mfc_predictions = [most_frequent_label] * len(dev_labels)

# Evaluate
print_results(dev_labels, mfc_predictions)


=== Baseline: Most Frequent Class (MFC) ===
Most frequent class id: 0
=== Overall (Macro Avg) ===
Precision: 0.047082228116710874
Recall: 0.25
F1: 0.07924107142857142
Accuracy: 0.1883289124668435

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.1883289124668435
  Recall:    1.0
  F1:        0.3169642857142857
Happy:
  Precision: 0.0
  Recall:    0.0
  F1:        0.0
Relaxed:
  Precision: 0.0
  Recall:    0.0
  F1:        0.0
Sad:
  Precision: 0.0
  Recall:    0.0
  F1:        0.0



## Benchmark Model Helpers

In [8]:
def tokenize_dataset(dataset, tokenizer, max_length: int = 256):
    def _tok(batch):
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    tokenized = dataset.map(_tok, batched=True)
    tokenized = tokenized.remove_columns(["text"])
    tokenized.set_format(type="torch")
    return tokenized


def train_and_eval_transformer(
    model_name: str,
    train_dataset: Dataset,
    dev_dataset: Dataset,
    output_dir: str,
    num_epochs: int,
    learning_rate: float,
    train_bs: int,
    eval_bs: int,
    set_pad_token_eos: bool = False,
):
    print(f"=== Fine-tuning {model_name} ===")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if set_pad_token_eos:
        tokenizer.pad_token = tokenizer.eos_token

    tokenized_train = tokenize_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenize_dataset(dev_dataset, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(EMOTIONS),
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    )

    if set_pad_token_eos:
        model.config.pad_token_id = tokenizer.eos_token_id

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=train_bs,
        per_device_eval_batch_size=eval_bs,
        weight_decay=0.01,
        eval_strategy="epoch",
        logging_steps=16,
        log_level="error",
        report_to="none",
        save_strategy="epoch",
        dataloader_pin_memory=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_dev,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    trainer.train()

    eval_results = trainer.evaluate()
    print(f"{model_name} dev results:", eval_results)

    pred_output = trainer.predict(tokenized_dev)
    logits = pred_output.predictions
    pred_ids = np.argmax(logits, axis=-1)
    pred_labels = [id2label[i] for i in pred_ids]

    print(f"{model_name} classification report:")
    print_results(dev_labels, pred_labels)

    return trainer, eval_results, pred_labels

## DistilGPT2

In [9]:
for i in range(5):
    print(train_ds[i])
    print("--------------")


{'text': 'what′s the difference of never knowing at all?\nwhen every step i take is always too small.\nmaybe it′s just something i can′t admit but lately,\ni feel like i don′t give a shit.\nmotivation such an aggravation,\naccusations don′t know how to take them.\ninspiration′s getting hard to fake it.\nconcentration′s never hard to brake it.\nsituation never what you want it to be.\nwhat′s the point of never making mistakes?\nself-indulgence is such a hard habit to brake.\nit′s all just a waste of time in the end.\ni don′t care so why should i even pretend.\nmotivation such an aggravation,\naccusations don′t know how to take them.\ninspiration′s getting hard to fake it.\nconcentration′s never hard to brake it.\nsituation never what you want it.\nnothing′s new, everything′s the same.\nit keeps on dragging me down, it′s getting kind of lame.\ni′m falling further behind, there′s nothing to explain.\nno matter what you say nothing ′s gonna change my mind.\ncan′t pretend on doubt until the

In [14]:

gpt2_trainer, gpt2_results, gpt2_pred_labels = train_and_eval_transformer(
    model_name="distilgpt2",
    train_dataset=train_ds,
    dev_dataset=dev_ds,
    output_dir="./distilgpt2_output",
    num_epochs=5,
    learning_rate=5e-5,
    train_bs=4,
    eval_bs=4,
    set_pad_token_eos=True,
)

=== Fine-tuning distilgpt2 ===


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:00<00:00, 5305.17 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 377/377 [00:00<00:00, 7706.77 examples/s]
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.4917,1.449112,0.46477,0.261909,0.103441,0.201592
2,1.4144,1.38622,0.405859,0.286774,0.191997,0.307692
3,1.0426,1.264766,0.468613,0.477137,0.456297,0.450928
4,0.561,1.643822,0.508392,0.524574,0.495061,0.501326
5,0.5868,1.638166,0.504491,0.506339,0.504548,0.488064


distilgpt2 dev results: {'eval_loss': 1.638166069984436, 'eval_precision': 0.5044906825430081, 'eval_recall': 0.50633895438441, 'eval_f1': 0.5045481885676589, 'eval_accuracy': 0.4880636604774536, 'eval_runtime': 12.3601, 'eval_samples_per_second': 30.501, 'eval_steps_per_second': 7.686, 'epoch': 5.0}
distilgpt2 classification report:


ValueError: Mix of label input types (string and number)

## Distilbert

In [10]:
distilbert_trainer, distilbert_results, distilbert_pred_labels = train_and_eval_transformer(
    model_name="distilbert/distilbert-base-uncased",
    train_dataset=train_ds,
    dev_dataset=dev_ds,
    output_dir="./distilbert_musicmood",
    num_epochs=3,
    learning_rate=5e-5,
    train_bs=4,
    eval_bs=4,
    set_pad_token_eos=False,
)

=== Fine-tuning distilbert/distilbert-base-uncased ===


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:00<00:00, 4996.31 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 377/377 [00:00<00:00, 7422.43 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.2941,1.287842,0.428011,0.414448,0.336632,0.374005
2,0.809,1.338264,0.413083,0.485278,0.412546,0.450928
3,0.6418,1.166022,0.557673,0.559182,0.557949,0.541114


distilbert/distilbert-base-uncased dev results: {'eval_loss': 1.1660218238830566, 'eval_precision': 0.5576731819849255, 'eval_recall': 0.5591815625888924, 'eval_f1': 0.5579492858240676, 'eval_accuracy': 0.5411140583554377, 'eval_runtime': 9.3142, 'eval_samples_per_second': 40.476, 'eval_steps_per_second': 10.199, 'epoch': 3.0}
distilbert/distilbert-base-uncased classification report:


ValueError: Mix of label input types (string and number)