In [8]:
import os
import re
from datasets import Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [9]:

############################################################
# 1. Setup and constants
############################################################

EMOTIONS = ["Angry", "Happy", "Relaxed", "Sad"]

label2id = {label: i for i, label in enumerate(EMOTIONS)}
id2label = {i: label for label, i in label2id.items()}

# Timestamp pattern like [01:23.45]
timestamp_pattern = re.compile(r"\[\d{2}:\d{2}(?:\.\d{2})?\]")

DATASET_DIR = "NJU_MusicMood_v1.0"

############################################################
# 2. Cleaning function (clean text for modeling)
############################################################

def clean_lyrics(text: str) -> str:
    # Remove timestamps like [00:29]
    text = timestamp_pattern.sub("", text)

    # Lowercase
    text = text.lower()

    # Normalize quotes
    text = text.replace("’", "'").replace("“", '"').replace("”", '"')

    # Remove ellipses and repeated dots
    text = re.sub(r"\.{2,}", " ", text)

    # Remove long underscores
    text = re.sub(r"_{2,}", " ", text)

    # Remove trailing "end" markers
    text = re.sub(r"\bend[.\s]*$", "", text.strip())

    # # Replace newlines with space
    # text = text.replace("\n", " ")

    # Remove special characters except letters, digits, spaces, apostrophes
    text = re.sub(r"[^a-z0-9' ]+", " ", text)

    # Collapse multiple spaces
    text = re.sub(r"\s+", " ", text)

    return text.strip()


############################################################
# 3. Load dataset
############################################################

def get_lyrics(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()
    return clean_lyrics(raw)

def get_lyrics_and_labels(split: str):
    texts, labels = [], []
    for emotion in EMOTIONS:
        folder = os.path.join(DATASET_DIR, emotion, split)

        if not os.path.isdir(folder):
            continue

        for fname in os.listdir(folder):
            if fname.lower() == "info.txt":
                continue
            if not fname.endswith(".txt"):
                continue

            path = os.path.join(folder, fname)
            text = get_lyrics(path)

            if text.strip():
                texts.append(text)
                labels.append(emotion)

    return texts, labels

train_texts, train_labels = get_lyrics_and_labels("Train")
dev_texts, dev_labels = get_lyrics_and_labels("Test")

train_ds = Dataset.from_dict({
    "text": train_texts,
    "label": [label2id[l] for l in train_labels]
})

dev_ds = Dataset.from_dict({
    "text": dev_texts,
    "label": [label2id[l] for l in dev_labels]
})




In [10]:
############################################################
# 4. Helper for evaluation
############################################################

def print_results(true_labels, predicted_labels):
    p, r, f, _ = precision_recall_fscore_support(
        true_labels, predicted_labels, average="macro", zero_division=0
    )
    acc = accuracy_score(true_labels, predicted_labels)

    print("Macro Precision:", p)
    print("Macro Recall:", r)
    print("Macro F1:", f)
    print("Accuracy:", acc)


############################################################
# 5. Position tagging function (START, MID, END)
############################################################

def position_tag(text):
    """Tag each token with its position in the song.
       First 20 percent = _START
       Middle 60 percent = _MID
       Last 20 percent = _END
    """
    tokens = text.split()
    n = len(tokens)
    tagged = []

    for i, tok in enumerate(tokens):
        ratio = i / n
        if ratio < 0.2:
            tagged.append(tok + "_START")
        elif ratio > 0.8:
            tagged.append(tok + "_END")
        else:
            tagged.append(tok + "_MID")

    return " ".join(tagged)

In [11]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def print_results(gold_labels, predicted_labels):
    # Overall macro metrics
    p, r, f, _ = precision_recall_fscore_support(
        gold_labels, predicted_labels, average="macro", zero_division=0
    )
    acc = accuracy_score(gold_labels, predicted_labels)

    print("=== Overall (Macro Avg) ===")
    print("Precision:", p)
    print("Recall:", r)
    print("F1:", f)
    print("Accuracy:", acc)
    print()

    # Per class metrics
    p_i, r_i, f_i, _ = precision_recall_fscore_support(
        gold_labels, predicted_labels, average=None, zero_division=0
    )

    print("=== Per Emotion (Class) Metrics ===")
    for i, emotion in enumerate(EMOTIONS):
        print(f"{emotion}:")
        print("  Precision:", p_i[i])
        print("  Recall:   ", r_i[i])
        print("  F1:       ", f_i[i])
    print()  # empty line at the end



In [12]:
############################################################
# 6. Vectorize with TF-IDF + train Logistic Regression
############################################################

print("=== Training Logistic Regression with Position Tags (START/MID/END) ===")

vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2)           # unigrams + bigrams improve performance
)
train_tagged = [position_tag(t) for t in train_texts]
dev_tagged = [position_tag(t) for t in dev_texts]
X_train = vectorizer.fit_transform(train_tagged)
X_dev = vectorizer.transform(dev_tagged)

clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    multi_class="multinomial",
    solver="lbfgs"
)

clf.fit(X_train, train_ds["label"])

preds = clf.predict(X_dev)

############################################################
# 7. Print results
############################################################

print("\n=== Results with START/MID/END position tagging ===")
print_results(dev_ds["label"], preds)

=== Training Logistic Regression with Position Tags (START/MID/END) ===

=== Results with START/MID/END position tagging ===
=== Overall (Macro Avg) ===
Precision: 0.3902636986855047
Recall: 0.40970226440661606
F1: 0.39418263460750874
Accuracy: 0.38992042440318303

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.45544554455445546
  Recall:    0.647887323943662
  F1:        0.5348837209302325
Happy:
  Precision: 0.40963855421686746
  Recall:    0.32075471698113206
  F1:        0.35978835978835977
Relaxed:
  Precision: 0.3626373626373626
  Recall:    0.32673267326732675
  F1:        0.34375
Sad:
  Precision: 0.3333333333333333
  Recall:    0.3434343434343434
  F1:        0.3383084577114428





In [14]:
############################################################
# 6. Vectorize with TF-IDF + train Logistic Regression
############################################################

print("=== Training Logistic Regression with Position Tags (START/MID/END) ===")

vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2)           # unigrams + bigrams improve performance
)

X_train = vectorizer.fit_transform(train_tagged)
X_dev = vectorizer.transform(dev_tagged)

clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    # multi_class="multinomial",
    solver="lbfgs"
)

clf.fit(X_train, train_ds["label"])

preds = clf.predict(X_dev)

############################################################
# 7. Print emotion based results
############################################################

print("\n=== Emotion based results with START/MID/END position tagging ===")
print_results(dev_ds["label"], preds)


=== Training Logistic Regression with Position Tags (START/MID/END) ===

=== Emotion based results with START/MID/END position tagging ===
=== Overall (Macro Avg) ===
Precision: 0.3902636986855047
Recall: 0.40970226440661606
F1: 0.39418263460750874
Accuracy: 0.38992042440318303

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.45544554455445546
  Recall:    0.647887323943662
  F1:        0.5348837209302325
Happy:
  Precision: 0.40963855421686746
  Recall:    0.32075471698113206
  F1:        0.35978835978835977
Relaxed:
  Precision: 0.3626373626373626
  Recall:    0.32673267326732675
  F1:        0.34375
Sad:
  Precision: 0.3333333333333333
  Recall:    0.3434343434343434
  F1:        0.3383084577114428



## Separate Models

In [15]:
def get_segment(text, segment="start", portion=0.3):
    """
    Extracts a portion of the lyrics.
    portion=0.3 means 30 percent of lyrics.
    segment can be start, middle, or end.
    """
    tokens = text.split()
    n = len(tokens)

    if n == 0:
        return ""

    cut = int(n * portion)  # number of tokens per section

    if segment == "start":
        return " ".join(tokens[:cut])

    elif segment == "middle":
        start = int(n * 0.35)
        end = int(n * 0.65)
        return " ".join(tokens[start:end])

    elif segment == "end":
        return " ".join(tokens[-cut:])

    else:
        return text


In [16]:
# Build segmented datasets
train_start = [get_segment(t, "start") for t in train_ds["text"]]
train_middle = [get_segment(t, "middle") for t in train_ds["text"]]
train_end = [get_segment(t, "end") for t in train_ds["text"]]

dev_start = [get_segment(t, "start") for t in dev_ds["text"]]
dev_middle = [get_segment(t, "middle") for t in dev_ds["text"]]
dev_end = [get_segment(t, "end") for t in dev_ds["text"]]


In [17]:
def train_segment_model(train_texts, dev_texts, train_labels, dev_labels, name=""):
    print(f"\n=== Training {name} model ===")

    vectorizer = TfidfVectorizer(
        max_features=50000,
        ngram_range=(1, 2)
    )

    X_train = vectorizer.fit_transform(train_texts)
    X_dev = vectorizer.transform(dev_texts)

    clf = LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        multi_class="multinomial"
    )

    clf.fit(X_train, train_labels)
    preds = clf.predict(X_dev)

    print(f"=== Results ({name}) ===")
    print_results(dev_labels, preds)


In [18]:
# Train and evaluate beginning model
train_segment_model(
    train_start,
    dev_start,
    train_ds["label"],
    dev_ds["label"],
    name="BEGINNING ONLY"
)

# Train and evaluate middle model
train_segment_model(
    train_middle,
    dev_middle,
    train_ds["label"],
    dev_ds["label"],
    name="MIDDLE ONLY"
)

# Train and evaluate end model
train_segment_model(
    train_end,
    dev_end,
    train_ds["label"],
    dev_ds["label"],
    name="END ONLY"
)



=== Training BEGINNING ONLY model ===
=== Results (BEGINNING ONLY) ===
=== Overall (Macro Avg) ===
Precision: 0.3458269076705815
Recall: 0.3602639861381833
F1: 0.3481884320377146
Accuracy: 0.3421750663129973

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.4375
  Recall:    0.5915492957746479
  F1:        0.5029940119760479
Happy:
  Precision: 0.4157303370786517
  Recall:    0.3490566037735849
  F1:        0.37948717948717947
Relaxed:
  Precision: 0.2911392405063291
  Recall:    0.22772277227722773
  F1:        0.25555555555555554
Sad:
  Precision: 0.23893805309734514
  Recall:    0.2727272727272727
  F1:        0.25471698113207547


=== Training MIDDLE ONLY model ===




=== Results (MIDDLE ONLY) ===
=== Overall (Macro Avg) ===
Precision: 0.4168987689585285
Recall: 0.43011603844413143
F1: 0.4214992552389918
Accuracy: 0.41644562334217505

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.48863636363636365
  Recall:    0.6056338028169014
  F1:        0.5408805031446541
Happy:
  Precision: 0.46938775510204084
  Recall:    0.4339622641509434
  F1:        0.45098039215686275
Relaxed:
  Precision: 0.3333333333333333
  Recall:    0.297029702970297
  F1:        0.31413612565445026
Sad:
  Precision: 0.37623762376237624
  Recall:    0.3838383838383838
  F1:        0.38


=== Training END ONLY model ===
=== Results (END ONLY) ===
=== Overall (Macro Avg) ===
Precision: 0.34643808610400684
Recall: 0.36406859195087726
F1: 0.3458654058052186
Accuracy: 0.34748010610079577

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.3416666666666667
  Recall:    0.5774647887323944
  F1:        0.4293193717277487
Happy:
  Precision: 0.40217391304347827
  Recall:  

