In [19]:
import os
import re

# --- Tokenizer ---
word_tokenize_pattern = re.compile(r"(?u)\b\w\w+\b")
def word_tokenize(s):
    """Return the lowercased tokens of ``s``.

    A token is any match of ``word_tokenize_pattern``: a run of two or more
    word characters delimited by word boundaries. Single-character words are
    therefore dropped.
    """
    tokens = []
    for match in word_tokenize_pattern.finditer(s):
        tokens.append(match.group(0).lower())
    return tokens


# --- Metrics / print_results ---
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def print_results(gold_labels, predicted_labels):
    """Print macro-averaged precision, recall and F1 plus accuracy.

    Args:
        gold_labels: reference labels, one per example.
        predicted_labels: predicted labels, aligned with ``gold_labels``.

    Prints four "<name>:  <value>" lines followed by one empty line and
    returns nothing.
    """
    precision, recall, f1, _support = precision_recall_fscore_support(
        gold_labels,
        predicted_labels,
        average='macro',
        zero_division=0,
    )
    accuracy = accuracy_score(gold_labels, predicted_labels)

    report = (
        ("Precision: ", precision),
        ("Recall: ", recall),
        ("F1: ", f1),
        ("Accuracy: ", accuracy),
    )
    for heading, value in report:
        print(heading, value)
    print()



# Root folder of the dataset. get_lyrics_and_labels joins it with an emotion
# name and a split name to locate the lyric files.
DATASET_DIR = "NJU_MusicMood_v1.0"   
# Emotion class names. Each one is used both as a sub-folder name under
# DATASET_DIR and as the label string for every lyric read from that folder.
EMOTIONS = ["Angry", "Happy", "Relaxed", "Sad"]  


# --- Get Lyrics
def get_lyrics(path):
    """Return the entire contents of the UTF-8 text file at ``path``."""
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
        
# --- Get lyrics and their emotion labels for one split
def get_lyrics_and_labels(split: str):
    """Load every non-empty lyric file of one split together with its label.

    For each emotion in ``EMOTIONS`` the folder
    ``DATASET_DIR/<emotion>/<split>`` is scanned. Emotions whose folder does
    not exist are skipped. Only ``.txt`` files are read, and a file named
    ``info.txt`` (any letter case) is ignored. Files whose content is empty or
    whitespace-only are dropped.

    Args:
        split: name of the split sub-folder (the notebook passes "Train" and
            "Test").

    Returns:
        ``(texts, labels)``: two lists of equal length, where ``labels[i]`` is
        the emotion name of the folder ``texts[i]`` was read from. Entries are
        ordered by emotion (in ``EMOTIONS`` order) and then by file name.
    """
    texts, labels = [], []
    for emotion in EMOTIONS:
        folder = os.path.join(DATASET_DIR, emotion, split)
        if not os.path.isdir(folder):
            continue
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order identical on every machine and every run.
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(".txt"):
                continue
            if fname.lower() == "info.txt":
                continue
            path = os.path.join(folder, fname)
            txt = get_lyrics(path)
            if txt.strip():
                texts.append(txt)
                labels.append(emotion)
    return texts, labels

# Training and testing
# Training and testing
# The "Train" split is used for fitting; the "Test" split is loaded into the
# dev_* variables and is what every model below is evaluated on.
train_texts, train_labels = get_lyrics_and_labels("Train")
dev_texts, dev_labels     = get_lyrics_and_labels("Test")

# Sanity checks: texts and labels must stay parallel lists.
assert len(train_texts) == len(train_labels)
assert len(dev_texts) == len(dev_labels)

# --- Bag-of-words + logistic regression baseline (copied from A2)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Bag-of-words features: word_tokenize is passed as the analyzer, so the
# vectorizer counts exactly the tokens that word_tokenize produces.
count_vectorizer = CountVectorizer(analyzer=word_tokenize)

# Fit the vectorizer on the training lyrics only, then reuse it for dev.
train_counts = count_vectorizer.fit_transform(train_texts)
dev_counts   = count_vectorizer.transform(dev_texts)

# Logistic regression on the count features; random_state is fixed at 0.
lr = LogisticRegression(max_iter=500, random_state=0)
lr_classifier = lr.fit(train_counts, train_labels)

lr_dev_predictions = lr_classifier.predict(dev_counts)

# Print results on the dev set
print_results(dev_labels, lr_dev_predictions)


Precision:  0.40222447226313207
Recall:  0.39598406425813987
F1:  0.3975455976171156
Accuracy:  0.3952254641909814



In [20]:
import gensim.downloader
# Load the 'word2vec-google-news-300' vectors through gensim's downloader.
# NOTE: a later cell rebinds the name `model` to the DistilGPT-2 classifier.
# vec_for_doc looks `model` up at call time, so it has to be used before that
# cell runs (or after re-running this one).
model = gensim.downloader.load('word2vec-google-news-300')

def vec_for_doc(tokenized_doc, keyed_vectors=None):
    """Average the embeddings of the in-vocabulary tokens of one document.

    Args:
        tokenized_doc: iterable of token strings.
        keyed_vectors: optional embedding lookup exposing ``vector_size``,
            ``key_to_index`` and ``[token]`` access. When omitted, the
            notebook-level ``model`` is used, looked up at call time. Pass the
            vectors explicitly if ``model`` has since been rebound to
            something else (a later cell reuses that name).

    Returns:
        A list of ``vector_size`` numbers: the element-wise mean of the
        vectors of all tokens found in ``key_to_index``. Tokens are counted
        once per occurrence. If no token is found, a list of zeros.
    """
    vectors = model if keyed_vectors is None else keyed_vectors
    vector_size = vectors.vector_size

    available_vectors = [
        vectors[token]
        for token in tokenized_doc
        if token in vectors.key_to_index
    ]
    if not available_vectors:
        return [0.0] * vector_size

    # Accumulate into a dedicated list rather than aliasing a shared "empty"
    # vector, so the zero vector and the running sum can never be confused.
    summed_vector = [0.0] * vector_size
    for vec in available_vectors:
        for i in range(vector_size):
            summed_vector[i] += vec[i]

    num_words = len(available_vectors)
    return [val / num_words for val in summed_vector]

In [21]:
word_tokenize_pattern = re.compile(r"(?u)\b\w\w+\b")
def word_tokenize(s, apply_case_folding=True):
    """Split ``s`` into tokens of two or more word characters.

    Args:
        s: text to tokenize.
        apply_case_folding: when true (the default) every token is
            lowercased; when false tokens keep their original casing.

    Returns:
        List of token strings in order of appearance.
    """
    tokens = word_tokenize_pattern.findall(s)
    if apply_case_folding:
        return [token.lower() for token in tokens]
    return tokens

# Represent every lyric as a single averaged embedding vector: tokenize it,
# then average the vectors of its tokens with vec_for_doc.
train_vecs = [vec_for_doc(word_tokenize(x)) for x in train_texts]
dev_vecs = [vec_for_doc(word_tokenize(x)) for x in dev_texts]

# Train logistic regression, same as A2
# (LogisticRegression is imported in the bag-of-words cell above.)
lr = LogisticRegression(max_iter=500,
                        random_state=0)
clf = lr.fit(train_vecs, train_labels)
dev_predictions = clf.predict(dev_vecs)

# Report macro precision/recall/F1 and accuracy on the dev set.
print_results(dev_labels, dev_predictions)

Precision:  0.4596175291565
Recall:  0.47568620468212114
F1:  0.45246239781207337
Accuracy:  0.4509283819628647



In [23]:
# Benchmark 1 - Fine-Tune DistilGPT-2 for Emotion
from datasets import Dataset

# Map each emotion name to an integer id given by its position in EMOTIONS,
# and build the inverse map for turning predicted ids back into names.
label2id = {label: i for i, label in enumerate(EMOTIONS)}
id2label = {i: label for label, i in label2id.items()}

# Training set with a "text" column (raw lyrics) and a "label" column
# (integer ids from label2id).
train_ds = Dataset.from_dict({
    "text": train_texts,
    "label": [label2id[l] for l in train_labels]
})

# Dev set built the same way from the dev lyrics and labels.
dev_ds = Dataset.from_dict({
    "text": dev_texts,
    "label": [label2id[l] for l in dev_labels]
})


In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name used for both the tokenizer and the classifier.
model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token
# (presumably this tokenizer defines no pad token of its own -- TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

# Sequence-classification model with one output per emotion; the label maps
# are passed along so ids and emotion names can be translated both ways.
# NOTE: this rebinds `model`, the name an earlier cell used for the word2vec
# vectors that vec_for_doc reads.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(EMOTIONS),
    label2id=label2id,
    id2label=id2label
)
# Tell the model which token id is padding, matching the tokenizer setting.
model.config.pad_token_id = tokenizer.eos_token_id


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenize(batch):
    """Tokenize the "text" field of ``batch`` with the notebook's tokenizer.

    Every sequence is truncated and padded to a fixed length of 256 tokens
    (a longer limit such as 512 could be tried later). Returns whatever the
    tokenizer call returns.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize both datasets in batches.
# NOTE(review): train_ds / dev_ds are overwritten in place. Re-running this
# cell without first re-creating them would call tokenize on datasets whose
# "text" column has already been removed below.
train_ds = train_ds.map(tokenize, batched=True)
dev_ds   = dev_ds.map(tokenize, batched=True)

# Drop the raw lyrics once they have been tokenized.
train_ds = train_ds.remove_columns(["text"])
dev_ds   = dev_ds.remove_columns(["text"])

# Have the datasets return torch-formatted values.
train_ds.set_format(type="torch")
dev_ds.set_format(type="torch")


Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:00<00:00, 1957.24 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 377/377 [00:00<00:00, 4142.64 examples/s]


In [14]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(eval_pred):
    """Turn ``(logits, labels)`` into macro precision/recall/F1 and accuracy.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` over its last axis.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_ids, average="macro", zero_division=0
    )
    metric_names = ("precision", "recall", "f1", "accuracy")
    metric_values = (precision, recall, f1, accuracy_score(labels, predicted_ids))
    return dict(zip(metric_names, metric_values))

# Fine-tuning configuration: 5 epochs, batch size 4 for both training and
# evaluation, learning rate 5e-5, weight decay 0.01. The training loss is
# logged every 10 steps and a checkpoint is saved each epoch under output_dir.
training_args = TrainingArguments(
    output_dir="./distilgpt2_output",
    # Per-epoch evaluation is left disabled here; metrics are computed by an
    # explicit trainer.evaluate() call after training.
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10,
    report_to="none",  # disable wandb etc
    dataloader_pin_memory=False

)


In [15]:
# Wire the classifier, the training arguments, the tokenized datasets and
# the metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    compute_metrics=compute_metrics
)

# Run fine-tuning. As the last expression of the cell, the value returned by
# train() is displayed as the cell output.
trainer.train()


Step,Training Loss
10,0.1675
20,0.0378
30,0.1615
40,0.5951
50,0.1746
60,0.0659
70,0.1582
80,0.1926
90,0.0262
100,0.1654


TrainOutput(global_step=500, training_loss=0.08047679496649653, metrics={'train_runtime': 211.9495, 'train_samples_per_second': 9.436, 'train_steps_per_second': 2.359, 'total_flos': 130657812480000.0, 'train_loss': 0.08047679496649653, 'epoch': 5.0})

In [18]:
# Evaluate on the eval dataset given to the Trainer (dev_ds); the returned
# dict includes the compute_metrics values.
results = trainer.evaluate()
print("DistilGPT-2 dev results:", results)

# Run prediction on dev_ds again to get the raw logits, then take the argmax
# over the last axis as the predicted class id.
raw_pred = trainer.predict(dev_ds)
logits = raw_pred.predictions
pred_ids = logits.argmax(axis=-1)

# Convert ids back to emotion names so the same print_results helper used
# for the other models can be applied.
distilgpt2_dev_predictions = [id2label[i] for i in pred_ids]
distilgpt2_dev_gold = dev_labels  # already string labels

print("DistilGPT-2 classification report:")
print_results(distilgpt2_dev_gold, distilgpt2_dev_predictions)


DistilGPT-2 dev results: {'eval_loss': 3.6030869483947754, 'eval_precision': 0.43744864493461966, 'eval_recall': 0.4627188303470262, 'eval_f1': 0.44662185643877167, 'eval_accuracy': 0.4376657824933687, 'eval_runtime': 12.1209, 'eval_samples_per_second': 31.103, 'eval_steps_per_second': 7.838, 'epoch': 5.0}
DistilGPT-2 classification report:
Precision:  0.43744864493461966
Recall:  0.4627188303470262
F1:  0.44662185643877167
Accuracy:  0.4376657824933687

