Install requirements and import dataset

In [8]:
pip install torch transformers datasets scikit-learn faiss-cpu pandas tqdm



In [9]:
from datasets import load_dataset

dataset = load_dataset("go_emotions")
print(dataset["train"][0])


{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [27], 'id': 'eebbqej'}


In [10]:
from transformers import AutoTokenizer
from datasets import Sequence, Value

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
label_names = dataset['train'].features['labels'].feature.names

num_labels = len(label_names)



def tokenize_and_format(example):
    tokens = tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

    label_vec = [0.0] * num_labels
    for idx in example["labels"]:
          label_vec[idx] = 1.0

    tokens["labels"] = list(map(float, label_vec))
    return tokens


encoded_dataset = dataset.map(tokenize_and_format, batched=False)


encoded_dataset = encoded_dataset.cast_column("labels", Sequence(Value("float32")))


print(encoded_dataset["train"][0]["labels"])
print(type(encoded_dataset["train"][0]["labels"][0]))


Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5426 [00:00<?, ? examples/s]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
<class 'float'>


Finetuning distill bert

In [11]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
import torch

def compute_metrics(pred):
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs >= 0.5).astype(int)

    return {
        "micro/f1": f1_score(labels, preds, average="micro", zero_division=0),
        "macro/f1": f1_score(labels, preds, average="macro", zero_division=0),
        "micro/precision": precision_score(labels, preds, average="micro", zero_division=0),
        "micro/recall": recall_score(labels, preds, average="micro", zero_division=0),
    }


In [13]:
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification"
)



training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=8,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marokiaraj-roshan[0m ([33marokiaraj-roshan-georgia-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Micro/f1,Macro/f1,Micro/precision,Micro/recall
1,0.0929,0.088111,0.553776,0.360789,0.700216,0.457994
2,0.078,0.083566,0.579175,0.430772,0.700067,0.493887
3,0.0605,0.087789,0.572421,0.453385,0.662477,0.503918
4,0.0475,0.098557,0.561761,0.468812,0.61938,0.51395
5,0.0329,0.110791,0.562136,0.477443,0.588811,0.537774
6,0.0224,0.120749,0.569742,0.470208,0.588442,0.552194
7,0.0167,0.128503,0.56669,0.486516,0.571065,0.562382
8,0.0124,0.132278,0.568555,0.482455,0.570033,0.567085


TrainOutput(global_step=21712, training_loss=0.04839738338153505, metrics={'train_runtime': 2215.4209, 'train_samples_per_second': 156.756, 'train_steps_per_second': 9.8, 'total_flos': 2.284863444688896e+16, 'train_loss': 0.04839738338153505, 'epoch': 8.0})

Validate

In [14]:
from sklearn.metrics import classification_report
import torch
import numpy as np

def evaluate_on_dataset(trainer, dataset, label_names, threshold=0.5):
    preds_output = trainer.predict(dataset)
    probs = torch.sigmoid(torch.tensor(preds_output.predictions)).numpy()
    preds = (probs >= threshold).astype(int)
    labels = preds_output.label_ids

    print("Classification Report:")
    print(classification_report(labels, preds, target_names=label_names, zero_division=0))

evaluate_on_dataset(trainer, encoded_dataset["validation"], label_names, threshold=0.3)


Classification Report:
                precision    recall  f1-score   support

    admiration       0.66      0.78      0.72       488
     amusement       0.70      0.86      0.77       303
         anger       0.48      0.50      0.49       195
     annoyance       0.30      0.45      0.36       303
      approval       0.33      0.37      0.35       397
        caring       0.42      0.52      0.46       153
     confusion       0.36      0.43      0.39       152
     curiosity       0.44      0.62      0.51       248
        desire       0.42      0.51      0.46        77
disappointment       0.32      0.36      0.34       163
   disapproval       0.38      0.40      0.39       292
       disgust       0.41      0.52      0.45        97
 embarrassment       0.44      0.49      0.46        35
    excitement       0.28      0.30      0.29        96
          fear       0.69      0.59      0.63        90
     gratitude       0.88      0.91      0.89       358
         grief       0.0

In [15]:
  import torch

  def predict_emotions(trainer, text, tokenizer, label_names, threshold=0.3):
      inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

      device = trainer.model.device
      inputs = {key: value.to(device) for key, value in inputs.items()}
      with torch.no_grad():
          logits = trainer.model(**inputs).logits
      probs = torch.sigmoid(logits).squeeze().cpu().numpy()
      preds = (probs >= threshold).astype(int)
      print(f"Text: {text}")
      print("Label Predictions:")
      for label_name, probability, pred in zip(label_names, probs, preds):
          print(f"  • {label_name}: probability={probability:.3f}, predicted={pred}")

      return probs, preds


  input_txt = "She broke up with me."
  probs, preds = predict_emotions(trainer, input_txt, tokenizer, label_names)
  print(probs)
  print(preds)

Text: She broke up with me.
Label Predictions:
  • admiration: probability=0.000, predicted=0
  • amusement: probability=0.003, predicted=0
  • anger: probability=0.010, predicted=0
  • annoyance: probability=0.002, predicted=0
  • approval: probability=0.003, predicted=0
  • caring: probability=0.001, predicted=0
  • confusion: probability=0.000, predicted=0
  • curiosity: probability=0.000, predicted=0
  • desire: probability=0.001, predicted=0
  • disappointment: probability=0.002, predicted=0
  • disapproval: probability=0.001, predicted=0
  • disgust: probability=0.001, predicted=0
  • embarrassment: probability=0.000, predicted=0
  • excitement: probability=0.000, predicted=0
  • fear: probability=0.001, predicted=0
  • gratitude: probability=0.000, predicted=0
  • grief: probability=0.054, predicted=0
  • joy: probability=0.002, predicted=0
  • love: probability=0.002, predicted=0
  • nervousness: probability=0.001, predicted=0
  • optimism: probability=0.001, predicted=0
  • pr

In [16]:
trainer.save_model("./my_saved_model")


In [50]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load fine-tuned model
model_path = "./my_saved_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Emotion classification pipeline
emotion_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)


Device set to use cuda:0


In [51]:
import pandas as pd
import numpy as np
import faiss
import json
from tqdm import tqdm
from sklearn.preprocessing import normalize

df = pd.read_csv("spotify_millsongdata.csv")

def get_emotion_vector(lyrics):
    try:
        result = emotion_pipeline(lyrics[:512])
        return [s['score'] for s in result[0]]
    except:
        return [0.0] * model.config.num_labels

tqdm.pandas()
df['emotion_vector'] = df['text'].progress_apply(get_emotion_vector)
df['emotion_vector'] = df['emotion_vector'].apply(lambda x: normalize([x])[0])
emotion_matrix = np.vstack(df['emotion_vector'].values).astype('float32')
index = faiss.IndexFlatL2(emotion_matrix.shape[1])
index.add(emotion_matrix)
metadata = df[['artist', 'song', 'link']].to_dict('records')
faiss.write_index(index, "emotion_index.faiss")
with open("emotion_metadata.jsonl", "w") as f:
    for record in metadata:
        f.write(json.dumps(record) + "\n")

100%|██████████| 57650/57650 [08:45<00:00, 109.76it/s]


In [52]:
index = faiss.read_index("emotion_index.faiss")
with open("emotion_metadata.jsonl", "r") as f:
    metadata = [json.loads(line) for line in f]

def get_user_emotion_vector(text):
    result = emotion_pipeline(text[:512])
    vec = np.array([s['score'] for s in result[0]], dtype=np.float32)
    return normalize([vec])[0]

def recommend_songs_from_emotion(input_text, k):
    user_vec = get_user_emotion_vector(input_text)
    D, I = index.search(np.array([user_vec], dtype='float32'), k)
    return [metadata[i] for i in I[0]]

In [58]:
from operator import ge
input_text = "I am feeling lonely and sad"
print("Input text: ", input_text)
print('User emotion vector: ', get_user_emotion_vector(input_text))
top_k_songs = recommend_songs_from_emotion(input_text, k=5) #top 5
print("Model recommendations:")
for song in top_k_songs:
    print(f"{song['artist']} - {song['song']}")


Input text:  I am feeling lonely and sad
User emotion vector:  [7.56489931e-04 3.18489260e-03 3.20012545e-03 3.13403467e-03
 4.70045894e-04 1.70166086e-03 2.09417699e-03 3.27906489e-03
 1.95309343e-03 2.85906417e-02 6.83688599e-04 2.06160471e-03
 1.26426074e-03 9.00112760e-04 2.54524501e-03 6.61055557e-04
 1.66011296e-02 2.25029859e-03 1.00298428e-03 1.04666535e-02
 1.00110788e-03 5.17650235e-05 1.18957229e-03 2.40984737e-04
 2.30497563e-03 9.99352496e-01 8.17966074e-04 3.25534318e-03]
Model recommendations:
Eric Clapton - I Looked Away
Marianne Faithfull - Lonesome Traveller
Elvis Presley - Heartbreak Hotel
Status Quo - I Fought The Law
Bob Rivers - 12 Pains Of Christmas


In [54]:
#baseline cos sim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = tfidf.fit_transform(df['text'])

def recommend_tfidf_lyrics(input_text, k):
    input_vec = tfidf.transform([input_text])
    similarities = cosine_similarity(input_vec, tfidf_matrix).flatten()
    top_indices = similarities.argsort()[::-1][:k]
    return df.iloc[top_indices][['artist', 'song', 'link']].to_dict('records')


In [59]:
# baseline output
user_input = "I am feeling lonely and sad"
print("Input text: ", input_text)
print('User emotion vector: ', get_user_emotion_vector(input_text))
tfidf_results = recommend_tfidf_lyrics(user_input, k=5)
print("TF-IDF recommendations:")
for song in tfidf_results:
    print(f"{song['artist']} - {song['song']}")

Input text:  I am feeling lonely and sad
User emotion vector:  [7.56489931e-04 3.18489260e-03 3.20012545e-03 3.13403467e-03
 4.70045894e-04 1.70166086e-03 2.09417699e-03 3.27906489e-03
 1.95309343e-03 2.85906417e-02 6.83688599e-04 2.06160471e-03
 1.26426074e-03 9.00112760e-04 2.54524501e-03 6.61055557e-04
 1.66011296e-02 2.25029859e-03 1.00298428e-03 1.04666535e-02
 1.00110788e-03 5.17650235e-05 1.18957229e-03 2.40984737e-04
 2.30497563e-03 9.99352496e-01 8.17966074e-04 3.25534318e-03]
TF-IDF recommendations:
Ocean Colour Scene - So Sad - The Riverboat Song B - Side
Tom Waits - Lonely
Roy Orbison - Only Alive
Ne-Yo - Lonely Again
Eddie Cochran - Lonely
