In [26]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import scipy.stats as st

In [27]:
DATA_DIR = "data"

# Load data

In [28]:
# lyrics data path
lyrics_path = os.path.join(DATA_DIR, "lyrics-data.csv")
df_lyrics = pd.read_csv(lyrics_path)

# artists data path
artists_path = os.path.join(DATA_DIR, "artists-data.csv")
df_artists = pd.read_csv(artists_path)

## Remove non english songs

In [29]:
# Filter for language en = english
df_lyrics_en = df_lyrics[df_lyrics["language"] == "en"]

## Remove songs with less than 70 and more than 1000 words

In [30]:
# Split each string column and count length
df_lyrics_en = df_lyrics_en[
    df_lyrics_en["Lyric"].apply(lambda lyric: (len(lyric.split()) >= 70) & (len(lyric.split()) <= 1000))
]

## Merge songs and artists

In [31]:
# Merge artists and lyrics
df_songs = pd.merge(
    df_lyrics_en, 
    df_artists, 
    how="inner",
    left_on="ALink",
    right_on="Link"
)

# Remove duplicate columns
del df_songs["Link"]

## Remove columns with duplicate or redundant information

In [32]:
del df_songs["ALink"]
del df_songs["language"]
del df_songs["Songs"]
del df_songs["Popularity"]

## Remove duplicate rows

In [33]:
df_songs_no_duplicates = df_songs.drop_duplicates(subset=['SLink'])

## Add labels from Last.fm

In [34]:
tags_df= pd.read_csv('./data/tags.csv')

In [35]:
df = pd.merge(df_songs_no_duplicates, tags_df,  how='left', left_on=["Artist","SName"], right_on = ["Artist","SName"])

In [36]:
df = df[(df["Tags"] != "NoTagsFound") & (df["Tags"] != "NoSongFound")]

In [37]:
df.head()

Unnamed: 0,SName,SLink,Lyric,Artist,Genres,Tags
3,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",Ivete Sangalo,Pop; Axé; Romântico,"pop, female vocalists, rnb, hot, spanish, soul..."
5,Human Nature,/ivete-sangalo/human-nature.html,Looking out\nAcross the night time\nThe city w...,Ivete Sangalo,Pop; Axé; Romântico,"pop, rock, female vocalists, dance, latin, rnb..."
9,Natural Collie,/ivete-sangalo/natural-collie.html,Been down in the valley\nSmoking natural colli...,Ivete Sangalo,Pop; Axé; Romântico,"spanish, electronic, female, jazz, hip hop, po..."
10,Where It Begins (feat. Nelly Furtado),/ivete-sangalo/where-it-begins-feat-nelly-furt...,"When you're alone and you don't know how,\nTo ...",Ivete Sangalo,Pop; Axé; Romântico,"pop, singer-songwriter, rnb, Nelly Furtado, sp..."
15,Lazy Groove,/claudia-leitte/lazy-groove.html,Are you ready to dance?\nTo make your body cra...,Claudia Leitte,Pop; Axé; Romântico,"dance, cool, 2013, axe music, claudia leitte"


## Mood ground truth

In [38]:
MOOD_CATEGORIES = {
    'calm': ['calm', 'comfort', 'quiet', 'serene', 'mellow', 'chill out'],
    'sad': ['sadness', 'unhappy', 'melancholic', 'melancholy'],
    'happy': ['happy', 'happiness', 'happy songs', 'happy music'],
    'romantic': ['romantic', 'romantic music'],
    'upbeat': ['upbeat', 'gleeful', 'high spirits', 'zest', 'enthusiastic'],
    'depressed': ['depressed', 'blue', 'dark', 'depressive', 'dreary'],
    'anger': ['anger', 'angry', 'choleric', 'fury', 'outraged', 'rage'],
    'grief': ['grief', 'heartbreak', 'mournful', 'sorrow', 'sorry'],
    'dreamy': ['dreamy'],
    'cheerful': ['cheerful', 'cheer up', 'festive', 'jolly', 'jovial', 'merry'],
    'brooding': ['brooding', 'contemplative', 'meditative', 'reflective'],
    'aggression': ['aggression', 'aggressive'],
    'confident': ['confident', 'encouraging', 'encouragement', 'optimism'],
    'angst': ['angst', 'anxiety', 'anxious', 'jumpy', 'nervous', 'angsty'],
    'earnest': ['earnest', 'heartfelt'],
    'desire': ['desire', 'hope', 'hopeful', 'mood: hopeful'],
    'pessimism': ['pessimism', 'cynical', 'pessimistic', 'weltschmerz'],
    'excitement': ['excitement', 'exciting', 'exhilarating', 'thrill', 'ardor']
}

In [39]:
def get_mood_from_tags(tags):
    """Get mood from tags. If tags does not contain tags mapped to moods then mood is None.

    :param tags: tags of the song.
    :ptype: String
    
    :return: Mood for given tags.
    :rtype: String or None
    """
    
    moods = list(MOOD_CATEGORIES.keys())
    
    # initialize mood_keyword_count_dict
    mood_keyword_count_dict = dict(zip(moods, [0]*len(MOOD_CATEGORIES)))
    
    # create tags list
    if isinstance(tags, str) and tags not in ["NoTagsFound", "NoSongFound"]:
        tags = tags.split(", ")
    else:
        # if no tags available then return None 
        return None

    # compute mood count for all moods based on if tags contains tags mapped to these moods
    for tag in tags:
        for mood in moods:
            # TODO: Advance to check also if keyword is contained in the tag (to get more moods assigned)
            if tag in MOOD_CATEGORIES[mood]:
                mood_keyword_count_dict[mood] += 1
                break
            
    # compute the assigned mood (if no mood assigned return None) 
    mood_counts = list(mood_keyword_count_dict.values())
    if max(mood_counts) > 0:
        assigned_mood_idx = np.argmax(mood_counts)
        return list(mood_keyword_count_dict.keys())[assigned_mood_idx]
    else:
        return None

In [47]:
# create mood column to df
df["Mood"] = df["Tags"].apply(get_mood_from_tags)
df = df[df["Mood"].notnull()]

In [48]:
df.head(10)

Unnamed: 0,SName,SLink,Lyric,Artist,Genres,Tags,Mood
17,Signs,/claudia-leitte/signs.html,i'm lying alone on the floor\nwith a feather i...,Claudia Leitte,Pop; Axé; Romântico,"claudia leitte, Soundtrack, pop, romantic, bra...",romantic
24,I Miss Her,/olodum/i-miss-her.html,oh lord\ni'd like to know where she is now\nif...,Olodum,Axé,"brasil, Axe, brazil, carnaval, good for dancin...",happy
38,Halo,/beyonce/halo.html,remember those walls i built\nwell baby they'r...,Beyoncé,Pop; R&B; Black Music,"pop, rnb, beyonce, female vocalists, soul, Hal...",sad
40,If I Were A Boy,/beyonce/if-i-were-a-boy.html,if i were a boy\neven just for a day\ni'd roll...,Beyoncé,Pop; R&B; Black Music,"beyonce, rnb, pop, soul, female vocalist, fema...",grief
41,Love On Top,/beyonce/love-on-top.html,"bring the beat in!\n\nhoney, honey\ni can see ...",Beyoncé,Pop; R&B; Black Music,"soul, rnb, beyonce, pop, female vocalists, gro...",happy
43,Irreplaceable,/beyonce/irreplaceable.html,to the left..\nto the left..\n\nto the left to...,Beyoncé,Pop; R&B; Black Music,"rnb, beyonce, pop, female vocalists, soul, irr...",romantic
46,Run The World,/beyonce/run-the-world-girls.html,"girls, we run this motha!\ngirls, we run this ...",Beyoncé,Pop; R&B; Black Music,"rnb, dance, beyonce, pop, powerful, female voc...",upbeat
47,Listen,/beyonce/listen.html,"listen,\nto the song here in my heart\na melod...",Beyoncé,Pop; R&B; Black Music,"rnb, soul, Soundtrack, female vocalists, beyon...",grief
53,Single Ladies,/beyonce/single-ladies-put-a-ring-on-it.html,all the single ladies \nall the single ladies ...,Beyoncé,Pop; R&B; Black Music,"beyonce, pop, rnb, dance, female vocalists, Hi...",happy
54,Hello,/beyonce/hello.html,oh!\noooooh\ni love to see you walking into th...,Beyoncé,Pop; R&B; Black Music,"beyonce, rnb, pop, female vocalists, soul, Lov...",earnest


In [51]:
print(f"Number of songs with mood: {len(df)}")

Number of songs with mood: 27705


## Clean keywords from Lyrics
Some lyrics contain words like "verse1" or "chorus" instead of the full text, if there is repetition for example. 
In this step we will define those keywords (manually extracted from data) and remove them. 

In [41]:
# Lowercase
df["Lyric"] = df["Lyric"].str.lower()

In [42]:
df[df["Lyric"].str.contains("verse")]["Lyric"].tolist()[0]

"come take my hand\ni won't let you go\ni'll be your friend\ni will love you so deeply\ni will be the one to kiss you at night\ni will love you until the end of time\n\ni will be your baby\npromise not to let you go\nlove you like crazy\nsay you'll never let me go\n(4x)\n\ntake you away from here\nthere's nothing between us but space and time\ni'll be your own little star\ni'll be shining your world\nin your own little universe\ni'll be your girl\n\ncome take my hand\ni won't let you go\ni'll be your friend\ni will love you so deeply\ni will be the one to kiss you at night\ni will love you until the end of time\n\ni will be your baby\npromise not to let you go\nlove you like crazy\nsay you'll never let me go\n(4x)\n\nbaby, come on\nget up on this\nshow me that you really want it\ni wanna be the one to love you\nbaby lets go (let's go)\n\ni wanna provide\nthis loving that you're giving\ni ain't frontin' on this love\ncan you let me love you from your head to toe\nbaby let's go\n\nboy co

In [43]:
# TODO: Further keyword exploration what to filter (verse, chorus, <artist name>, <things in specific paranthesis>) 
# TODO: Check how much remains of verse, chorus, ... if I directly remove "[]"
# TODO: Check other papers implementations (from kaggle dataset) for further preprocessing

# Check for individual keywords
import re

keyword = "chorus" # chorus, verse
lyrics_with_keyword = df[df["Lyric"].str.contains(keyword)]["Lyric"].tolist()

cutted_lyrics_with_keyword = []
for lyrics in lyrics_with_keyword:
    
    keyword_positions = [m.start() for m in re.finditer(keyword, lyrics)]
    for keyword_position in keyword_positions:
        start_idx = max(0, keyword_position - 10)
        end_idx = min(len(lyrics) - 1, keyword_position + 10)
        cutted_lyrics_with_keyword.append(lyrics[start_idx : end_idx])
        
for cutted_lyrics_with_keyword_element in cutted_lyrics_with_keyword:
    print(repr(cutted_lyrics_with_keyword_element))

# TODO: What to filter out?
# - [...]
# \nverse/chorus <number>\n

'ten....\n\n[chorus]\nli'
'sten...\n\n[chorus]\n\ni'
'[chorus]\nwh'
'et low]\n\n[chorus]\nwh'
', whoa]\n\n[chorus]\nwh'
'back up\n\n[chorus]\nwh'
' creole\n\n[chorus - a'
'elicacy\n\n[chorus - b'
' creole\n\n[chorus - a'
' creole\n\n[chorus - b'
'um dress\n\nchorus\nooh'
'um dress\n\nchorus\nooh'
'um dress\n\nchorus\nooh'
'you lied\n\nchorus:\ni '
'you lied\n\nchorus:\ni '
'you lied\n\nchorus:\ni '
'e to me\n\n[chorus]\nse'
'e to me\n\n[chorus]\nse'
'rth\n\n[pre-chorus]\nan'
' belong\n\n[chorus]\ni '
'old\n\n[pre-chorus]\nan'
' belong\n\n[chorus]\ni '
'rth\n\n[pre-chorus]\nan'
' belong\n\n[chorus]\ni '
"e to go.\n\nchorus:\n'c"
"he same.\n\nchorus:\n'c"
"d to go.\n\nchorus:\n'c"
'o it all\n\nchorus:\nle'
' perform\n\nchorus\nlet'
'eah yeah\n\nchorus\nlet'
'e begin\n\n[chorus - b'
'e begin\n\n[chorus]\nit'
'\nyeah..\n\n[chorus]\nit'
'r pride\n\n[chorus]\nwa'
'e night\n\n[chorus]\nwa'
'ight...\n\n[chorus]\nwa'
'o throw\n\n{chorus}\n\ni'
'reaking\n\n{chorus}\n\na'
'er case\n\n[chorus]\nea

In [44]:
# TODO: Advance the regex | Be careful with words containing the keywords like uni-verse
# df = df.replace("verse|chorus", "", regex=True)
df = df.replace("\[[.]*\]", "", regex=True)
df = df.replace("\[[\- \s a-z A-Z 0-9 :]*\]", "", regex=True) # TODO: Adjust -> Still does not work
df = df.replace("\([\- \s a-z A-Z 0-9]*\)", "", regex=True)
df = df.replace("\nchorus\n", "", regex=True)
df = df.replace("::chorus::", "", regex=True)
df = df.replace("\[chorus", "", regex=True)
df = df.replace("\nchorus", "", regex=True)
df = df.replace("pre[-]?chorus", "", regex=True)
df = df.replace("\(chorus:\)", "", regex=True)
df = df.replace("\{chorus\}", "", regex=True)
df = df.replace("\nrepeat chorus\n", "", regex=True)

# TODO: Remove: [50 cent - chorus (+kidd kidd)], (chorus: akon), {*starts singing chorus in background*} <-- all sorts of paranthesis
# TODO: Remove: 2x?
# TODO: Remove all artists names

In [45]:
# TODO: Adjust like above

# Check if keywords were removed correctly
import re

keyword = "chorus" # chorus, verse
lyrics_with_keyword = df[df["Lyric"].str.contains(keyword)]["Lyric"].tolist()

cutted_lyrics_with_keyword = []
window_length = 40
for lyrics in lyrics_with_keyword:
    
    keyword_positions = [m.start() for m in re.finditer(keyword, lyrics)]
    for keyword_position in keyword_positions:
        start_idx = max(0, keyword_position - window_length)
        end_idx = min(len(lyrics) - 1, keyword_position + window_length)
        cutted_lyrics_with_keyword.append(lyrics[start_idx : end_idx])
        
for cutted_lyrics_with_keyword_element in cutted_lyrics_with_keyword:
    print(repr(cutted_lyrics_with_keyword_element))

"'s not over, just another again\n\nsecond chorus\n(and again, and again, and again)"
"vin' care\ni'll take you there\n\n\n(repeat chorus - ad-lib/fade-out"
'chorus:\n\nchildren, groovin, women, confu'
"makes me feel this way\n(take em' to the chorus!)\n\ncome here girl!\n(go 'head be g"
"akes me feel this way\n\n(take em' to the chorus!)\n\ncome here girl!\n\n(go 'head be "
" me, and that's a fact\n(take em' to the chorus!)\n\ncome here girl!\n(go 'head be g"
"hen i'm rollin' by\nniggas can't c me\n\n\n(chorus -- g. clinton)\n\n\nthe stares of a "
's down with g-o-d?\n\n\n\ncome and join the chorus\nthe mighty, mighty chorus\nwhich t'
' and join the chorus\nthe mighty, mighty chorus\nwhich the morning stars begun\nthe'
' good good good good good good \n\nrepeat chorus til end'
"chorus: nelly\n\nhmmmmm\ni'm goin down down"
'it,\nless than four bars,\nguru bring the chorus in,\ndid you get the picture yet,\n'
'chorus\npresidents to represent me - nas '
'chorus\n\ni know\ni know\nsome pl

## Save data

In [46]:
song_data_labels_cleaned_path = os.path.join(DATA_DIR, "song-data-labels-cleaned.csv")
df_songs_no_duplicates.to_csv(song_data_labels_cleaned_path, index=False)