In [75]:
import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st

In [76]:
DATA_DIR = "data"

## Load data

In [77]:
# lyrics data path
lyrics_path = os.path.join(DATA_DIR, "lyrics-data.csv")
df_lyrics = pd.read_csv(lyrics_path)

# artists data path
artists_path = os.path.join(DATA_DIR, "artists-data.csv")
df_artists = pd.read_csv(artists_path)

## Remove non english songs

In [78]:
# Filter for language en = english
df_lyrics_en = df_lyrics[df_lyrics["language"] == "en"]

## Remove songs with less than 70 and more than 1000 words

In [79]:
# Split each string column and count length
df_lyrics_en = df_lyrics_en[
    df_lyrics_en["Lyric"].apply(lambda lyric: (len(lyric.split()) >= 70) & (len(lyric.split()) <= 1000))
]

## Merge songs and artists

In [80]:
# Merge artists and lyrics
df_songs = pd.merge(
    df_lyrics_en, 
    df_artists, 
    how="inner",
    left_on="ALink",
    right_on="Link"
)

# Remove duplicate columns
del df_songs["Link"]

## Remove columns with duplicate or redundant information

In [81]:
del df_songs["ALink"]
del df_songs["language"]
del df_songs["Songs"]
del df_songs["Popularity"]

## Remove duplicate rows

In [82]:
df_songs_no_duplicates = df_songs.drop_duplicates(subset=['SLink'])

## Add labels from Last.fm

In [83]:
tags_df= pd.read_csv('./data/tags.csv')

In [84]:
df = pd.merge(df_songs_no_duplicates, tags_df,  how='left', left_on=["Artist","SName"], right_on = ["Artist","SName"])

In [85]:
df = df[(df["Tags"] != "NoTagsFound") & (df["Tags"] != "NoSongFound")]

In [86]:
df.head()

Unnamed: 0,SName,SLink,Lyric,Artist,Genres,Tags
3,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",Ivete Sangalo,Pop; Axé; Romântico,"pop, female vocalists, rnb, hot, spanish, soul..."
5,Human Nature,/ivete-sangalo/human-nature.html,Looking out\nAcross the night time\nThe city w...,Ivete Sangalo,Pop; Axé; Romântico,"pop, rock, female vocalists, dance, latin, rnb..."
9,Natural Collie,/ivete-sangalo/natural-collie.html,Been down in the valley\nSmoking natural colli...,Ivete Sangalo,Pop; Axé; Romântico,"spanish, electronic, female, jazz, hip hop, po..."
10,Where It Begins (feat. Nelly Furtado),/ivete-sangalo/where-it-begins-feat-nelly-furt...,"When you're alone and you don't know how,\nTo ...",Ivete Sangalo,Pop; Axé; Romântico,"pop, singer-songwriter, rnb, Nelly Furtado, sp..."
15,Lazy Groove,/claudia-leitte/lazy-groove.html,Are you ready to dance?\nTo make your body cra...,Claudia Leitte,Pop; Axé; Romântico,"dance, cool, 2013, axe music, claudia leitte"


## Mood ground truth

In [87]:
MOOD_CATEGORIES = {
    'calm': ['calm', 'comfort', 'quiet', 'serene', 'mellow', 'chill out'],
    'sad': ['sadness', 'unhappy', 'melancholic', 'melancholy'],
    'happy': ['happy', 'happiness', 'happy songs', 'happy music'],
    'romantic': ['romantic', 'romantic music'],
    'upbeat': ['upbeat', 'gleeful', 'high spirits', 'zest', 'enthusiastic'],
    'depressed': ['depressed', 'blue', 'dark', 'depressive', 'dreary'],
    'anger': ['anger', 'angry', 'choleric', 'fury', 'outraged', 'rage'],
    'grief': ['grief', 'heartbreak', 'mournful', 'sorrow', 'sorry'],
    'dreamy': ['dreamy'],
    'cheerful': ['cheerful', 'cheer up', 'festive', 'jolly', 'jovial', 'merry'],
    'brooding': ['brooding', 'contemplative', 'meditative', 'reflective'],
    'aggression': ['aggression', 'aggressive'],
    'confident': ['confident', 'encouraging', 'encouragement', 'optimism'],
    'angst': ['angst', 'anxiety', 'anxious', 'jumpy', 'nervous', 'angsty'],
    'earnest': ['earnest', 'heartfelt'],
    'desire': ['desire', 'hope', 'hopeful', 'mood: hopeful'],
    'pessimism': ['pessimism', 'cynical', 'pessimistic', 'weltschmerz'],
    'excitement': ['excitement', 'exciting', 'exhilarating', 'thrill', 'ardor']
}

In [88]:
def get_mood_from_tags(tags):
    """Get mood from tags. If tags does not contain tags mapped to moods then mood is None.

    :param tags: tags of the song.
    :ptype: String
    
    :return: Mood for given tags.
    :rtype: String or None
    """
    
    moods = list(MOOD_CATEGORIES.keys())
    
    # initialize mood_keyword_count_dict
    mood_keyword_count_dict = dict(zip(moods, [0]*len(MOOD_CATEGORIES)))
    
    # create tags list
    if isinstance(tags, str) and tags not in ["NoTagsFound", "NoSongFound"]:
        tags = tags.split(", ")
    else:
        # if no tags available then return None 
        return None

    # compute mood count for all moods based on if tags contains tags mapped to these moods
    for tag in tags:
        for mood in moods:
            # TODO: Advance to check also if keyword is contained in the tag (to get more moods assigned)
            if tag in MOOD_CATEGORIES[mood]:
                mood_keyword_count_dict[mood] += 1
                break
            
    # compute the assigned mood (if no mood assigned return None) 
    mood_counts = list(mood_keyword_count_dict.values())
    if max(mood_counts) > 0:
        assigned_mood_idx = np.argmax(mood_counts)
        return list(mood_keyword_count_dict.keys())[assigned_mood_idx]
    else:
        return None

In [89]:
# create mood column to df
df["Mood"] = df["Tags"].apply(get_mood_from_tags)
df = df[df["Mood"].notnull()]

In [90]:
df.head(10)

Unnamed: 0,SName,SLink,Lyric,Artist,Genres,Tags,Mood
17,Signs,/claudia-leitte/signs.html,I'm lying alone on the floor\nWith a feather I...,Claudia Leitte,Pop; Axé; Romântico,"claudia leitte, Soundtrack, pop, romantic, bra...",romantic
24,I Miss Her,/olodum/i-miss-her.html,Oh lord\nI'd like to know where she is now\nIf...,Olodum,Axé,"brasil, Axe, brazil, carnaval, good for dancin...",happy
38,Halo,/beyonce/halo.html,Remember those walls I built\nWell baby they'r...,Beyoncé,Pop; R&B; Black Music,"pop, rnb, beyonce, female vocalists, soul, Hal...",sad
40,If I Were A Boy,/beyonce/if-i-were-a-boy.html,If I were a Boy\nEven Just for a day\nI'd Roll...,Beyoncé,Pop; R&B; Black Music,"beyonce, rnb, pop, soul, female vocalist, fema...",grief
41,Love On Top,/beyonce/love-on-top.html,"Bring the beat in!\n\nHoney, honey\nI can see ...",Beyoncé,Pop; R&B; Black Music,"soul, rnb, beyonce, pop, female vocalists, gro...",happy
43,Irreplaceable,/beyonce/irreplaceable.html,To the left..\nTo the left..\n\nTo the left to...,Beyoncé,Pop; R&B; Black Music,"rnb, beyonce, pop, female vocalists, soul, irr...",romantic
46,Run The World (Girls),/beyonce/run-the-world-girls.html,"Girls, we run this motha!\nGirls, we run this ...",Beyoncé,Pop; R&B; Black Music,"rnb, dance, beyonce, pop, powerful, female voc...",upbeat
47,Listen,/beyonce/listen.html,"Listen,\nTo the song here in my heart\nA melod...",Beyoncé,Pop; R&B; Black Music,"rnb, soul, Soundtrack, female vocalists, beyon...",grief
53,Single Ladies (Put A Ring On It),/beyonce/single-ladies-put-a-ring-on-it.html,All the single ladies (All the single ladies)\...,Beyoncé,Pop; R&B; Black Music,"beyonce, pop, rnb, dance, female vocalists, Hi...",happy
54,Hello,/beyonce/hello.html,Oh!\nOooooh\nI love to see you walking into th...,Beyoncé,Pop; R&B; Black Music,"beyonce, rnb, pop, female vocalists, soul, Lov...",earnest


In [91]:
print(f"Number of songs with mood: {len(df)}")

Number of songs with mood: 27705


## Clean keywords from Lyrics
Some lyrics contain words like "verse1" or "chorus" instead of the full text, if there is repetition for example. 
In this step we will define those keywords (manually extracted from data) and remove them. 

In [92]:
# lowercase the lyrics
df["Lyric"] = df["Lyric"].str.lower()

### Lyrics keywords exploration
Use these two cells to check for unwanted patterns in text and also to check whether you have filtered this pattern out correctly

In [93]:
# print one example lyrics containing specific keyword (e.g. "verse"). Might also be helpful to spot further unwanted patterns in the lyrics (like "(4x)")
df[df["Lyric"].str.contains("verse")]["Lyric"].tolist()[0]

"come take my hand\ni won't let you go\ni'll be your friend\ni will love you so deeply\ni will be the one to kiss you at night\ni will love you until the end of time\n\ni will be your baby\npromise not to let you go\nlove you like crazy\nsay you'll never let me go\n(4x)\n\ntake you away from here\nthere's nothing between us but space and time\ni'll be your own little star\ni'll be shining your world\nin your own little universe\ni'll be your girl\n\ncome take my hand\ni won't let you go\ni'll be your friend\ni will love you so deeply\ni will be the one to kiss you at night\ni will love you until the end of time\n\ni will be your baby\npromise not to let you go\nlove you like crazy\nsay you'll never let me go\n(4x)\n\nbaby, come on\nget up on this\nshow me that you really want it\ni wanna be the one to love you\nbaby lets go (let's go)\n\ni wanna provide\nthis loving that you're giving\ni ain't frontin' on this love\ncan you let me love you from your head to toe\nbaby let's go\n\nboy co

In [96]:
# print text parts of size window_length with parts of lyrics that are containing a certain keyword
keyword = "chorus" # chorus, verse
lyrics_with_keyword = df[df["Lyric"].str.contains(keyword)]["Lyric"].tolist()

# defines how much text around the keyword should be extracted (to obtain all various combinations of that keyword)
window_length = 40
cutted_lyrics_with_keyword = []
for lyric in lyrics_with_keyword:
    # get all occurences positions of that keyword within given lyric
    keyword_positions = [m.start() for m in re.finditer(keyword, lyric)]
    for keyword_position in keyword_positions:
        start_idx = max(0, keyword_position - window_length)
        end_idx = min(len(lyric) - 1, keyword_position + window_length)
        cutted_lyrics_with_keyword.append(lyric[start_idx : end_idx])

# output all parts with given keyword       
for cutted_lyrics_with_keyword_element in cutted_lyrics_with_keyword:
    print(repr(cutted_lyrics_with_keyword_element))

"'s not over, just another again\n\nsecond chorus\n(and again, and again, and again)"
"vin' care\ni'll take you there\n\n\n(repeat chorus - ad-lib/fade-out"
'chorus:\n\nchildren, groovin, women, confu'
"makes me feel this way\n(take em' to the chorus!)\n\ncome here girl!\n(go 'head be g"
"akes me feel this way\n\n(take em' to the chorus!)\n\ncome here girl!\n\n(go 'head be "
" me, and that's a fact\n(take em' to the chorus!)\n\ncome here girl!\n(go 'head be g"
"hen i'm rollin' by\nniggas can't c me\n\n\n(chorus -- g. clinton)\n\n\nthe stares of a "
's down with g-o-d?\n\n\n\ncome and join the chorus\nthe mighty, mighty chorus\nwhich t'
' and join the chorus\nthe mighty, mighty chorus\nwhich the morning stars begun\nthe'
' good good good good good good \n\nrepeat chorus til end'
"chorus: nelly\n\nhmmmmm\ni'm goin down down"
'it,\nless than four bars,\nguru bring the chorus in,\ndid you get the picture yet,\n'
'chorus\npresidents to represent me - nas '
'chorus\n\ni know\ni know\nsome pl

### Keyword filtering steps

In [95]:
# TODO: fix all paranthesis cleaning variations
# filter everything in []
df = df.replace("\[[.]*\]", "", regex=True) 
# filter everything in [] new variant # TODO: Remove that only one variation exists
df = df.replace("\[[\- \s a-z A-Z 0-9 :]*\]", "", regex=True)
# filter everything in ()
df = df.replace("\([\- \s a-z A-Z 0-9]*\)", "", regex=True)
# filter everything in {}
df = df.replace("\{[\- \s a-z A-Z 0-9]*\}", "", regex=True)
# filter everything in <>
df = df.replace("\{[\- \s a-z A-Z 0-9]*\}", "", regex=True)

# TODO: general chorus/verse combinations with a number afterwards: "\nverse/chorus <number>\n"
# TODO: filter more specific combinations of chorus (be careful with words containing the keywords like uni-verse)
df = df.replace("\nchorus\n", "", regex=True)
df = df.replace("::chorus::", "", regex=True)
df = df.replace("\[chorus", "", regex=True)
df = df.replace("\nchorus", "", regex=True)
df = df.replace("pre[-]?chorus", "", regex=True)
df = df.replace("\(chorus:\)", "", regex=True)
df = df.replace("\{chorus\}", "", regex=True)
df = df.replace("\nrepeat chorus\n", "", regex=True)

# TODO: ideas for further cleaning:
    # TODO: remove all artists names
    # TODO: replace \n, \w, \t,... 

## Save data

In [97]:
song_data_labels_cleaned_path = os.path.join(DATA_DIR, "song-data-labels-cleaned.csv")
df.to_csv(song_data_labels_cleaned_path, index=False)