In [4]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import scipy.stats as st

In [5]:
# Directory (relative to the notebook's working directory) that holds the
# input CSV files and receives the cleaned output file.
DATA_DIR = "data"

# Load data

In [6]:
# Resolve both input CSV locations under DATA_DIR up front ...
lyrics_path, artists_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("lyrics-data.csv", "artists-data.csv")
)

# ... then read the raw lyrics table followed by the raw artists table.
df_lyrics = pd.read_csv(lyrics_path)
df_artists = pd.read_csv(artists_path)

## Remove non english songs

In [7]:
# Keep only the rows whose language code is "en" (English)
df_lyrics_en = df_lyrics.loc[df_lyrics["language"].eq("en")]

## Remove songs with fewer than 70 or more than 1000 words

In [8]:
# Keep only songs whose lyrics contain between 70 and 1000 words (inclusive).
# Each lyric is split on whitespace once and the resulting word count is
# range-checked. A missing (NaN) lyric yields a NaN word count, which
# `between` treats as out of range, so such rows are dropped instead of
# raising an AttributeError on `.split()`.
df_lyrics_en = df_lyrics_en[
    df_lyrics_en["Lyric"].str.split().str.len().between(70, 1000)
]

## Merge songs and artists

In [9]:
# Attach artist information to every English song: inner-join the lyrics'
# "ALink" column against the artists' "Link" column, then drop "Link",
# which only repeats the join key already present as "ALink".
df_songs = df_lyrics_en.merge(
    df_artists,
    how="inner",
    left_on="ALink",
    right_on="Link",
).drop(columns=["Link"])

## Remove columns with duplicate or redundant information

In [10]:
# Drop the columns that are not carried forward into the final data set
df_songs = df_songs.drop(columns=["ALink", "language", "Songs", "Popularity"])

## Remove duplicate rows

In [11]:
# A song is identified by its "SLink"; keep the first row for each link
df_songs_no_duplicates = df_songs.drop_duplicates(subset="SLink", keep="first")

## Add labels from Last.fm

In [26]:
# Load the Last.fm tags from the same configurable data directory as the
# other inputs instead of a hard-coded './data' path.
tags_path = os.path.join(DATA_DIR, "tags.csv")
tags_df = pd.read_csv(tags_path)

In [27]:
# Left-join the tags onto the de-duplicated songs, matching on artist and
# song name; songs without a matching tags row are kept.
df = df_songs_no_duplicates.merge(
    tags_df,
    how="left",
    on=["Artist", "SName"],
)

In [28]:
# Drop songs whose tag lookup produced one of the two placeholder values.
# `.copy()` makes the result an independent frame, so the column assignments
# in later cells do not operate on a slice of the merged frame (which would
# raise pandas' SettingWithCopyWarning).
# NOTE(review): rows whose "Tags" is NaN (no matching row in tags_df after the
# left merge) pass this filter and stay in the data — confirm this is intended.
df = df[~df["Tags"].isin(["NoTagsFound", "NoSongFound"])].copy()

In [29]:
# Preview the first five rows of the labelled songs
df.head(n=5)

Unnamed: 0,SName,SLink,Lyric,Artist,Genres,Tags
3,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",Ivete Sangalo,Pop; Axé; Romântico,"pop, female vocalists, rnb, hot, spanish, soul..."
5,Human Nature,/ivete-sangalo/human-nature.html,Looking out\nAcross the night time\nThe city w...,Ivete Sangalo,Pop; Axé; Romântico,"pop, rock, female vocalists, dance, latin, rnb..."
9,Natural Collie,/ivete-sangalo/natural-collie.html,Been down in the valley\nSmoking natural colli...,Ivete Sangalo,Pop; Axé; Romântico,"spanish, electronic, female, jazz, hip hop, po..."
10,Where It Begins (feat. Nelly Furtado),/ivete-sangalo/where-it-begins-feat-nelly-furt...,"When you're alone and you don't know how,\nTo ...",Ivete Sangalo,Pop; Axé; Romântico,"pop, singer-songwriter, rnb, Nelly Furtado, sp..."
15,Lazy Groove,/claudia-leitte/lazy-groove.html,Are you ready to dance?\nTo make your body cra...,Claudia Leitte,Pop; Axé; Romântico,"dance, cool, 2013, axe music, claudia leitte"


## Clean keywords from lyrics
Some lyrics contain placeholder words like "verse1" or "chorus" instead of the full text, for example where a section is repeated.
In this step we define those keywords (manually extracted from the data) and remove them.

In [32]:
# Convert every lyric to lower case
df = df.assign(Lyric=df["Lyric"].str.lower())

In [35]:
# Look at a few lyrics that mention a section keyword.
# Keywords found this way so far: chorus, verse
df.loc[df["Lyric"].str.contains("chorus")].head()

Unnamed: 0,SName,SLink,Lyric,Artist,Genres,Tags
31,Dyer Maker,/babado-novo/dryer-maker.html,oh oh oh oh oh oh\nyou don't have to go\noh oh...,Babado Novo,Axé; Pop; Romântico,rock
42,Crazy In Love (feat. Jay Z),/beyonce/crazy-in-love.html,[intro - jay z]\nyes!\nit's so crazy right now...,Beyoncé,Pop; R&B; Black Music,"Hip-Hop, pop, r&b, rnb, female vocalists, Love..."
47,Listen,/beyonce/listen.html,"listen,\nto the song here in my heart\na melod...",Beyoncé,Pop; R&B; Black Music,"rnb, soul, Soundtrack, female vocalists, beyon..."
63,'03 Bonnie & Clyde,/beyonce/03-bonnie-clyde.html,jay-z\nuh-uh-uh\nyou ready b?\nlet's go get 'e...,Beyoncé,Pop; R&B; Black Music,"hip hop, soul, dance, r&b, rnb, beyonce, live,..."
76,Angel (feat. Kelly Rowland),/beyonce/angel-feat-kelly-rowland.html,this is for my fans (uhu uhu)\nthis is for my ...,Beyoncé,Pop; R&B; Black Music,rnb


In [36]:
# TODO: Should we also filter numbers?
# Remove the section markers from the lyrics only. Applying the replacement to
# the whole frame would also rewrite song names, links, artists, genres and
# tags that happen to contain these letters.
# The pattern is anchored at a word start and must be followed by a digit or a
# word end, so "verse", "verse1" and "[chorus]" lose the marker while ordinary
# words such as "universe", "reverse" or "verses" are left intact.
df["Lyric"] = df["Lyric"].str.replace(
    r"\b(?:verse|chorus)(?=\d|\b)", "", regex=True
)

In [38]:
# Check that no lyric still contains a section marker. Both removed keywords
# (chorus, verse) are checked, as standalone words or followed by a digit
# (e.g. "verse1"); an empty result means the markers are gone.
df[df["Lyric"].str.contains(r"\b(?:verse|chorus)(?=\d|\b)", regex=True)].head()

Unnamed: 0,SName,SLink,Lyric,Artist,Genres,Tags


## Save data

In [None]:
# Write the final data set: the labelled songs with lower-cased,
# keyword-cleaned lyrics held in `df`. The intermediate frame
# `df_songs_no_duplicates` has neither the tags nor the lyric cleaning, so
# saving it under this file name would discard the work of the cells above.
song_data_labels_cleaned_path = os.path.join(DATA_DIR, "song-data-labels-cleaned.csv")
df.to_csv(song_data_labels_cleaned_path, index=False)