In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st

In [2]:
# Directory (relative to the notebook's working directory) holding the input CSVs
# and receiving the cleaned output file.
DATA_DIR = "data"

# Load data

In [3]:
# Resolve both input files relative to DATA_DIR
lyrics_path = os.path.join(DATA_DIR, "lyrics-data.csv")
artists_path = os.path.join(DATA_DIR, "artists-data.csv")

# Read the lyrics table and the artists table
df_lyrics = pd.read_csv(lyrics_path)
df_artists = pd.read_csv(artists_path)

## Remove non-English songs

In [4]:
# Keep only rows whose language code is "en" (English)
is_english = df_lyrics["language"].eq("en")
df_lyrics_en = df_lyrics.loc[is_english]

## Remove songs with fewer than 70 or more than 1000 words

In [5]:
# Count whitespace-separated words per lyric and keep songs with 70 to 1000
# words (both bounds inclusive).
# The count is computed once per row with vectorized string methods. A missing
# lyric yields a NaN count, which `between` treats as out of range, so such rows
# are dropped instead of raising on `.split()`.
lyric_word_counts = df_lyrics_en["Lyric"].str.split().str.len()
df_lyrics_en = df_lyrics_en[lyric_word_counts.between(70, 1000)]

## Merge songs and artists

In [6]:
# Join every lyrics row with its artist row: "ALink" in the lyrics table is
# matched against "Link" in the artists table. The inner join discards rows
# that have no partner on the other side.
df_songs = df_lyrics_en.merge(
    df_artists,
    how="inner",
    left_on="ALink",
    right_on="Link",
)

# After the join "Link" carries the same values as "ALink", so drop it
df_songs = df_songs.drop(columns="Link")

## Remove columns with duplicate or redundant information

In [7]:
# Columns that are duplicated or no longer needed after filtering and merging
redundant_columns = ["ALink", "language", "Songs", "Popularity"]
df_songs = df_songs.drop(columns=redundant_columns)

## Remove duplicate rows

In [8]:
# Rows sharing the same song link are duplicates; keep the first one of each
df_songs_no_duplicates = df_songs.drop_duplicates(subset=["SLink"], keep="first")

## Add labels from Last.fm

In [9]:
# Resolve the tags file through DATA_DIR like the other inputs, so changing the
# data directory in the config cell moves all files together.
tags_path = os.path.join(DATA_DIR, "tags.csv")
tags_df = pd.read_csv(tags_path)

In [10]:
# Left join on (Artist, SName): every song is kept, and songs without a match
# in the tags table end up with a missing "Tags" value.
df = df_songs_no_duplicates.merge(tags_df, how="left", on=["Artist", "SName"])

In [11]:
# Placeholder strings in "Tags" that mark a failed lookup (presumably written by
# the Last.fm tag retrieval — confirm against the script that produced tags.csv).
# Rows whose "Tags" value is missing altogether are not affected by this filter.
missing_tag_markers = ["NoTagsFound", "NoSongFound"]
df = df[~df["Tags"].isin(missing_tag_markers)]

In [12]:
# Preview the first five labelled songs
df.head(n=5)

Unnamed: 0,SName,SLink,Lyric,Artist,Genres,Tags
3,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",Ivete Sangalo,Pop; Axé; Romântico,"pop, female vocalists, rnb, hot, spanish, soul..."
5,Human Nature,/ivete-sangalo/human-nature.html,Looking out\nAcross the night time\nThe city w...,Ivete Sangalo,Pop; Axé; Romântico,"pop, rock, female vocalists, dance, latin, rnb..."
9,Natural Collie,/ivete-sangalo/natural-collie.html,Been down in the valley\nSmoking natural colli...,Ivete Sangalo,Pop; Axé; Romântico,"spanish, electronic, female, jazz, hip hop, po..."
10,Where It Begins (feat. Nelly Furtado),/ivete-sangalo/where-it-begins-feat-nelly-furt...,"When you're alone and you don't know how,\nTo ...",Ivete Sangalo,Pop; Axé; Romântico,"pop, singer-songwriter, rnb, Nelly Furtado, sp..."
15,Lazy Groove,/claudia-leitte/lazy-groove.html,Are you ready to dance?\nTo make your body cra...,Claudia Leitte,Pop; Axé; Romântico,"dance, cool, 2013, axe music, claudia leitte"


## Clean keywords from lyrics
Some lyrics contain section markers such as "verse1" or "chorus" in place of the full text, for example where a section is repeated.
In this step we identify those keywords (extracted manually from the data) and remove them.

In [13]:
# Lowercase the lyrics; the keyword searches in the following cells use
# lowercase keywords.
# `assign` returns a new frame instead of writing a column into `df`, which at
# this point is the result of a boolean filter and would otherwise raise a
# SettingWithCopyWarning.
df = df.assign(Lyric=df["Lyric"].str.lower())

In [14]:
# Show the first lyric that contains the text "verse"
contains_verse = df["Lyric"].str.contains("verse")
df.loc[contains_verse, "Lyric"].iloc[0]

"[intro - jay z]\nyes!\nit's so crazy right now!\nmost incredibly, it's ya girl, bee\nit's ya boy, young\n\n[intro - beyonce:]\nyou ready?\nuh oh, uh oh, uh oh, oh no no\nuh oh, uh oh, uh oh, oh no no\nuh oh, uh oh, uh oh, oh no no\nuh oh, uh oh, uh oh, oh no no\n\n[intro - jay z]\nyea, history in the making\npart 2, it's so crazy right now\n\n[verse 1 - beyonce]\ni look and stare so deep in your eyes\ni touch on you more and more every time\nwhen you leave i'm begging you not to go\ncall your name two or three times in a row\nsuch a funny thing for me to try to explain\nhow i'm feeling and my pride is the one to blame\n'cuz i know i don't understand\njust how your love can do what no one else can\n\n[chorus - beyonce]\ngot me looking so crazy right now, your love's\ngot me looking so crazy right now (in love)\ngot me looking so crazy right now, your touch\ngot me looking so crazy right now (your touch)\ngot me hoping you'll page me right now, your kiss\ngot me hoping you'll save me ri

In [15]:
# TODO: Further keyword exploration: what to filter (verse, chorus, <artist name>, <things in specific parentheses>)
# TODO: Check how much remains of verse, chorus, ... if I directly remove "[]"
# TODO: Check other papers' implementations (from kaggle dataset) for further preprocessing

# Print every occurrence of a keyword together with a few characters of context
# on both sides, to see which markup conventions appear in the data.
keyword = "chorus" # chorus, verse
window_length = 10  # characters of context before / after the start of each match
lyrics_with_keyword = df.loc[df["Lyric"].str.contains(keyword), "Lyric"].tolist()

cutted_lyrics_with_keyword = [
    # Only the start index needs a guard: a negative start would wrap around,
    # whereas an end index past the string is clamped by slicing itself.
    # (Clamping the end to len - 1 would cut off the final character.)
    lyrics[max(0, match.start() - window_length) : match.start() + window_length]
    for lyrics in lyrics_with_keyword
    for match in re.finditer(keyword, lyrics)
]

for cutted_lyrics_with_keyword_element in cutted_lyrics_with_keyword:
    print(repr(cutted_lyrics_with_keyword_element))

# TODO: What to filter out?
# - [...]
# \nverse/chorus <number>\n

" don't go\nchorus\n\noh"
'lse can\n\n[chorus - b'
'lse can\n\n[chorus - b'
'by\nhey!\n\n[chorus - b'
'ten....\n\n[chorus]\nli'
'sten...\n\n[chorus]\n\ni'
'\nholla\n\n::chorus::\na'
'trenght\n\n[chorus]\nmy'
'er part\n\n[chorus]\nmy'
'ing you\n\n[chorus\nmy '
' now ya\n\n(chorus)\nba'
'e, girl\n\n(chorus)\nba'
'e to me\n\n(chorus)\nba'
't world\n\n(chorus)\nba'
'[chorus]\nwh'
'et low]\n\n[chorus]\nwh'
', whoa]\n\n[chorus]\nwh'
'back up\n\n[chorus]\nwh'
'ure show\n\nchorus:\n\nb'
'10, ohhh\n\nchorus:\n\nb'
' creole\n\n[chorus - a'
'elicacy\n\n[chorus - b'
' creole\n\n[chorus - a'
' creole\n\n[chorus - b'
'ou\n\nhey\n\n[chorus]\nev'
'r as day\n\nchorus:\ni '
'ean (ah)\n\nchorus ou'
'um dress\n\nchorus\nooh'
'um dress\n\nchorus\nooh'
'um dress\n\nchorus\nooh'
' to give\n\nchorus:\nho'
' believe\n\nchorus:\nho'
'end upon\n\nchorus:\nho'
' yet\n\npre-chorus:\ngo'
' head...\n\nchorus:\ni '
'mine\n\npre-chorus:\ngo'
'no more)\n\nchorus:\ni '
' know...\n\nchorus:\ni '
'ht\nbaby\n\n[chorus:]\ni

In [42]:
# TODO: Advance the regex | Be careful with words containing the keywords like uni-verse,
#       which a plain "verse|chorus" pattern would also hit

# Section-marker patterns, removed one after another in exactly this order.
# Raw strings keep the backslashes intact for the regex engine (which itself
# understands "\n" as a newline) and avoid invalid-escape warnings.
lyric_markup_patterns = [
    r"\[[.]*\]",
    r"\[[\- \s a-z A-Z 0-9 :]*\]",  # TODO: Adjust -> Still does not work
    r"\([\- \s a-z A-Z 0-9]*\)",
    r"\nchorus\n",
    r"::chorus::",
    r"\[chorus",
    r"\nchorus",
    r"pre[-]?chorus",
    r"\(chorus:\)",
    r"\{chorus\}",
    r"\nrepeat chorus\n",
]

# Apply the patterns to the "Lyric" column only. Calling `df.replace` on the
# whole frame would also rewrite every other text column (song names, artists,
# genres, tags), e.g. stripping parenthesised parts of song titles.
cleaned_lyrics = df["Lyric"]
for pattern in lyric_markup_patterns:
    cleaned_lyrics = cleaned_lyrics.replace(pattern, "", regex=True)
df = df.assign(Lyric=cleaned_lyrics)

# TODO: Remove: [50 cent - chorus (+kidd kidd)], (chorus: akon), {*starts singing chorus in background*} <-- all sorts of parentheses
# TODO: Remove: 2x?
# TODO: Remove all artists names

In [43]:
# TODO: Adjust like above

# Check if keywords were removed correctly: print what still surrounds every
# remaining occurrence of the keyword.
keyword = "chorus" # chorus, verse
window_length = 40  # characters of context before / after the start of each match
lyrics_with_keyword = df[df["Lyric"].str.contains(keyword)]["Lyric"].tolist()

cutted_lyrics_with_keyword = []
for lyrics in lyrics_with_keyword:
    for match in re.finditer(keyword, lyrics):
        # Guard the start so it cannot go negative (which would wrap around)
        start_idx = max(0, match.start() - window_length)
        # No clamping needed for the end: slicing stops at the end of the string.
        # Clamping to len(lyrics) - 1 would drop the last character.
        end_idx = match.start() + window_length
        cutted_lyrics_with_keyword.append(lyrics[start_idx:end_idx])

for cutted_lyrics_with_keyword_element in cutted_lyrics_with_keyword:
    print(repr(cutted_lyrics_with_keyword_element))

'chorus:\nfresh out the mall in our brand '
'chorus:\njust as long\nas it makes you hap'
"an\ntell him upclose, now let's sing the chorus\n\nif you like what you see, then y"
'auge shotty (feel me!)\n{*starts singing chorus in background*}\nand this bitch sa'
"chorus]\ni don't need dom perignon, i don"
'chorus]\nall thru the hood i keep hearing'
"l do it\ni like gettin' high\n\n[50 cent - chorus (+kidd kidd)]\ni roll the shit, i "
'd if you niggas wanna smoke\n\n[50 cent - chorus (+kidd kidd)]\ni roll the shit, i '
"e shit don't aks me shit ah\n\n[50 cent - chorus (+kidd kidd)]\ni roll the shit, i "
'[50 cent ? chorus]\ni?m so disrespectful,\nso so so d'
"'s not over, just another again\n\nsecond chorus\n(and again, and again, and again)"
"vin' care\ni'll take you there\n\n\n(repeat chorus - ad-lib/fade-out"
'chorus:\n\nchildren, groovin, women, confu'
'ues?\nease up baby, ease up baby, cos\'\n\n"chorus"\n\nlove ain\'t technological, like\n'
'before us\n\nstrike the harp and join the c

## Save data

In [18]:
# Write the labelled and cleaned frame (`df`) to disk.
# `df_songs_no_duplicates` is the frame from before the tags were merged and the
# lyrics cleaned, so saving it under this file name would discard both steps.
song_data_labels_cleaned_path = os.path.join(DATA_DIR, "song-data-labels-cleaned.csv")
df.to_csv(song_data_labels_cleaned_path, index=False)