In [1]:
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import requests
import requests_cache
from tqdm import tqdm

requests_cache.install_cache()

# Make the graphs a bit prettier, and bigger
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'

%matplotlib inline

Pretty printing has been turned OFF


# Artist & Song Dataframe

Load the lyrics and artists dataframes

In [2]:
# Read the artists and lyrics CSV files into dataframes
artists_df = pd.read_csv(filepath_or_buffer="../data/artists-data.csv")
lyrics_df = pd.read_csv(filepath_or_buffer="../data/lyrics-data.csv")

Show general information for the dataframes

Create dataframe with song name and artist name (only English songs)

In [3]:
# Inner-join artists to songs (artists' "Link" matches lyrics' "ALink"), then
# keep only rows whose language is "en" and only the artist / song-name columns
artists_songs_dataframe = (
    artists_df
    .merge(lyrics_df, how="inner", left_on="Link", right_on="ALink")
    .loc[lambda merged: merged["language"] == "en", ["Artist", "SName"]]
)

In [4]:
# Summary of the English subset: non-null song names and distinct artists
english_song_count = artists_songs_dataframe['SName'].count()
english_artist_count = artists_songs_dataframe['Artist'].nunique()

print(f"Number of English songs: {english_song_count}")
print(f"Number of all artists with English songs: {english_artist_count}")

Number of English songs: 191385
Number of all artists with English songs: 2488


In [5]:
# Take first third of the data (191385 / 3 = 63795).
# The row count is kept as a literal so that re-running this cell is idempotent.
# .copy() makes the subset an independent frame rather than a slice of the
# filtered frame, so adding a column to it later is unambiguous and does not
# raise pandas' SettingWithCopyWarning.
artists_songs_dataframe = artists_songs_dataframe.iloc[:63795].copy()

# LastFM API (download tags)

Source: https://www.dataquest.io/blog/last-fm-api-python/

In [6]:
# Last.fm credentials. Prefer supplying the key through the LASTFM_API_KEY
# environment variable; the literal below is only a fallback so existing runs
# keep working.
# NOTE(review): this key is committed in the notebook - rotate it and drop the
# fallback once the environment variable is set everywhere.
API_KEY = os.environ.get('LASTFM_API_KEY', 'b0fc842b94c2e6a0b4458c1686a24f61')

# Sent as the user-agent header on every API request
USER_AGENT = 'idsta_project'

In [7]:
def lastfm_get(payload, timeout=30):
    """Send one GET request to the Last.fm web-service endpoint.

    Parameters
    ----------
    payload : dict
        Query parameters for the call (e.g. ``method``, ``artist``, ``track``).
        The dict is left unmodified.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Defaults to 30.

    Returns
    -------
    The response object returned by ``requests.get``; callers inspect
    ``status_code`` and the body themselves.
    """
    headers = {'user-agent': USER_AGENT}
    url = 'https://ws.audioscrobbler.com/2.0/'

    # Build the query on a copy so the caller's dict is not mutated; the API
    # key and response format always override whatever the payload contained.
    params = {**payload, 'api_key': API_KEY, 'format': 'json'}

    # Without a timeout a stalled connection would block the notebook forever.
    return requests.get(url, headers=headers, params=params, timeout=timeout)

Get top tags for all songs

In [8]:
def lookup_tags_for_artist_song(row):
    """Look up the Last.fm top tags for one (artist, song) row.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``"Artist"`` and ``"SName"``.

    Returns
    -------
    str
        The tag names joined with ``", "``; ``"NoSongFound"`` when the request
        failed or the API reported an error; ``"NoTagsFound"`` when the track
        exists but has no tags.
    """
    artist, song = row["Artist"], row["SName"]

    response = lastfm_get({
        'method': 'track.getTopTags',
        'artist': artist,
        'track': song
    })

    # rate limiting: pause after every request that actually hit the network,
    # including ones that end up on an error path below
    if not getattr(response, 'from_cache', False):
        # TODO: Can I reduce the time?
        time.sleep(0.25)

    # if there's an error, just return nothing
    if response.status_code != 200:
        return "NoSongFound"

    try:
        data = response.json()
    except ValueError:
        # body was not valid JSON
        return "NoSongFound"

    # Check for an "error" key in the parsed body. Searching the raw text for
    # the substring "error" would also match track, artist or tag names that
    # merely contain that word.
    if not isinstance(data, dict) or "error" in data:
        return "NoSongFound"

    # extract the top tags and turn them into a string
    tag_entries = data.get('toptags', {}).get('tag', [])
    if isinstance(tag_entries, dict):
        # defensive: tolerate a single tag object instead of a list
        tag_entries = [tag_entries]
    # TODO: Could later also look at track tag count

    tags_str = ', '.join(t['name'] for t in tag_entries)
    return tags_str or "NoTagsFound"

In [10]:
# Register tqdm's progress_apply on pandas objects
tqdm.pandas()

# One lookup per row; the function is passed directly (no wrapping lambda needed)
song_tags = artists_songs_dataframe[["Artist", "SName"]].progress_apply(
    lookup_tags_for_artist_song, axis=1
)

# .assign builds a new frame with the extra column instead of writing into a
# frame that may be a slice of another one (avoids SettingWithCopyWarning)
artists_songs_dataframe = artists_songs_dataframe.assign(Tags=song_tags)

100%|██████████| 10000/10000 [27:50<00:00,  5.98it/s] 


In [11]:
# Persist the tagged songs; the row index is not written to the file.
# NOTE(review): inputs are read from "../data/" but this writes under "data/" -
# confirm the target directory is intended.
artists_songs_dataframe.to_csv(
    path_or_buf='data/tags_first_third.csv',
    index=False,
)

In [12]:
# Reload the saved tags from disk and preview the first 20 rows
df = pd.read_csv(filepath_or_buffer='data/tags_first_third.csv')
df.head(n=20)

Unnamed: 0,Artist,SName,Tags
0,Ivete Sangalo,Careless Whisper,NoTagsFound
1,Ivete Sangalo,Could You Be Loved / Citação Musical do Rap: S...,NoTagsFound
2,Ivete Sangalo,Cruisin' (Part. Saulo),NoTagsFound
3,Ivete Sangalo,Easy,"pop, female vocalists, rnb, hot, spanish, soul..."
4,Ivete Sangalo,For Your Babies (The Voice cover),NoSongFound
5,Ivete Sangalo,Human Nature,"pop, rock, female vocalists, dance, latin, rnb..."
6,Ivete Sangalo,Losing Control (Miss Cady feat. Ivete Sangalo),NoSongFound
7,Ivete Sangalo,Master Blaster (Jammin'),NoTagsFound
8,Ivete Sangalo,More Than Words,NoTagsFound
9,Ivete Sangalo,Natural Collie,"spanish, electronic, female, jazz, hip hop, po..."
