In [12]:
import requests

import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import time
import requests_cache

requests_cache.install_cache()


# Make the graphs a bit prettier, and bigger
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'

%matplotlib inline

# Artist & Song Dataframe

Load lyrics dataframe 

In [13]:
lyrics_df = pd.read_csv("../data/lyrics-data.csv")
artists_df = pd.read_csv('../data/artists-data.csv')

Show general information for the dataframes

Create dataframe with song name and artist name (only English songs)

In [14]:
artists_songs_dataframe = pd.merge(artists_df, lyrics_df, how="inner", left_on="Link", right_on="ALink")
artists_songs_dataframe = artists_songs_dataframe[artists_songs_dataframe["language"] == "en"][["Artist", "SName"]]

In [15]:
# Number of English songs
print(f"Number of English songs: {artists_songs_dataframe['SName'].count()}")

# Number of all artists with English songs
print(f"Number of all artists with English songs: {artists_songs_dataframe['Artist'].nunique()}")

Number of English songs: 191385
Number of all artists with English songs: 2488


In [16]:
# Take second third of the data (191385 / 3 = 63795 -> 63795-127590)
artists_songs_dataframe = artists_songs_dataframe[63795:127590]

# LastFM API (download tags)

Source: https://www.dataquest.io/blog/last-fm-api-python/

In [17]:
API_KEY = 'b0fc842b94c2e6a0b4458c1686a24f61'
USER_AGENT = 'idsta_project_max'

In [18]:
def lastfm_get(payload):
    # define headers and URL
    headers = {'user-agent': USER_AGENT}
    url = 'https://ws.audioscrobbler.com/2.0/'

    # Add API key and format to the payload
    payload['api_key'] = API_KEY
    payload['format'] = 'json'

    response = requests.get(url, headers=headers, params=payload)
    return response

Get top tags for all songs

In [19]:
def lookup_tags_for_artist_song(row):
    artist, song = row["Artist"], row["SName"]
    
    response = lastfm_get({
        'method': 'track.getTopTags',
        'artist':  artist,
        'track': song
    })

    # if there's an error, just return nothing
    if response.status_code != 200 or "error" in response.text:
        return "NoSongFound"

    # extract the top tags and turn them into a string
    tags = [t['name'] for t in response.json()['toptags']['tag']]
    tags_str = ', '.join(tags)
    if tags_str == "":
        tags_str ="NoTagsFound" 
    # TODO: Could later also look at track tag count

    # rate limiting
    if not getattr(response, 'from_cache', False):
        # TODO: Can I reduce the time?
        time.sleep(0.25)
    return tags_str

In [20]:
tqdm.pandas()

artists_songs_dataframe['Tags'] = artists_songs_dataframe[["Artist", "SName"]].progress_apply(lambda row: lookup_tags_for_artist_song(row), axis=1)

100%|██████████| 105/105 [01:12<00:00,  1.46it/s]


In [21]:
# Save df
artists_songs_dataframe.to_csv('data/tags_second_third.csv', index=False)

In [22]:
artists_songs_dataframe.head(50)

Unnamed: 0,Artist,SName,Tags
136797,Bauhaus,Terror Couple Kill Colonel,NoSongFound
136798,Bauhaus,The Lady in the Radiator Song,NoTagsFound
136799,Bauhaus,The Man With the X-Ray Eyes,"post-punk, Gothic, goth, 80s, new wave, goth r..."
136800,Bauhaus,The Passenger,"cover, 80s, post-punk, Gothic Rock, Iggy Pop, ..."
136801,Bauhaus,The Spy In The Cab,"post-punk, Gothic Rock, Gothic, 80s, rock, got..."
136803,Bauhaus,The Three Shadows (Part II),NoTagsFound
136804,Bauhaus,The Three Shadows (Part III),NoTagsFound
136805,Bauhaus,Too Much 21st Century,"post-punk, Gothic Rock, alternative, goth, ant..."
136806,Bauhaus,Untitled,"post-punk, Gothic, Gothic Rock, darkwave, new ..."
136807,Bauhaus,Waiting For The Man,"covers, cover, post-punk, goth, Gothic, postpu..."
