In [11]:
import glob
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import requests
import requests_cache
from tqdm import tqdm

requests_cache.install_cache()

# Make the graphs a bit prettier, and bigger
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'

%matplotlib inline

# Artist & Song Dataframe

Load the lyrics and artists dataframes

In [12]:
# Raw inputs: one row per song (lyrics) and one row per artist
lyrics_df = pd.read_csv('../data/lyrics-data.csv')
artists_df = pd.read_csv("../data/artists-data.csv")

Show general information for the dataframes

Create dataframe with song name and artist name (only English songs)

In [13]:
# Attach each song to its artist (inner join on the artist link), then keep
# only the artist and song-name columns of the rows whose language is "en".
artists_songs_dataframe = (
    artists_df
    .merge(lyrics_df, how="inner", left_on="Link", right_on="ALink")
    .loc[lambda df: df["language"] == "en", ["Artist", "SName"]]
)

In [14]:
# Summary of the English subset: non-null song names and distinct artists,
# printed one per line.
print(
    f"Number of English songs: {artists_songs_dataframe['SName'].count()}",
    f"Number of all artists with English songs: {artists_songs_dataframe['Artist'].nunique()}",
    sep="\n",
)

Number of English songs: 191385
Number of all artists with English songs: 2488


In [15]:
# Take first third of the data (191385 / 3 = 63795).
# .iloc makes the positional intent explicit, and .copy() turns the subset
# into an independent frame so that adding the 'Tags' column later does not
# write into a slice of the larger frame (SettingWithCopyWarning).
artists_songs_dataframe = artists_songs_dataframe.iloc[:63795].copy()

# LastFM API (download tags)

Source: https://www.dataquest.io/blog/last-fm-api-python/

In [16]:
# Last.fm credentials.
# The API key is a secret and must not be committed with the notebook, so it
# is read from the environment. Fail early with an actionable message rather
# than sending tens of thousands of requests that the API would reject.
API_KEY = os.environ.get('LASTFM_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "Set the LASTFM_API_KEY environment variable to your Last.fm API key "
        "before running this notebook."
    )

# Identifies this project in the User-Agent header of every API request
USER_AGENT = 'idsta_project'

In [17]:
def lastfm_get(payload, timeout=30):
    """Send a GET request to lastfm API to receive, e.g., song tags.

    The caller's ``payload`` is not modified: the API key and the response
    format are added to a copy, so the same dict can safely be reused.

    :param payload: payload for the API request (contains, e.g., 'method', 'artist', 'track').
    :param timeout: seconds to wait for the server before giving up; without
        a timeout a stalled connection would block the notebook indefinitely.

    :return: Response for the API request to lastfm.
    :rtype: requests.Response
    :raises requests.exceptions.RequestException: on connection problems or
        when the timeout expires.
    """

    # define headers and URL
    headers = {'user-agent': USER_AGENT}
    url = 'https://ws.audioscrobbler.com/2.0/'

    # Add API key and format to a copy of the payload (never mutate the argument)
    params = dict(payload)
    params['api_key'] = API_KEY
    params['format'] = 'json'

    return requests.get(url, headers=headers, params=params, timeout=timeout)

Get top tags for all songs

In [18]:
def lookup_tags_for_artist_song(row):
    """Return a tags string containing all tags for a given artist and song.

    :param row: dataframe row consisting of columns 'Artist' and 'SName' specifying the artist and song name.

    :return: comma-separated tag names for the given artist and song;
        "NoSongFound" if the request failed or the API reported an error;
        "NoTagsFound" if the song is known but has no tags.
    :rtype: String
    """

    artist, song = row["Artist"], row["SName"]

    response = lastfm_get({
        'method': 'track.getTopTags',
        'artist': artist,
        'track': song
    })

    # rate limiting: pause after every request that was not served from the
    # cache, including the ones that turn out to be errors below
    if not getattr(response, 'from_cache', False):
        # TODO: Can I reduce the time?
        time.sleep(0.25)

    # if there's an error, just return the "not found" marker
    if response.status_code != 200:
        return "NoSongFound"

    try:
        data = response.json()
    except ValueError:
        # body is not valid JSON
        return "NoSongFound"

    # Look for an 'error' field in the parsed document instead of searching
    # the raw text: a substring search also matches tag names such as
    # "terror" and would discard perfectly valid responses.
    if not isinstance(data, dict) or 'error' in data:
        return "NoSongFound"

    # extract the top tags and turn them into a string
    raw_tags = (data.get('toptags') or {}).get('tag') or []
    if isinstance(raw_tags, dict):
        # NOTE(review): defensive - presumably a single tag can arrive as one
        # object instead of a one-element list; confirm against the API.
        raw_tags = [raw_tags]

    tags = [t['name'] for t in raw_tags]
    tags_str = ', '.join(tags)
    if tags_str == "":
        tags_str = "NoTagsFound"
    # TODO: Could later also look at track tag count

    return tags_str

In [19]:
# Register tqdm's progress_apply on pandas objects
tqdm.pandas()

# Query the tags for every (artist, song) row and store them in a new column
artists_songs_dataframe['Tags'] = (
    artists_songs_dataframe[["Artist", "SName"]]
    .progress_apply(lookup_tags_for_artist_song, axis=1)
)

  0%|          | 39/63795 [00:05<2:35:17,  6.84it/s]

Lost connection. Will try to reconnect in 30 seconds again.





UnboundLocalError: local variable 'connection_reset_counter' referenced before assignment

In [None]:
# Save df, refusing to overwrite an existing export
file_name = '../data/tags.csv'
# Check the literal path: glob would treat *, ? and [] in the name as
# wildcards, which is not what an existence check should do.
file_present = os.path.exists(file_name)
if not file_present:
    artists_songs_dataframe.to_csv(file_name, index=False)
else:
    print("File already exist. Use different name or delete file manually first.")

In [None]:
# Reload the saved tags table from disk and preview its first 20 rows
artists_songs_dataframe = pd.read_csv("../data/tags.csv")
artists_songs_dataframe.head(n=20)