In [None]:
import requests
import time

import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import requests_cache

# Install requests_cache's global cache (called with its defaults here) so that repeated,
# identical Last.fm requests can be answered from the cache instead of hitting the API again.
# lookup_tags_for_artist_song() checks `response.from_cache` to skip its rate-limit sleep.
requests_cache.install_cache()

# Make the graphs a bit prettier, and bigger
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'

%matplotlib inline

In [None]:
# TODO: Add normalization of the song names (remove (feat. ...))
# TODO: Alternatively, load all songs from the artist and find out the mib to definitely find the song

# Artist & Song Dataframe

Load the lyrics and artists dataframes

In [None]:
# Read the artists table and the lyrics table from ../data (paths are relative to the notebook)
artists_df = pd.read_csv("../data/artists-data.csv")
lyrics_df = pd.read_csv("../data/lyrics-data.csv")

Show general information for the dataframes

In [None]:
lyrics_df.info()

In [None]:
artists_df.info()

Create dataframe with song name and artist name (only English songs)

In [None]:
# Attach each lyrics row to its artist (artists 'Link' == lyrics 'ALink'), keep only rows whose
# language is 'en', and reduce the result to the two columns needed for the Last.fm lookup.
artists_songs_dataframe = (
    artists_df
    .merge(lyrics_df, how="inner", left_on="Link", right_on="ALink")
    .loc[lambda merged: merged["language"] == "en", ["Artist", "SName"]]
)

In [None]:
artists_songs_dataframe.head()

In [None]:
# Summary of the English subset: non-null song titles and distinct artist names
english_song_count = artists_songs_dataframe['SName'].count()
english_artist_count = artists_songs_dataframe['Artist'].nunique()

print(f"Number of English songs: {english_song_count}")
print(f"Number of all artists with English songs: {english_artist_count}")

# LastFM API - Scraping Tags

Based on: https://www.dataquest.io/blog/last-fm-api-python/

In [None]:
import os

# Last.fm credentials used by lastfm_get().
# The key can be supplied through the LASTFM_API_KEY environment variable so that it does not
# have to live in the notebook.
# SECURITY NOTE(review): the fallback below is a real key committed to the repository. It is kept
# only so the notebook still runs unchanged; it should be revoked and removed once every user
# sets LASTFM_API_KEY.
API_KEY = os.environ.get('LASTFM_API_KEY') or 'b0fc842b94c2e6a0b4458c1686a24f61'
# Value sent in the 'user-agent' header of every API request
USER_AGENT = 'idsta_project'

In [None]:
def lastfm_get(payload):
    """Send a GET request to lastfm API to receive, e.g., song tags.

    The caller's ``payload`` is not modified: the API key and the response format are added to
    a copy, so the same dict can safely be reused (or inspected) after the call.

    :param payload: payload for the API request (contains, e.g., 'method', 'artist', 'track').

    :return: Response for the API request to lastfm.
    :rtype: requests.Response
    """

    # define headers and URL
    headers = {'user-agent': USER_AGENT}
    url = 'https://ws.audioscrobbler.com/2.0/'

    # Add API key and format to a copy of the payload (keeps the caller's dict untouched)
    params = dict(payload)
    params['api_key'] = API_KEY
    params['format'] = 'json'

    return requests.get(url, headers=headers, params=params)

Get top tags for all songs

In [None]:
def lookup_tags_for_artist_song(row):
    """Return a tags string containing all tags for a given artist and song.

    :param row: dataframe row consisting of columns 'Artist' and 'SName' specifying the artist and song name.

    :return: comma-separated tags for the given artist and song; "NoSongFound" if the request
        failed or the API reported an error; "NoTagsFound" if the track has no tags.
    :rtype: String
    """

    artist, song = row["Artist"], row["SName"]

    response = lastfm_get({
        'method': 'track.getTopTags',
        'artist': artist,
        'track': song
    })

    # rate limiting (to not get banned by lastfm API); applied to every request that actually
    # hit the network, including failed ones, but skipped for responses served from the cache
    if not getattr(response, 'from_cache', False):
        time.sleep(0.25)

    if response.status_code != 200:
        return "NoSongFound"

    # Parse the body once; a body that is not valid JSON is treated like a failed lookup
    try:
        data = response.json()
    except ValueError:
        return "NoSongFound"

    # Check for a top-level 'error' field instead of searching the raw text: a substring search
    # would also reject valid responses that merely contain the word (e.g. a tag named "terror")
    if not isinstance(data, dict) or "error" in data:
        return "NoSongFound"

    # extract the top tags and turn them into a string
    # TODO: Could later also look at track tag count
    tag_entries = data['toptags']['tag']
    # Defensive: presumably a single tag may be delivered as one object rather than a list — TODO confirm
    if isinstance(tag_entries, dict):
        tag_entries = [tag_entries]

    tags_str = ', '.join(t['name'] for t in tag_entries)
    if tags_str == "":
        tags_str = "NoTagsFound"
    return tags_str

In [None]:
# Shorten artists_songs_dataframe for testing purposes.
# .copy() detaches the subset from the frame it was filtered from, so that adding the 'Tags'
# column afterwards modifies an independent frame instead of raising pandas'
# SettingWithCopyWarning on a slice.
artists_songs_dataframe = artists_songs_dataframe[artists_songs_dataframe["Artist"] == "Eminem"].copy()
artists_songs_dataframe.head()

In [None]:
# Query Last.fm once per (artist, song) row of artists_songs_dataframe and store the result in a
# new 'Tags' column; tqdm.pandas() provides progress_apply so the run shows a progress bar
tqdm.pandas()
artist_song_pairs = artists_songs_dataframe[["Artist", "SName"]]
artists_songs_dataframe['Tags'] = artist_song_pairs.progress_apply(lookup_tags_for_artist_song, axis=1)

In [None]:
artists_songs_dataframe.head(50)

# Preload Tags Dataframe

In [None]:
# Overwrite artists_songs_dataframe with the contents of a saved tags CSV; the cells below read
# its 'Tags' column.
# NOTE(review): this path starts with 'data/' while the CSVs loaded earlier use '../data/' —
# confirm which location is intended. Nothing in this notebook writes tags_first_third.csv;
# presumably it was exported separately — TODO confirm.
artists_songs_dataframe = pd.read_csv('data/tags_first_third.csv')
artists_songs_dataframe.head(20)

# Mood keyword distribution

In [None]:
# Base mood vocabulary: maps each mood category (key) to the lowercase keywords (values) that
# are searched for in the Last.fm tags of a song.
# NOTE(review): the cells below test keywords with `keyword in tag_list` on the comma-joined tag
# string, i.e. substring matching — short keywords such as 'blue', 'dark' or 'rage' also match
# inside longer tags (e.g. 'blues', 'garage').
MOOD_CATEGORIES = {
    'calm': ['calm', 'comfort', 'quiet', 'serene', 'mellow', 'chill out'],
    'sad': ['sadness', 'unhappy', 'melancholic', 'melancholy'],
    'happy': ['happy', 'happiness', 'happy songs', 'happy music'],
    'romantic': ['romantic', 'romantic music'],
    'upbeat': ['upbeat', 'gleeful', 'high spirits', 'zest', 'enthusiastic'],
    'depressed': ['depressed', 'blue', 'dark', 'depressive', 'dreary'],
    'anger': ['anger', 'angry', 'choleric', 'fury', 'outraged', 'rage'],
    'grief': ['grief', 'heartbreak', 'mournful', 'sorrow', 'sorry'],
    'dreamy': ['dreamy'],
    'cheerful': ['cheerful', 'cheer up', 'festive', 'jolly', 'jovial', 'merry'],
    'brooding': ['brooding', 'contemplative', 'meditative', 'reflective'],
    'aggression': ['aggression', 'aggressive'],
    'confident': ['confident', 'encouraging', 'encouragement', 'optimism'],
    'angst': ['angst', 'anxiety', 'anxious', 'jumpy', 'nervous', 'angsty'],
    'earnest': ['earnest', 'heartfelt'],
    'desire': ['desire', 'hope', 'hopeful', 'mood: hopeful'],
    'pessimism': ['pessimism', 'cynical', 'pessimistic', 'weltschmerz'],
    'excitement': ['excitement', 'exciting', 'exhilarating', 'thrill', 'ardor']
}

In [None]:
# Count keyword hits per mood: every (song, keyword) pair where the keyword occurs in the song's
# tag string adds one, so a single song can contribute several hits to the same mood.
# Matching is a case-sensitive substring test on the comma-joined tag string.
num_moods = len(MOOD_CATEGORIES)
keyword_distibution = dict.fromkeys(MOOD_CATEGORIES, 0)
for tag_list in artists_songs_dataframe["Tags"]:
    for mood, keywords in MOOD_CATEGORIES.items():
        keyword_distibution[mood] += sum(1 for keyword in keywords if keyword in tag_list)

In [None]:
# Count songs per mood: a song adds one to a mood as soon as any keyword of that mood occurs in
# its tag string (further keywords of the same mood are ignored); a song can still be counted
# for several different moods.
# --> Will assign the moods more sophisticatedly later in the project
song_distribution = {mood: 0 for mood in MOOD_CATEGORIES}

for tag_list in artists_songs_dataframe["Tags"]:
    for mood, keywords in MOOD_CATEGORIES.items():
        if any(keyword in tag_list for keyword in keywords):
            song_distribution[mood] += 1


In [None]:
# Bar chart of the keyword hits per mood category over all songs in artists_songs_dataframe
# (every matching keyword counts, so one song can add several hits to the same mood)
labels = list(keyword_distibution.keys())
data = list(keyword_distibution.values())

fig, ax = plt.subplots()
plt.xticks(rotation=75)
ax.bar(labels, data)
ax.set_ylabel('Appearance of keywords')
ax.set_title('Total Keyword distribution')
plt.show()

In [None]:
# The figure shows in how many songs the keywords of a mood category appear
# (If for instance 2 keywords of a mood appear in the tags of 1 song, this counts as 1)
# This is the per-song count held in song_distribution, not the raw keyword hits in
# keyword_distibution that the previous figure shows.

labels = list(song_distribution.keys())
data = list(song_distribution.values())
fig, ax = plt.subplots()
plt.xticks(rotation = 75)
ax.bar(labels, data)
ax.set_ylabel('Appearance of Moods in songtexts')
plt.title('Total Mood distribution')
plt.show()

### Now plot the distribution in relation to the total amount of songs

In [None]:
# The figure shows in how many songs the keywords of a mood category appear
# (If for instance 2 keywords of a mood appear in the tags of 1 song, this counts as 1)
# The per-song counts in song_distribution are plotted (not the raw keyword hits), so the bars
# are directly comparable to the red line marking the total number of songs.

labels = list(song_distribution.keys())
data = list(song_distribution.values())
fig, ax = plt.subplots()
plt.xticks(rotation = 75)
ax.bar(labels, data)
ax.set_ylabel('Appearance of Moods in songtexts with total ammouint of songs')
plt.title('Total Mood distribution')
plt.axhline(y = len(artists_songs_dataframe["Tags"]), color = 'r', linestyle = '-', label = "Total ammount of songs")
plt.legend(bbox_to_anchor = (1.0, 1), loc = 'upper center')
plt.show()

## Now we will calculate how many songs can be identified with a mood at all

In [None]:
# Count the songs whose tag string contains none of the base mood keywords
songs_without_moods = 0

for tag_list in artists_songs_dataframe["Tags"]:
    has_mood = any(
        keyword in tag_list
        for keywords in MOOD_CATEGORIES.values()
        for keyword in keywords
    )
    if not has_mood:
        songs_without_moods += 1


In [None]:
# Compare the total number of songs with the number of songs that contain none / at least one
# of the mood keywords
total_songs = len(artists_songs_dataframe["Tags"])

labels = ["total ammount of songs", "without moods", "with moods"]
data = [total_songs, songs_without_moods, total_songs - songs_without_moods]

fig, ax = plt.subplots()
plt.xticks(rotation=75)
ax.bar(labels, data)
ax.set_ylabel('Number of songs')
ax.set_title('Songs without moods')
plt.show()

# Mood keyword distribution (with advanced keyword set)

Concept based on: https://github.com/workmanjack/lyric-mood-classification

### Expanding the Categories

The last.fm dataset is generated by users without scientific rigor in mind. There is a chance that searching for an exact match on a mood and its related moods will not always yield all possible matches. In this section, we will explore additional tags that match with each category.

In [None]:
# TODO: Maybe add filters afterwards (e.g., for words like happy -> unhappy)

def find_tags_containing_keyword(df, mood_categories=None):
    """Return an advanced mood keyword dictionary based on our dataset and the base mood keywords.

    Every distinct (lower-cased) tag of the dataset that contains at least one base keyword of a
    mood as a substring is collected for that mood. Each tag is listed at most once per mood,
    even if several keywords of that mood match it, and the tags are returned in sorted order
    so the result is reproducible between runs.

    :param df: dataframe consisting of columns 'Artist', 'SName' and 'Tags' specifying the artist, song name and tags defined by users.
        Only 'Tags' is read; entries that are not strings (e.g. NaN) are skipped.
    :param mood_categories: optional dictionary mapping each mood to its list of base keywords.
        Defaults to the notebook-level MOOD_CATEGORIES.

    :return: dictionary containing more mood keywords (values) for each mood (key) based on our dataset and the base mood keywords.
    :rtype: Dictionary
    """

    if mood_categories is None:
        mood_categories = MOOD_CATEGORIES

    # Get all unique tags (lower-cased); sorting makes the output order deterministic
    unique_tag_list = sorted({
        tag.lower()
        for tags in df["Tags"]
        if isinstance(tags, str)
        for tag in tags.split(", ")
    })

    # For each mood keep every tag that contains any of the mood's keywords; testing with any()
    # per tag prevents a tag from being added once per matching keyword
    advanced_mood_keyword_dict = {}
    for mood, keywords in mood_categories.items():
        advanced_mood_keyword_dict[mood] = [
            tag for tag in unique_tag_list
            if any(mood_keyword in tag for mood_keyword in keywords)
        ]

    return advanced_mood_keyword_dict

In [None]:
# TODO: Analyze mood keywords and look for mismatches (e.g., for words like happy -> unhappy)
# TODO: Create manual filters

advanced_mood_keyword_dict = find_tags_containing_keyword(artists_songs_dataframe)

# Print the hand-written vocabulary followed by the one expanded from the dataset
for heading, mood_dict in (
    ("Base mood keywords:", MOOD_CATEGORIES),
    ("Advanced mood keywords:", advanced_mood_keyword_dict),
):
    print(heading)
    print(mood_dict)

### Advanced mood keywords distribution

In [None]:
# Count keyword hits per mood using the advanced keyword set: every (song, keyword) pair where
# the keyword occurs in the song's tag string adds one.
# The advanced keywords were lower-cased when they were collected, so each tag string is
# lower-cased before the substring test; otherwise tags containing capitals could never match.
num_moods = len(advanced_mood_keyword_dict)
keyword_distibution = dict.fromkeys(advanced_mood_keyword_dict, 0)
for tag_list in artists_songs_dataframe["Tags"]:
    tags_lower = tag_list.lower()
    for mood, keywords in advanced_mood_keyword_dict.items():
        keyword_distibution[mood] += sum(1 for keyword in keywords if keyword in tags_lower)

In [None]:
# Count songs per mood using the advanced keyword set: a song adds one to a mood as soon as any
# keyword of that mood occurs in its tag string; a song can still be counted for several moods.
# The advanced keywords are lower-case, so the tag string is lower-cased before matching.
# --> Will assign the moods more sophisticatedly later in the project
song_distribution = {mood: 0 for mood in advanced_mood_keyword_dict}

for tag_list in artists_songs_dataframe["Tags"]:
    tags_lower = tag_list.lower()
    for mood, keywords in advanced_mood_keyword_dict.items():
        if any(keyword in tags_lower for keyword in keywords):
            song_distribution[mood] += 1


In [None]:
# Bar chart of the advanced-keyword hits per mood category over all songs in
# artists_songs_dataframe (every matching keyword counts, so one song can add several hits)
labels = list(keyword_distibution.keys())
data = list(keyword_distibution.values())

fig, ax = plt.subplots()
plt.xticks(rotation=75)
ax.bar(labels, data)
ax.set_ylabel('Appearance of keywords')
ax.set_title('Total Keyword distribution')
plt.show()

In [None]:
# The figure shows in how many songs the keywords of a mood category appear
# (If for instance 2 keywords of a mood appear in the tags of 1 song, this counts as 1)
# This is the per-song count held in song_distribution, not the raw keyword hits in
# keyword_distibution that the previous figure shows.

labels = list(song_distribution.keys())
data = list(song_distribution.values())
fig, ax = plt.subplots()
plt.xticks(rotation = 75)
ax.bar(labels, data)
ax.set_ylabel('Appearance of Moods in songtexts')
plt.title('Total Mood distribution')
plt.show()

#### Now plot the distribution in relation to the total amount of songs

In [None]:
# The figure shows in how many songs the keywords of a mood category appear
# (If for instance 2 keywords of a mood appear in the tags of 1 song, this counts as 1)
# The per-song counts in song_distribution are plotted (not the raw keyword hits), so the bars
# are directly comparable to the red line marking the total number of songs.

labels = list(song_distribution.keys())
data = list(song_distribution.values())
fig, ax = plt.subplots()
plt.xticks(rotation = 75)
ax.bar(labels, data)
ax.set_ylabel('Appearance of Moods in songtexts with total ammouint of songs')
plt.title('Total Mood distribution')
plt.axhline(y = len(artists_songs_dataframe["Tags"]), color = 'r', linestyle = '-', label = "Total ammount of songs")
plt.legend(bbox_to_anchor = (1.0, 1), loc = 'upper center')
plt.show()

#### Now we will calculate how many songs can be identified with a mood at all

In [None]:
# Count the songs whose tag string contains none of the advanced mood keywords.
# The advanced keywords are lower-case, so the tag string is lower-cased before matching;
# otherwise songs whose mood tags contain capitals would wrongly be counted as "without moods".
songs_without_moods = 0

for tag_list in artists_songs_dataframe["Tags"]:
    tags_lower = tag_list.lower()
    has_mood = any(
        keyword in tags_lower
        for keywords in advanced_mood_keyword_dict.values()
        for keyword in keywords
    )
    if not has_mood:
        songs_without_moods += 1


In [None]:
# Compare the total number of songs with the number of songs that contain none / at least one
# of the mood keywords
total_songs = len(artists_songs_dataframe["Tags"])

labels = ["total ammount of songs", "without moods", "with moods"]
data = [total_songs, songs_without_moods, total_songs - songs_without_moods]

fig, ax = plt.subplots()
plt.xticks(rotation=75)
ax.bar(labels, data)
ax.set_ylabel('Number of songs')
ax.set_title('Songs without moods')
plt.show()