### In the first phase of the script we will form a subset of the complete dataset:
- Prune genres that occur very seldom or that cannot be mapped to a more common genre
- Detect non-English songs
- Prune artists that have fewer than n songs or fewer than m English songs

In [3]:
# start by defining the helper functions that save variables to pickle files and load them back later on

import pickle
def writePickle(Variable, fname):
    """Serialize ``Variable`` with pickle into ``pickle_vars/<fname>.pkl``.

    Args:
        Variable: any picklable object.
        fname: base name of the target file, without directory or
            ``.pkl`` extension.

    Raises:
        FileNotFoundError: if there is no ``pickle_vars`` directory under the
            current working directory (it is not created here).
    """
    filename = fname + ".pkl"
    # the context manager closes the file even when pickle.dump raises
    with open("pickle_vars/" + filename, 'wb') as f:
        pickle.dump(Variable, f)
def readPickle(fname):
    """Load and return the object stored in ``pickle_vars/<fname>.pkl``.

    Args:
        fname: base name of the pickle file, without directory or
            ``.pkl`` extension.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    filename = "pickle_vars/" + fname + ".pkl"
    # NOTE: unpickling can execute arbitrary code; only load files you created yourself
    # the context manager closes the file even when pickle.load raises
    with open(filename, 'rb') as f:
        return pickle.load(f)
def readPicklefromPast(fname):
    """Load and return the object stored in ``../pickle_vars/<fname>.pkl``.

    Same as ``readPickle`` except that the ``pickle_vars`` directory is looked
    up in the parent of the current working directory.

    Args:
        fname: base name of the pickle file, without directory or
            ``.pkl`` extension.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    filename = "../pickle_vars/" + fname + ".pkl"
    # NOTE: unpickling can execute arbitrary code; only load files you created yourself
    # the context manager closes the file even when pickle.load raises
    with open(filename, 'rb') as f:
        return pickle.load(f)

The following is the whole dataset

In [None]:
# Load the complete dataset from ../pickle_vars/lyrics_dict.pkl.
# The cells below iterate it as artist -> record; the exact record layout is
# only known from how it is indexed there — TODO confirm against the scraper.
lyrics_dict = readPicklefromPast('lyrics_dict')

Then get some of the necessary information existing in the whole dataset and group them in several dictionaries that consist of artists as the keys, and the information pieces as the values. Record these dictionaries as pickle files

In [None]:
# Per-artist lookup tables; every dictionary below is keyed by artist
artist_to_lyrics = {}           # artist -> list containing the lyrics of each of her songs
artist_to_genre = {}            # artist -> the genre that is attributed to her the most
artist_to_numberofsongs = {}    # artist -> number of collected songs belonging to her
artist_to_numberofalbums = {}   # artist -> number of collected albums belonging to her
artist_to_years = {}            # artist -> list containing the year of each collected song

for artist, content in lyrics_dict.items():
    # content[2] holds the albums; each album keeps its songs in album[1]
    songs = [song for album in content[2].values() for song in album[1].values()]
    artist_to_lyrics[artist] = [song[0] for song in songs]
    artist_to_years[artist] = [song[1] for song in songs]
    artist_to_genre[artist] = content[0]
    artist_to_numberofalbums[artist] = content[-2]
    artist_to_numberofsongs[artist] = content[-1]

In [None]:
# persist every per-artist table as its own pickle file, named after the variable
for table_name, table in (
    ('artist_to_lyrics', artist_to_lyrics),
    ('artist_to_genre', artist_to_genre),
    ('artist_to_numberofsongs', artist_to_numberofsongs),
    ('artist_to_numberofalbums', artist_to_numberofalbums),
    ('artist_to_years', artist_to_years),
):
    writePickle(table, table_name)

In [None]:
# total the per-artist song counts and report the size of the collection
song_count = sum(artist_to_numberofsongs[artist] for artist in artist_to_lyrics)
print("Now there are", song_count, "songs from", len(artist_to_lyrics), "artists")

Now there are 559987 songs from 6352 artists

Continue by pruning the artists that belong to genres that do not occur frequently in the dataset

In [None]:
import numpy as np

# every distinct genre starts out with zeroed song / album / artist tallies
unique_genres = list(set(artist_to_genre.values()))
genre_to_numberofsongs = dict.fromkeys(unique_genres, 0)
genre_to_numberofalbums = dict.fromkeys(unique_genres, 0)
genre_to_numberofartists = dict.fromkeys(unique_genres, 0)

# each artist contributes herself, her songs and her albums to her genre
for artist, genre in artist_to_genre.items():
    genre_to_numberofartists[genre] += 1
    genre_to_numberofsongs[genre] += artist_to_numberofsongs[artist]
    genre_to_numberofalbums[genre] += artist_to_numberofalbums[artist]

# (genre, count) pairs in ascending order of count
sorted_artists = sorted(genre_to_numberofartists.items(), key=lambda pair: pair[1])
sorted_songs = sorted(genre_to_numberofsongs.items(), key=lambda pair: pair[1])
sorted_albums = sorted(genre_to_numberofalbums.items(), key=lambda pair: pair[1])

In [None]:
import collections

# sorted_artists has one (genre, count) pair per genre, so the mapping's size is the genre count
genre_total = len(collections.OrderedDict(sorted_artists))
print("There are in total", genre_total, "different genres detected")

Before pruning, there are 440 different genres detected.

In [None]:
# partition all genre categories into genres that occur less than 11 times, and those that occur
# at least 11 times; a genre "occurs" once per artist it is attributed to
insufficient_genres = []
sufficient_genres = []
# sorted_artists holds the (genre, number of artists) pairs computed above
for genre, artist_count in sorted_artists:
    if artist_count < 11:
        insufficient_genres.append(genre)
    else:
        sufficient_genres.append(genre)

print("There are",len(sufficient_genres), "genres that occur at least 11 times in the whole dataset. \
These genres will be mapped into more comprehensive genre classes, or will be discarded in cases where the genre \
does not comply with our interests")
print("There are",len(insufficient_genres),"genres that occur less than 11 times in the whole dataset. \
These will be discarded")

There are 40 genres that occur at least 11 times in the whole dataset. These genres will be mapped into more comprehensive genre classes, or will be discarded in cases where the genre does not comply with our interests
There are 400 genres that occur less than 11 times in the whole dataset. These will be discarded

In [None]:
# lookup table that folds each retained genre into one of the desired genre classes
genre_mapping = {
    "Folk Rock": "Rock",
    "Soft Rock": "Rock",
    "Britpop": "Pop",
    "Black Metal": "Metal",
    "Classic Rock": "Rock",
    "Dance-Pop": "Pop",
    "Electro": "Electronic",
    "Post-Rock": "Rock",
    "Power Pop": "Pop",
    "Rock 'N' Roll": "Rock",
    "Psychedelic Rock": "Rock",
    "Thrash Metal": "Metal",
    "American Folk": "Folk",
    "Dream Pop": "Pop",
    "Art Rock": "Rock",
    "Reggae": "Reggae",
    "Indie Folk": "Folk",
    "Jazz": "Jazz & Blues",
    "Death Metal": "Metal",
    "Progressive Rock": "Rock",
    "Pop Rock": "Pop",
    "Electronic": "Electronic",
    "Hard Rock": "Rock",
    "R&B": "R&B",
    "Indie Pop": "Pop",
    "Heavy Metal": "Metal",
    "Blues": "Jazz & Blues",
    "Folk": "Folk",
    "Country": "Country",
    "Indie Rock": "Rock",
    "Hip Hop": "Hip Hop & Rap",
    "Alternative Rock": "Rock",
    "Rock": "Rock",
    "Pop": "Pop",
}

# genres that are dropped outright because they match none of the initial target genre categories
uncategorized_genres = [
    "New Age",
    "Ska Punk",
    "Beat",
    "Trip Hop",
    "Experimental",
    "Funk",
]

In [None]:
# artists to discard later on: their genre either occurs very rarely or is not classified
genre_pruned_artist_list = [
    artist
    for artist, genre in artist_to_genre.items()
    if genre in insufficient_genres or genre in uncategorized_genres
]

print("There are", len(genre_pruned_artist_list), "artists that will be pruned due to belonging to insufficient genres")

There are 983 artists that will be pruned due to belonging to insufficient genres

Now get all the songs in a dictionary by their ids if the artist of that song is not pruned by genre

In [None]:
# song id -> lyrics, restricted to artists that survived the genre pruning
song_id_dict = {}
for artist, content in lyrics_dict.items():
    if artist in genre_pruned_artist_list:
        continue  # artist was dropped because of her genre
    for album in content[2].values():
        for song in album[1].values():
            # song[3] is the song id, song[0] the lyrics
            song_id_dict[song[3]] = song[0]

Now there are 460498 songs that belong to any desired genre

In [None]:
# Display the lyrics stored under song id '200' as a quick look at one entry
# (the id is looked up as a string key)
song_id_dict['200']

In [None]:
# Save the song id -> lyrics mapping to pickle_vars/song_id_dict.pkl for later use
writePickle(song_id_dict, "song_id_dict")

Use langdetect tool to spot songs that are not in English

In [None]:
from langdetect import detect

# Collect the ids of all songs whose lyrics are not detected as English.
# NOTE(review): langdetect documents its detection as non-deterministic unless
# DetectorFactory.seed is set, so the counts may vary slightly between runs.
non_english_song_ids = []
for song_id, lyrics in song_id_dict.items():
    try:
        lang = detect(lyrics)
    except Exception:
        # Best effort: lyrics that cannot be analysed at all are treated as non-English.
        # Catching Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit through, so this long-running loop can still be stopped.
        lang = None
    if lang != 'en':
        non_english_song_ids.append(song_id)

Out of a total of 460498 songs, 98278 are not classified as in English

Also create a dictionary where keys are artists and values are a list of song ids that belong to each artist. Then using the non-English song ids list, find out how many songs of those artists are actually in English. If that value is below a certain threshold, remove that artist from the initial dataset, along with other non English songs.

In [None]:
# artist -> list of her song ids; genre-pruned artists are kept with an empty list
artist_to_song_id = {}
for artist, content in lyrics_dict.items():
    song_ids = []
    artist_to_song_id[artist] = song_ids
    if artist in genre_pruned_artist_list:
        continue
    for album in content[2].values():
        # song[3] is the song id
        song_ids.extend(song[3] for song in album[1].values())

In [None]:
MIN_SONGS_PER_ARTIST = 150          # "n": minimum number of songs of any language
MIN_ENGLISH_SONGS_PER_ARTIST = 150  # "m": minimum number of English songs

# Build the lookup set once: testing membership against the list would rescan
# the whole list for every single song id.
non_english_lookup = set(non_english_song_ids)

artists_and_english_song_ids = dict()
# iterate over a snapshot, because entries are deleted from artist_to_song_id inside the loop
replica = dict(artist_to_song_id)
for artist, song_id_list in replica.items():
    if len(song_id_list) < MIN_SONGS_PER_ARTIST:
        # first prune the artists that have less than n songs anyway
        del artist_to_song_id[artist]
        continue
    english_ids = [s_id for s_id in song_id_list if s_id not in non_english_lookup]
    # then keep only the artists that still have at least m English songs;
    # the others stay in artist_to_song_id but get no entry here
    if len(english_ids) >= MIN_ENGLISH_SONGS_PER_ARTIST:
        artists_and_english_song_ids[artist] = english_ids

After removing the non-English songs and artists, there are 147470 songs and 544 artists left.

In [None]:
# persist the variables created in these steps, each under its own name
for pickle_name, value in (
    ("genre_pruned_artist_list", genre_pruned_artist_list),
    ("non_english_song_ids", non_english_song_ids),
    # artists whose total number of songs is above the threshold
    ("artist_to_song_id", artist_to_song_id),
    # artists whose number of English songs is above the threshold
    ("artists_and_english_song_ids", artists_and_english_song_ids),
):
    writePickle(value, pickle_name)

### In the second phase, we will check each artist with her English song ids, and prune those artists that do not comply with several song length criteria. Some of the criteria are:
- The number of lines in a song should be at least l and at most m
- The maximum number of tokens in a line should not be more than k

In [None]:
# Load the previously formed pickle variable that maps each song id to its lyrics
# (read from ../pickle_vars/all_songids2_lyrics.pkl).
# NOTE(review): this is not the "song_id_dict" pickle written earlier in this notebook;
# presumably it holds the same kind of id -> lyrics mapping from an earlier run — confirm.
ids_to_lyrics_dict = readPicklefromPast("all_songids2_lyrics")

In [None]:
# Load the spaCy English model (tokenizer and POS tagger);
# `nlp` is used below to split each lyric line into tokens
import en_core_web_sm
nlp = en_core_web_sm.load()

Due to time constraints, the following calculation will be carried out in a separate python script named 'Lyrics_size_constraints.py' and the resulting dictionary will be saved to a pickle file named "size_constrained_artists_to_ids.pkl". Please refer to that script that can be found in the same folder. You can also check the following cell to see the details of the script.

In [None]:
MIN_SONG_LINES = 10    # songs with fewer lines are ignored
MAX_SONG_LINES = 100   # songs with more lines are ignored
MAX_LINE_TOKENS = 20   # songs containing a line with more tokens are ignored

counter = 0
size_constrained_artists_to_ids = dict()
for artist, song_list in artists_and_english_song_ids.items():
    size_constrained_artists_to_ids[artist] = list()
    for song_id in song_list:
        counter += 1
        print(counter)  # progress indicator: number of songs examined so far
        lines = ids_to_lyrics_dict[song_id].split('<>')
        # check whether the second line gives song writer info. if so, remove the first two lines.
        # The length guard keeps lyrics without any '<>' separator from raising an IndexError;
        # the prefix 'Writer' also matches 'Writers'.
        if len(lines) > 1 and lines[1].startswith('Writer'):
            lines = lines[2:]
        song_length = len(lines) - 1  # the last piece is always blank in the dataset
        if song_length < MIN_SONG_LINES or song_length > MAX_SONG_LINES:
            continue
        # Only the number of tokens per line matters, so tokenize without running
        # the rest of the spaCy pipeline on every line.
        longest_line = max(len(nlp.make_doc(line)) for line in lines[:-1])
        if longest_line <= MAX_LINE_TOKENS:
            size_constrained_artists_to_ids[artist].append(song_id)



In [None]:
# Alternatively, load the dictionary formed in the separate script from
# pickle_vars/size_constrained_artists_to_ids.pkl; this rebinds the name
# that the previous cell builds
size_constrained_artists_to_ids = readPickle("size_constrained_artists_to_ids")

Finally detect the artists with at least 150 songs in the collection, select randomly 150 of those songs per artist, and create the final selections.

In [None]:
import random
SONGS_PER_ARTIST = 150  # artists with fewer eligible songs are left out; the others contribute exactly this many

final_constrained_artist2idlist_dict = dict()
for artist, song_list in size_constrained_artists_to_ids.items():
    if len(song_list) >= SONGS_PER_ARTIST:
        # random.sample draws without replacement and leaves song_list untouched
        # (random.shuffle reorders in place and returns None, so its result cannot be kept)
        final_constrained_artist2idlist_dict[artist] = random.sample(song_list, SONGS_PER_ARTIST)

In [None]:
# Save the artist -> selected song ids mapping to
# pickle_vars/final_constrained_artist2idlist_dict.pkl
writePickle(final_constrained_artist2idlist_dict, "final_constrained_artist2idlist_dict")

Similarly, instead of mapping the artists to a list of song ids, map each artist to a list of her song lyrics, and store this dictionary into a pickle file

In [None]:
# swap every selected song id for its lyrics, keeping the per-artist grouping and order
final_artist2lyrics_dict = {
    artist: [song_id_dict[song_id] for song_id in song_list]
    for artist, song_list in final_constrained_artist2idlist_dict.items()
}

writePickle(final_artist2lyrics_dict, "final_artist2lyrics_dict")