In [2]:
# Fetch tags for every song
# Take the first tag as the only tag
# Remove the song from every other list

In [2]:
import os
import re
import tomllib

import lyricsgenius
import yaml
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the Genius API token comes from,
# since no token is passed to the client below - confirm.
load_dotenv()
genius = lyricsgenius.Genius()

# Project-wide settings (data directories, genre list) are read from config.toml
with open("../../config.toml", "rb") as config_file:
    config = tomllib.load(config_file)

In [3]:

public_api = lyricsgenius.PublicAPI()


def get_song_tags(song_id: int):
    """Return the tag entries attached to a song.

    Args:
        song_id: Id of the song, as exposed by ``genius.search_song(...).id``.

    Returns:
        The value stored under ``['song']['tags']`` in the public API's
        response for that song. Elsewhere in this notebook each entry is
        read as a mapping with the keys ``id``, ``name`` and ``primary``.
    """
    response = public_api.song(song_id)
    return response["song"]["tags"]

In [23]:
# Inspect the tags of a single example: the 10th entry of the pop song list
pop_songs_path = f"{config['multilabel_data_dir']}/songs/pop.yml"
with open(pop_songs_path, "r") as f:
    pop_songs = yaml.safe_load(f)
pop_songs = pop_songs[:10]

song = genius.search_song(pop_songs[9]["title"])
tags = get_song_tags(song.id)

# Show each tag name together with its primary/secondary flag
[(entry["name"], "primary" if entry["primary"] else "secondary") for entry in tags]

Searching for "Work"...
Done.


[('Duet', 'secondary'),
 ('Reggae', 'secondary'),
 ('Dance-Pop', 'secondary'),
 ('Dance', 'secondary'),
 ('Pop', 'primary'),
 ('Jamaica', 'secondary'),
 ('R&B', 'primary'),
 ('Pop-R&B', 'secondary'),
 ('Contemporary R&B', 'secondary'),
 ('Pop Rap', 'secondary'),
 ('Jamaican Patois', 'secondary'),
 ('Dancehall', 'secondary')]

How do we decide which tag is the main tag for a given song? From a close look at a few songs, we can see that the last tags seem to be the most relevant, while the first tags are increasingly specialised.
Therefore, a good candidate for the main tag is the first tag that appears in our list of defined main genres when searching from the end of the song's tag list.

In [24]:
from typing import List

# Get the song id and main genre for every song.
#
# For each configured genre the song list is read from
# <multilabel_data_dir>/songs/<genre>.yml and one {"id", "genre"} record per
# song is appended to <multilabel_data_dir>/song-genres/<genre>.yml.
# The run is resumable: records already present in the output file are
# counted and the corresponding songs are skipped.
known_genres = config["genres"]

for genre in known_genres:

    multilabel_path = f"{config['multilabel_data_dir']}/songs/{genre}.yml"
    genres_path = f"{config['multilabel_data_dir']}/song-genres/{genre}.yml"

    # The output file is opened in append mode below, which does not create
    # missing parent directories
    os.makedirs(os.path.dirname(genres_path), exist_ok=True)

    with open(multilabel_path, "r") as f:
        songs = yaml.safe_load(f)

    # Read the existing file if it exists to check where to continue
    genre_dicts = []
    if os.path.exists(genres_path):
        with open(genres_path, "r") as f:
            # safe_load returns None for an empty file
            genre_dicts = yaml.safe_load(f) or []
    start_index = len(genre_dicts)

    for i, song in enumerate(songs[start_index:]):

        song_id = None
        # None means "not fetched yet". An empty list is a valid answer (a
        # song without tags) and must not trigger another fetch.
        tag_names = None

        # Try to get the song's id, repeating for as long as the search raises
        # (e.g. on a time out)
        while song_id is None:
            try:
                searched_song = genius.search_song(song["title"])
            except Exception as e:
                print(f"❌ Failed to get song id for {song['title_with_artists']}: {e}")
                continue

            # Songs with no lyrics will not yield a song id for some reason
            # In this case, None is returned
            if not searched_song:
                # These values should later be removed from the data
                # For now "classical-music" is set as a default estimate
                # for songs with no lyrics
                song_id = -1
                tag_names = ["classical-music"]
            else:
                song_id = searched_song.id

        # Try to get the tags, repeating for as long as the request raises
        while tag_names is None:
            try:
                tag_names = [tag["name"] for tag in get_song_tags(song_id)]
            except Exception as e:
                print(f"❌ Failed to get tags for {song['title_with_artists']}: {e}")

        # Normalise the tag names to the spelling compared against the
        # configured genres
        normalised: List[str] = [name.lower().replace("&", "-") for name in tag_names]

        # Main tag: walking the tag list from its end, the first tag that is
        # itself one of the configured genres
        main_tag = next((tag for tag in reversed(normalised) if tag in known_genres), None)

        # If none of the tags matched exactly, take (again from the end) the
        # first tag that contains a configured genre as a substring.
        # If there is still no match, the genre is set to an empty string.
        if main_tag is None:
            main_tag = next(
                (g for tag in reversed(normalised) for g in known_genres if g in tag),
                "",
            )

        genre_dict = {"id": song_id, "genre": main_tag}
        genre_dicts.append(genre_dict)

        # Append the record immediately so an interrupted run can be resumed
        with open(genres_path, "a") as f:
            yaml.safe_dump([genre_dict], f)
        print(f"{(i + start_index):04d} ✅ Saved id and genre for '{song['title']}'")

# The saved ids and genres are merged into the song files in a separate cell.

Searching for "Masks"...
Done.
0962 ✅ Saved id and genre for 'Masks'
Searching for "Your New Cuckoo"...
Done.
0963 ✅ Saved id and genre for 'Your New Cuckoo'
Searching for "I See Love"...
Done.
0964 ✅ Saved id and genre for 'I See Love'
Searching for "Ramblin' On My Mind (Take 2)"...
Done.
0965 ✅ Saved id and genre for 'Ramblin' On My Mind (Take 2)'
Searching for "Feel Like Going Home"...
Done.
0966 ✅ Saved id and genre for 'Feel Like Going Home'
Searching for "Rock Me"...
Done.
0967 ✅ Saved id and genre for 'Rock Me'
Searching for "Baby, Please Don't Go"...
Done.
0968 ✅ Saved id and genre for 'Baby, Please Don't Go'
Searching for "Pay Day"...
Done.
0969 ✅ Saved id and genre for 'Pay Day'
Searching for "Hey Baby (New Rising Sun)"...
Done.
0970 ✅ Saved id and genre for 'Hey Baby (New Rising Sun)'
Searching for "Crawlin' King Snake"...
Done.
0971 ✅ Saved id and genre for 'Crawlin' King Snake'
Searching for "Iron Man"...
Done.
0972 ✅ Saved id and genre for 'Iron Man'
Searching for "Rattle

In [5]:
# Merge the files in data/song-genres with the files in data/songs.
#
# For every configured genre the records of song-genres/<genre>.yml are merged
# index by index into the song entries of songs/<genre>.yml, the song file is
# rewritten and the genre file is deleted. Genres whose genre file is already
# gone are skipped, so the cell can be re-run after an interruption.
genre_dir = f"{config['multilabel_data_dir']}/song-genres"

for genre in config["genres"]:

    song_file = f"{config['multilabel_data_dir']}/songs/{genre}.yml"
    genre_file = f"{genre_dir}/{genre}.yml"

    if not os.path.exists(genre_file):
        print(f"Skipping {genre}: {genre_file} does not exist")
        continue

    # Load the songs from data/songs
    with open(song_file, "r") as f:
        songs = yaml.safe_load(f) or []

    # Load the corresponding genres (safe_load returns None for an empty file)
    with open(genre_file, "r") as f:
        genre_dicts = yaml.safe_load(f) or []

    # Merge them; the records are matched purely by position, so the lengths
    # must agree before anything is overwritten
    assert len(genre_dicts) == len(songs), (
        f"{genre}: {len(genre_dicts)} genre records for {len(songs)} songs"
    )
    for song, genre_dict in zip(songs, genre_dicts):
        song.update(genre_dict)

    # Overwrite the old song files
    with open(song_file, "w") as f:
        yaml.safe_dump(songs, f)

    # Delete the genre file
    os.remove(genre_file)

# Remove the genre dir. os.rmdir removes only this directory, whereas
# os.removedirs would also prune parent directories that become empty.
if os.path.isdir(genre_dir):
    os.rmdir(genre_dir)

In [None]:
# Keep the song for the multiclass data set only when its main tag is the
# genre currently being processed; the whole multiclass list is rewritten
# after every addition.
# NOTE(review): none of `genre`, `main_tag`, `song`, `i`, `multiclass_songs`
# or `output_path` is assigned in this cell, and the last two are not assigned
# anywhere in the visible notebook - this looks like a leftover loop-body
# fragment; confirm before running.
if genre != main_tag:
    print(f"{i:04d} ✅ '{song['title']}' does not have {genre} as its main genre")
else:
    multiclass_songs.append(song)
    with open(output_path, "w") as out_file:
        yaml.safe_dump(multiclass_songs, out_file)
    print(f"{i:04d} ✅ Copied '{song['title']}' to multiclass {genre} songs")

In [None]:
# Download lyrics for all the songs.
#
# For every configured genre the song list is read from
# <multiclass_data_dir>/songs/<genre>.yml and each song's lyrics are written
# to <multiclass_data_dir>/lyrics/<genre>/<index>_<title>.txt, where <index>
# is the song's zero-padded position in the list.

# Footer at the very end of the downloaded text: an optional count such as
# "12" or "1.2K" directly followed by "Embed"
EMBED_FOOTER = re.compile(r"(?:\d+(?:\.\d+)?K?)?Embed$")

# Characters removed from titles before they are used as file names
INVALID_FILENAME_CHARS = str.maketrans("", "", "\"/?*|\\:<>")

for genre in config["genres"]:

    # Create the output folder (including missing parents)
    lyrics_dir = f"{config['multiclass_data_dir']}/lyrics/{genre}"
    os.makedirs(lyrics_dir, exist_ok=True)

    # Get the song list for one genre
    with open(f"{config['multiclass_data_dir']}/songs/{genre}.yml", "r") as f:
        songs = yaml.safe_load(f)

    # The API call for lyrics often times out
    # We repeat it every time it has timed out
    # NOTE(review): every exception triggers a retry, so a failure that is
    # not transient keeps the loop running on the same song.
    error = True
    while error:
        error = False

        try:
            # Resume after the highest index already on disk. Files that do
            # not start with "<digits>_" (e.g. OS metadata files) are ignored.
            saved_indices = [
                int(prefix)
                for prefix in (name.split("_", 1)[0] for name in os.listdir(lyrics_dir))
                if prefix.isdecimal()
            ]
            start_index = max(saved_indices) + 1 if saved_indices else 0

            # Download the song lyrics
            for i, song in enumerate(songs[start_index:], start=start_index):
                # NOTE(review): assumes every entry is a (title, url) pair -
                # confirm against the format of the multiclass song files
                title, url = song

                # A Page "Behind The Lyrics" appeared in the country songs
                # This is not a song and is filtered out
                if title.startswith("Behind The Lyrics"):
                    continue

                # The title sometimes has invalid chars for a file name
                # Remove these here and cap the length
                title = title.translate(INVALID_FILENAME_CHARS)[:150]

                lyrics = genius.lyrics(song_url=url)

                # Nothing came back for this url: skip the song instead of
                # failing and retrying it forever
                if not lyrics:
                    print(f"❌ No lyrics found for {title}")
                    continue

                # The downloaded lyrics have a header in the first line and
                # a number + "Embed" at the very end
                # Both are removed; text without the footer is left intact
                lyrics = "\n".join(lyrics.splitlines()[1:])
                lyrics = EMBED_FOOTER.sub("", lyrics)

                # Skip empty lyrics
                if not lyrics:
                    continue

                with open(f"{lyrics_dir}/{i:04d}_{title}.txt", "w", encoding="utf-8") as f:
                    f.write(lyrics)
                print(f"✅ Saved {i}: {title}")
        except Exception as e:
            print(f"❌ {e}")
            error = True