In [2]:
import os
import lyricsgenius
from dotenv import load_dotenv
import yaml
import tomllib
import numpy as np
from typing import List
import sys

root = "../../.."

sys.path.append(os.path.abspath(f"{root}/src"))
from utils import get_main_tag, shrink_genius_tag

load_dotenv()

genius = lyricsgenius.Genius()
public_api = lyricsgenius.PublicAPI()

with open(f"{root}/config.toml", "rb") as f:
    config = tomllib.load(f)

def get_song_tags(song_id: int):
    song = public_api.song(song_id)['song']
    return song['tags']

In [25]:
np.random.seed(42)
sample = np.random.permutation(int(1e6))
n_songs = 50000

for genre in config["genres"]:

    song_file = f"{root}/{config['id_data_dir']}/songs.csv"
    tag_file = f"{root}/{config['id_data_dir']}/tags.yml"

    start_index = 0
    if not os.path.exists(song_file):
        with open(song_file, "w") as f:
            f.write("id;title;artists;langauge;genre\n")
    else:
        with open(song_file, "r", encoding="utf-8") as f:
            start_index = len(f.readlines()) - 1

    songs_to_save: List[dict] = []
    for i, id in enumerate(sample[start_index:]):

        song = None
        song_to_save = {}

        try:
            song = genius.song(id)["song"]
        except Exception as e:
            print(f"⚠️ No song with id {id}")
            continue
        
        # Get a list of tags from the public API
        while not song_to_save.get("tags"):
            try:
                song_to_save["tags"] = [shrink_genius_tag(tag) for tag in get_song_tags(id)]
            except Exception as e:
                print(f"❌ Failed to get tag list for song with id {id}")

        song_to_save["id"] = song["id"]
        song_to_save["title"] = song["title"]
        song_to_save["artists"] = song["artist_names"]
        song_to_save["language"] = song["language"]
        song_to_save["genre"] = get_main_tag(song_to_save["tags"])
        songs_to_save.append(song_to_save)

        # Save to file for every fifth song
        if i % 5 == 0:

            csv_strings = [f"{song['id']};{song['title']};{song['artists']};{song['language']};{song['genre']}\n" for song in songs_to_save]
            with open(song_file, "a", encoding="utf-8") as f:
                f.writelines(csv_strings)

            with open(tag_file, "a", encoding="utf-8") as f:
                yaml.safe_dump(dict([[song["id"], song["tags"]] for song in songs_to_save]), f)

            songs_to_save = []

            print(f"✅ Saved {start_index + i + 1}th song")

⚠️ No song with id 256147
⚠️ No song with id 69254
⚠️ No song with id 678875
⚠️ No song with id 275405
✅ Saved 125th song
⚠️ No song with id 265973
✅ Saved 130th song
⚠️ No song with id 565510
⚠️ No song with id 103660
✅ Saved 135th song
⚠️ No song with id 331227
⚠️ No song with id 606357
✅ Saved 140th song
⚠️ No song with id 246895
✅ Saved 145th song
⚠️ No song with id 314674
⚠️ No song with id 582946
✅ Saved 150th song
⚠️ No song with id 673532
⚠️ No song with id 733810
⚠️ No song with id 570266
⚠️ No song with id 711524
⚠️ No song with id 430193
✅ Saved 165th song
⚠️ No song with id 123551
✅ Saved 170th song
⚠️ No song with id 348666
⚠️ No song with id 344474
✅ Saved 175th song
⚠️ No song with id 531787
⚠️ No song with id 556418
⚠️ No song with id 25768
⚠️ No song with id 470128
✅ Saved 185th song
⚠️ No song with id 694206
✅ Saved 190th song
⚠️ No song with id 303097
⚠️ No song with id 727932
✅ Saved 195th song
✅ Saved 200th song
⚠️ No song with id 265704
⚠️ No song with id 198493
✅

In [22]:
import pandas as pd

df = pd.read_csv(song_file, sep=';', index_col="id")
df.groupby(by="genre").size()

genre
pop     33
r-b      1
rap     47
rock    17
dtype: int64

In [None]:
# Download lyrics for all the songs
for genre in config["genres"]:

    lyric_dir = f"{root}/{config['id_data_dir']}/lyrics"

    # Create an output folder
    if not os.path.exists(lyric_dir):
        os.mkdir(lyric_dir)

    # Get the song list for one genre
    song_ids = list(pd.read_csv(song_file, sep=';')["id"])

    # Check how many lyrics have already been downloaded
    start_index = os.listdir(lyric_dir)

    # Download the song lyrics
    for song_id in song_ids[start_index:]:
        lyrics: str = genius.lyrics(song_id)

        # The downlaoded lyrics have a header in the first line and
        # a number + "Embed" on the last line
        # These need to be removed
        lyrics = "\n".join(lyrics.splitlines()[1:])[:-5]

        # Remove "Embed" + number from the end
        if lyrics[-1] == 'K':
            lyrics = lyrics[:-3]
        while lyrics[-1].isdigit():
            lyrics = lyrics[:-1]

        # Save the lyrics file with the id as the name
        with open(f"{root}/{config['id_data_dir']}/lyrics/{(song_id):07d}.txt", "w", encoding="utf-8") as f:
            f.write(lyrics)

        print(f"✅ Saved song {song_id}")