In [2]:
import os
import lyricsgenius
from dotenv import load_dotenv
import yaml
import tomllib
import numpy as np
from typing import List
import sys

root = "../../.."

sys.path.append(os.path.abspath(f"{root}/src"))
from utils import get_main_tag, shrink_genius_tag

load_dotenv()

genius = lyricsgenius.Genius()
public_api = lyricsgenius.PublicAPI()

with open(f"{root}/config.toml", "rb") as f:
    config = tomllib.load(f)

def get_song_tags(song_id: int):
    song = public_api.song(song_id)['song']
    return song['tags']

In [13]:
np.random.seed(42)
sample = np.random.permutation(int(1e6))
n_songs = 50000

for genre in config["genres"]:

    song_file = f"{root}/{config['id_data_dir']}/songs.csv"
    tag_file = f"{root}/{config['id_data_dir']}/tags.yml"

    start_index = 0
    if not os.path.exists(song_file):
        with open(song_file, "w") as f:
            f.write("id;title;artists;langauge;genre\n")
    else:
        with open(song_file, "r") as f:
            start_index = len(f.readlines()) - 1

    songs_to_save: List[dict] = []
    for i, id in enumerate(sample[start_index:]):

        song = None
        song_to_save = {}

        try:
            song = genius.song(id)["song"]
        except Exception as e:
            print(f"⚠️ No song with id {id}")
            continue
        
        # Get a list of tags from the public API
        while not song_to_save.get("tags"):
            try:
                song_to_save["tags"] = [shrink_genius_tag(tag) for tag in get_song_tags(id)]
            except Exception as e:
                print(f"❌ Failed to get tag list for song with id {id}")

        song_to_save["id"] = song["id"]
        song_to_save["title"] = song["title"]
        song_to_save["artists"] = song["artist_names"]
        song_to_save["language"] = song["language"]
        song_to_save["genre"] = get_main_tag(song_to_save["tags"])
        songs_to_save.append(song_to_save)

        # Save to file for every fifth song
        if i % 5 == 0:

            csv_strings = [f"{song['id']};{song['title']};{song['artists']};{song['language']};{song['genre']}\n" for song in songs_to_save]
            with open(song_file, "a", encoding="utf-8") as f:
                f.writelines(csv_strings)

            with open(tag_file, "a", encoding="utf-8") as f:
                yaml.safe_dump(dict([[song["id"], song["tags"]] for song in songs_to_save]), f)

            songs_to_save = []

            print(f"✅ Saved {i + 1}th song")

✅ Saved 1th song
⚠️ No song with id 567130
✅ Saved 6th song
⚠️ No song with id 731479
✅ Saved 11th song
⚠️ No song with id 401397
✅ Saved 16th song
⚠️ No song with id 577745
⚠️ No song with id 659806
⚠️ No song with id 625691
✅ Saved 26th song
⚠️ No song with id 443417
⚠️ No song with id 336983
⚠️ No song with id 518812
✅ Saved 31th song
⚠️ No song with id 677814
✅ Saved 36th song
⚠️ No song with id 66336
⚠️ No song with id 443181
⚠️ No song with id 244518
⚠️ No song with id 321941
⚠️ No song with id 684487
✅ Saved 51th song
⚠️ No song with id 507014
⚠️ No song with id 568670
⚠️ No song with id 855771
⚠️ No song with id 606372
✅ Saved 61th song
✅ Saved 66th song
✅ Saved 71th song
⚠️ No song with id 926625
⚠️ No song with id 244299
✅ Saved 81th song
⚠️ No song with id 309023
✅ Saved 86th song
⚠️ No song with id 268240
✅ Saved 91th song
⚠️ No song with id 586917
✅ Saved 96th song
⚠️ No song with id 550670
⚠️ No song with id 584291
✅ Saved 101th song
⚠️ No song with id 460373
✅ Saved 106t

KeyboardInterrupt: 