In [1]:
import os
import lyricsgenius
from dotenv import load_dotenv
import yaml
import tomllib
import numpy as np
from typing import List
import sys

root = "../../.."

sys.path.append(os.path.abspath(f"{root}/src"))
from utils import get_main_tag, shrink_genius_tag

load_dotenv()

genius = lyricsgenius.Genius()
public_api = lyricsgenius.PublicAPI()

with open(f"{root}/config.toml", "rb") as f:
    config = tomllib.load(f)

def get_song_tags(song_id: int):
    song = public_api.song(song_id)['song']
    return song['tags']

In [5]:
np.random.seed(42)
sample = np.random.permutation(int(1e6))
n_songs = 50000

for genre in config["genres"]:

    song_file = f"{root}/{config['id_data_dir']}/songs.csv"
    tag_file = f"{root}/{config['id_data_dir']}/tags.yml"

    start_index = 0
    if not os.path.exists(song_file):
        with open(song_file, "w") as f:
            f.write("id;title;artists;langauge;genre\n")
    else:
        with open(song_file, "r", encoding="utf-8") as f:
            start_index = len(f.readlines()) - 1

    songs_to_save: List[dict] = []
    for i, id in enumerate(sample[start_index:]):

        song = None
        song_to_save = {}

        try:
            song = genius.song(id)["song"]
        except Exception as e:
            print(f"⚠️ No song with id {id}")
            continue
        
        # Get a list of tags from the public API
        while not song_to_save.get("tags"):
            try:
                song_to_save["tags"] = [shrink_genius_tag(tag) for tag in get_song_tags(id)]
            except Exception as e:
                print(f"❌ Failed to get tag list for song with id {id}")

        song_to_save["id"] = song["id"]
        song_to_save["title"] = song["title"]
        song_to_save["artists"] = song["artist_names"]
        song_to_save["language"] = song["language"]
        song_to_save["genre"] = get_main_tag(song_to_save["tags"])
        songs_to_save.append(song_to_save)

        # Save to file for every fifth song
        if i % 5 == 0:
            
            csv_strings = []
            for song in songs_to_save:
                csv_values: List[str] = [str(song["id"]), song["title"], song["artists"], song["language"], song["genre"]]
                csv_values = [value.replace(';', ':') for value in csv_values]
                csv_string = ';'.join(csv_values) + '\n'
                csv_strings.append(csv_string)

            with open(song_file, "a", encoding="utf-8") as f:
                f.writelines(csv_strings)

            with open(tag_file, "a", encoding="utf-8") as f:
                yaml.safe_dump(dict([[song["id"], song["tags"]] for song in songs_to_save]), f)

            songs_to_save = []

            print(f"✅ Saved {start_index + i + 1}th song")

✅ Saved 858th song
⚠️ No song with id 469957
✅ Saved 863th song
⚠️ No song with id 644497
✅ Saved 868th song
✅ Saved 873th song
⚠️ No song with id 659639
⚠️ No song with id 561974


AttributeError: 'NoneType' object has no attribute 'replace'

In [6]:
import pandas as pd

df = pd.read_csv(song_file, sep=';', index_col="id")

# Somehow there are duplicate lines in the csv file
# these are cleared here
df = df.drop_duplicates()
df.to_csv(song_file, sep=';')

df.groupby(by="genre").size()

genre
country        21
electronic      1
folk            1
pop           279
r-b            25
rap           338
rock          122
dtype: int64

In [7]:
# Download lyrics for all the songs
for genre in config["genres"]:

    lyric_dir = f"{root}/{config['id_data_dir']}/lyrics"

    # Create an output folder
    if not os.path.exists(lyric_dir):
        os.mkdir(lyric_dir)

    # Get the song list for one genre
    song_ids = list(pd.read_csv(song_file, sep=';')["id"])

    # Check how many lyrics have already been downloaded
    start_index = len(os.listdir(lyric_dir))

    # Download the song lyrics
    for i, song_id in enumerate(song_ids[start_index:]):

        lyrics = ""
        while not lyrics:
            try:
                lyrics: str = genius.lyrics(song_id)
            except Exception as e:
                print(f"❌ Failed to get lyrics for song {song_id}")

        # The downlaoded lyrics have a header in the first line and
        # a number + "Embed" on the last line
        # These need to be removed
        lyrics = "\n".join(lyrics.splitlines()[1:])[:-5]

        if not len(lyrics) == 0:
            # Remove "Embed" + number from the end
            if lyrics[-1] == 'K':
                lyrics = lyrics[:-3]
            while lyrics[-1].isdigit():
                lyrics = lyrics[:-1]

        # Save the lyrics file with the id as the name
        with open(f"{root}/{config['id_data_dir']}/lyrics/{(int(song_id)):07d}.txt", "w", encoding="utf-8") as f:
            f.write(lyrics)

        print(f"✅ Saved lyrics for song {song_id} ({start_index + i + 1}/{len(song_ids)})")

✅ Saved lyrics for song 576649 (282/871)
✅ Saved lyrics for song 857348 (283/871)
✅ Saved lyrics for song 13996 (284/871)
✅ Saved lyrics for song 18042 (285/871)
✅ Saved lyrics for song 3918 (286/871)
✅ Saved lyrics for song 14062 (287/871)
✅ Saved lyrics for song 817044 (288/871)
✅ Saved lyrics for song 809181 (289/871)
✅ Saved lyrics for song 403044 (290/871)
✅ Saved lyrics for song 384262 (291/871)
✅ Saved lyrics for song 249054 (292/871)
✅ Saved lyrics for song 67298 (293/871)
✅ Saved lyrics for song 339778 (294/871)
✅ Saved lyrics for song 853198 (295/871)
✅ Saved lyrics for song 171973 (296/871)
✅ Saved lyrics for song 330756 (297/871)
✅ Saved lyrics for song 313679 (298/871)
✅ Saved lyrics for song 81095 (299/871)
✅ Saved lyrics for song 82748 (300/871)
✅ Saved lyrics for song 139686 (301/871)
✅ Saved lyrics for song 489619 (302/871)
✅ Saved lyrics for song 506531 (303/871)
✅ Saved lyrics for song 779454 (304/871)
✅ Saved lyrics for song 183513 (305/871)
✅ Saved lyrics for song 

In [None]:
# TODO: wenig besetzte genres auffüllen