In [1]:
import os
import re
import sys
import time
import tomllib
from typing import List

import lyricsgenius
import numpy as np
import pandas as pd
import yaml
from dotenv import load_dotenv


# Path from this notebook's working directory up to the repository root.
root = "../../.."

# Project configuration; tomllib requires the file to be opened in binary mode.
with open(f"{root}/config.toml", "rb") as f:
    config = tomllib.load(f)

# Put the project's src/ directory on the import path so the local
# `utils` module can be imported from inside the notebook.
sys.path.append(os.path.abspath(f"{root}/src"))
from utils import (
    get_main_tag,
    shrink_genius_tag,
    song_to_csv,
)

# Load variables from a .env file into the process environment before the
# API clients are created.
# NOTE(review): Genius() is built without arguments, so it presumably reads
# its access token from the environment populated above — confirm.
load_dotenv()
genius = lyricsgenius.Genius()
public_api = lyricsgenius.PublicAPI()

def get_song_tags(song_id: int):
    """Look up a song through the public API client and return its tags.

    Args:
        song_id: Identifier of the song, forwarded to ``public_api.song``.

    Returns:
        The value stored under ``['song']['tags']`` of the API response
        (presumably a list of tag records — confirm against the API).
    """
    response = public_api.song(song_id)
    return response['song']['tags']

# Seed NumPy's global RNG so the np.random.choice sampling in the pruning
# cells is repeatable when the notebook is run top to bottom.
np.random.seed(42)

In [2]:
# Prune song list for 850 song x 5 genres
main_tags_file = f"{root}/{config['pruned_data_dir']}/main_tags_songs.csv"
main_tags = ["pop", "rock", "r-b", "country", "rap"]
song_file = f"{root}/{config['id_data_dir']}/songs.csv"

n_songs = 850

df = pd.read_csv(song_file, sep=';')

# Only rows whose genre is one of the main tags are candidates for sampling.
main_candidates = df[df["genre"].isin(main_tags)]

# `.groups` maps every genre to the index *labels* of its rows in `df`.
genre_groups = main_candidates.groupby("genre").groups

# Fail early, naming the offending genres, instead of letting
# np.random.choice raise an error that does not say which genre is too small.
too_small = {genre: len(labels) for genre, labels in genre_groups.items() if len(labels) < n_songs}
if too_small:
    raise ValueError(f"Cannot sample {n_songs} songs per genre, too few songs for: {too_small}")

# Draw n_songs distinct row labels per genre (uses the globally seeded RNG).
indexes = np.hstack([
    np.random.choice(labels, n_songs, replace=False)
    for labels in genre_groups.values()
])

# The sampled values are index labels, so select with .loc; positional .iloc
# only gives the same rows while `df` carries a default RangeIndex.
main_df: pd.DataFrame = df.loc[indexes]

print(main_df.groupby("genre").size(), f"\ntotal songs: {len(main_df)}")

# Make sure the output directory exists before writing the pruned list.
os.makedirs(os.path.dirname(main_tags_file), exist_ok=True)
main_df.to_csv(main_tags_file, sep=";", index=False)

genre
country    850
pop        850
r-b        850
rap        850
rock       850
dtype: int64 
total songs: 4250


In [3]:
# Prune song list for 150 songs per genre, over every configured genre
# except "experimental"
all_tags_file = f"{root}/{config['pruned_data_dir']}/all_tags_songs.csv"
# Filtering builds a new list (config stays untouched) and does not raise
# when "experimental" is absent from the configured genres.
all_tags = [genre for genre in config["genres"] if genre != "experimental"]
n_songs = 150

# Re-declared here so the cell always samples from the full song list,
# regardless of what `song_file` was last bound to elsewhere in the notebook.
song_file = f"{root}/{config['id_data_dir']}/songs.csv"

df = pd.read_csv(song_file, sep=';')

# Only rows whose genre is one of the selected tags are candidates for sampling.
full_candidates = df[df["genre"].isin(all_tags)]

# `.groups` maps every genre to the index *labels* of its rows in `df`.
genre_groups = full_candidates.groupby("genre").groups

# Fail early, naming the offending genres, instead of letting
# np.random.choice raise an error that does not say which genre is too small.
too_small = {genre: len(labels) for genre, labels in genre_groups.items() if len(labels) < n_songs}
if too_small:
    raise ValueError(f"Cannot sample {n_songs} songs per genre, too few songs for: {too_small}")

# Draw n_songs distinct row labels per genre (uses the globally seeded RNG).
indexes = np.hstack([
    np.random.choice(labels, n_songs, replace=False)
    for labels in genre_groups.values()
])

# The sampled values are index labels, so select with .loc; positional .iloc
# only gives the same rows while `df` carries a default RangeIndex.
full_df: pd.DataFrame = df.loc[indexes]

print(full_df.groupby("genre").size(), f"\ntotal songs: {len(full_df)}")

# Make sure the output directory exists before writing the pruned list.
os.makedirs(os.path.dirname(all_tags_file), exist_ok=True)
full_df.to_csv(all_tags_file, sep=";", index=False)

genre
blues         150
country       150
electronic    150
folk          150
jazz          150
pop           150
r-b           150
rap           150
rock          150
dtype: int64 
total songs: 1350


In [6]:
# Download lyrics for the pruned songs
lyric_dir = f"{root}/{config['pruned_data_dir']}/lyrics"
pruned_song_files = [
    f"{root}/{config['pruned_data_dir']}/all_tags_songs.csv",
    f"{root}/{config['pruned_data_dir']}/main_tags_songs.csv"
]
index_file = "06-index.txt"

# Retry policy for a single song: number of attempts and the base pause
# (in seconds) between them; the pause grows linearly with the attempt number.
max_attempts = 5
retry_delay = 2.0


def _fetch_lyrics(song_id) -> str:
    """Fetch the raw lyrics text of one song, retrying a bounded number of times.

    Args:
        song_id: Song id passed to ``genius.lyrics``.

    Returns:
        The text returned by ``genius.lyrics``, or ``""`` when the final
        attempt completed without an exception but returned nothing.

    Raises:
        RuntimeError: If the final attempt failed with an exception. The
            resume index is not advanced in that case, so re-running the cell
            continues with the same song.
    """
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            fetched = genius.lyrics(song_id)
        except Exception as e:
            last_error = e
            print(f"❌ Failed to get lyrics for song {song_id} (attempt {attempt}/{max_attempts}): {e!r}")
        else:
            last_error = None
            if fetched:
                return fetched
            print(f"❌ Failed to get lyrics for song {song_id} (attempt {attempt}/{max_attempts}): empty response")
        if attempt < max_attempts:
            time.sleep(retry_delay * attempt)

    if last_error is not None:
        raise RuntimeError(f"Giving up on song {song_id} after {max_attempts} attempts") from last_error
    return ""


def _strip_genius_wrapping(raw_lyrics: str) -> str:
    """Remove the header line and the trailing "Embed" footer from downloaded lyrics.

    The downloaded lyrics have a header in the first line and a number
    followed by "Embed" at the very end (the number presumably can be
    abbreviated like "1.2K" — confirm against current Genius pages).
    The footer is only removed when it is actually present, so text without
    it is never truncated.

    Args:
        raw_lyrics: Text as returned by the lyrics download.

    Returns:
        The lyrics without the first line and without the footer.
    """
    body = "\n".join(raw_lyrics.splitlines()[1:])
    # Optional count (digits, optional decimal part, optional "K"), then
    # "Embed", anchored at the end of the text.
    return re.sub(r"(?:\d+(?:\.\d+)?K?)?Embed\s*\Z", "", body)


# Create an output folder (including missing parents)
os.makedirs(lyric_dir, exist_ok=True)

# Get the song ids for both files.
# The loop variable is deliberately not named `song_file`, which earlier
# cells use for the path of the full song list.
song_ids = []
for pruned_file in pruned_song_files:
    song_ids.extend(pd.read_csv(pruned_file, sep=';')["id"].tolist())

# Check how many lyrics have already been downloaded
start_index = 0
if os.path.exists(index_file):
    with open(index_file, "r") as f:
        saved_index = f.read().strip()
    # An empty index file means nothing has been recorded yet.
    if saved_index:
        start_index = int(saved_index)

# Download the song lyrics
for position, song_id in enumerate(song_ids[start_index:], start=start_index + 1):

    lyrics = _strip_genius_wrapping(_fetch_lyrics(song_id))

    # Save the lyrics file with the id as the name
    with open(f"{lyric_dir}/{(int(song_id)):07d}.txt", "w", encoding="utf-8") as f:
        f.write(lyrics)

    # Save the index file so an interrupted run can resume after this song
    with open(index_file, "w") as f:
        f.write(str(position))

    print(f"✅ Saved lyrics for song {song_id} ({position}/{len(song_ids)})")

✅ Saved lyrics for song 1522030 (1424/5600)
✅ Saved lyrics for song 923243 (1425/5600)
✅ Saved lyrics for song 228580 (1426/5600)
✅ Saved lyrics for song 5111631 (1427/5600)
✅ Saved lyrics for song 898496 (1428/5600)
✅ Saved lyrics for song 198347 (1429/5600)
✅ Saved lyrics for song 814750 (1430/5600)
✅ Saved lyrics for song 717711 (1431/5600)
✅ Saved lyrics for song 732528 (1432/5600)
✅ Saved lyrics for song 336584 (1433/5600)
✅ Saved lyrics for song 599353 (1434/5600)
✅ Saved lyrics for song 198269 (1435/5600)
✅ Saved lyrics for song 428344 (1436/5600)
✅ Saved lyrics for song 580769 (1437/5600)
✅ Saved lyrics for song 207870 (1438/5600)
✅ Saved lyrics for song 132077 (1439/5600)
✅ Saved lyrics for song 2484131 (1440/5600)
✅ Saved lyrics for song 684049 (1441/5600)
✅ Saved lyrics for song 776572 (1442/5600)
✅ Saved lyrics for song 578036 (1443/5600)
✅ Saved lyrics for song 337257 (1444/5600)
✅ Saved lyrics for song 702657 (1445/5600)
✅ Saved lyrics for song 717526 (1446/5600)
✅ Saved 