In [1]:
from dotenv import load_dotenv
import os
import lyricsgenius
import yaml
import tomllib

In [2]:
# Get credentials for the api calls from the .env file
# Niko has the credentials if you need them
load_dotenv()

# Get config entries
with open("../../config.toml", "rb") as f:
    config = tomllib.load(f)

In [3]:
genius = lyricsgenius.Genius()

In [4]:
# Scraping of all existing secondary tags on genius 
# has been done in src/notebooks/scrape-secondary-tags.ipynb
main_tags = ["country", "pop", "r-b", "rap", "rock"]
with open(config["data_dir"] + "/secondary_genres.yaml", "r") as f:
    secondary_tags = yaml.safe_load(f)

main_tags, secondary_tags

(['country', 'pop', 'r-b', 'rap', 'rock'],
 ['abstract-rap',
  'a-cappella',
  'acid-house',
  'acid-jazz',
  'acid-techno',
  'acoustic',
  'adult-alternative',
  'adult-contemporary',
  'afro-arab',
  'afrobeat',
  'afrobeats',
  'afro-hip-hop',
  'afro-house',
  'afro-jazz',
  'afrosoul',
  'afroswing',
  'afro-trap',
  'albanian-folk',
  'album-oriented-rock-aor',
  'algerian-rai',
  'algerian-rap',
  'alternative',
  'alternative-country',
  'alternative-dance',
  'alternative-metal',
  'alternative-pop',
  'alternative-rap',
  'alternative-r-b',
  'alternative-rock',
  'amapiano',
  'ambient',
  'ambient-dub',
  'ambient-pop',
  'americana',
  'american-folk',
  'american-underground',
  'anarcho-punk',
  'anime-lo-fi',
  'anime-rap',
  'anthem',
  'anti-folk',
  'arabic-diss',
  'arabic-instrumental',
  'arabic-pop',
  'arabic-rap',
  'arabic-rock',
  'arrocha',
  'art-pop',
  'art-punk',
  'art-rock',
  'ashiq-music',
  'atmospheric-black-metal',
  'atmospheric-sludge-metal',
 

In [7]:
# Try to find all songs for a specified genre
abstract_rap_songs = genius.tag("abstract-rap")["hits"]

In [8]:
# These are appearently not all songs yet
len(abstract_rap_songs)

20

In [9]:
feel_good_inc = abstract_rap_songs[2]
feel_good_inc

{'url': 'https://genius.com/Gorillaz-feel-good-inc-lyrics',
 'title_with_artists': 'Feel Good Inc. by Gorillaz (Ft. De La Soul)',
 'title': 'Feel Good Inc.',
 'artists': ['Gorillaz'],
 'featured_artists': ['De La Soul']}

In [10]:
# Print the lyrics for a specific song
print(genius.lyrics(song_url=feel_good_inc["url"]))

348 ContributorsTranslationsPolskiItalianoNederlandsFeel Good Inc. Lyrics[Intro: Maseo & 2-D]
Hahahahahahahahahahahahahahahahaha
Feel good
Sha, sha-ba-da, sha-ba-da-ca; feel good
Sha, sha-ba-da, sha-ba-da-ca; feel good
Sha, sha-ba-da, sha-ba-da-ca; feel good
Sha, sha-ba-da, sha-ba-da-ca; feel good
Sha, sha-ba-da, sha-ba-da-ca; feel good
Sha, sha-ba-da, sha-ba-da-ca; feel good
(Change, change, change, change)
Sha, sha-ba-da, sha-ba-da-ca; feel good
(Change, change, change, change)
Sha, sha-ba-da, sha-ba-da-ca; feel good

[Verse 1: 2-D]
City's breaking down on a camel's back
They just have to go, 'cause they don't know wack
So while you fill the streets, it's appealing to see
And you won't get out the county 'cause you're damned and free
You got a new horizon, its ephemeral style
A melancholy town where we never smile
And all I wanna hear is the message beep
My dreams, they've got to kiss me 'cause I don't get to sleep, no
(Beep)

[Chorus: 2-D]
Windmill, windmill for the land
Turn foreve

In [None]:
abstract_rap_songs[:2]

[{'url': 'https://genius.com/Childish-gambino-iv-sweatpants-lyrics',
  'title_with_artists': 'IV. Sweatpants by Childish Gambino',
  'title': 'IV. Sweatpants',
  'artists': ['Childish Gambino'],
  'featured_artists': []},
 {'url': 'https://genius.com/Tyler-the-creator-yonkers-lyrics',
  'title_with_artists': 'Yonkers by Tyler, The Creator',
  'title': 'Yonkers',
  'artists': ['Tyler, The Creator'],
  'featured_artists': []},
 {'url': 'https://genius.com/Gorillaz-feel-good-inc-lyrics',
  'title_with_artists': 'Feel Good Inc. by Gorillaz (Ft. De La Soul)',
  'title': 'Feel Good Inc.',
  'artists': ['Gorillaz'],
  'featured_artists': ['De La Soul']}]

In [None]:
# The page feature can be used to find 20 more songs
genius.tag("abstract-rap", page=2)["hits"][:2]

[{'url': 'https://genius.com/Chance-the-rapper-acid-rain-lyrics',
  'title_with_artists': 'Acid Rain by Chance the Rapper',
  'title': 'Acid Rain',
  'artists': ['Chance the Rapper'],
  'featured_artists': []},
 {'url': 'https://genius.com/A-ap-rocky-phoenix-lyrics',
  'title_with_artists': 'Phoenix by A$AP Rocky',
  'title': 'Phoenix',
  'artists': ['A$AP Rocky'],
  'featured_artists': []}]

In [6]:
# Download 1000 songs for the every genre
for genre in config["genres"]:

    # Iterate through pages of songs in genius
    titles = []
    lyrics = []
    all_songs = []
    for i in range(1, 51):
        songs_on_page = genius.tag(genre, page=i)["hits"]
        for song in songs_on_page:
            title = song["title_with_artists"]

            # A Page "Behind The Lyrics" appeared in the country songs
            # This is not a song and is filtered out
            if title.startswith("Behind The Lyrics"):
                continue

            all_songs.append(song)
            print(f"Saved '{title}'")

    with open(f"{config['multilabel_data_dir']}/songs/{genre}.yml", "w") as f:
        yaml.safe_dump(all_songs, f)
    print(f"Succsefffully saved {genre} songs")

Saved 'Despacito (Remix) by Luis Fonsi & Daddy Yankee (Ft. Justin Bieber)'
Saved 'Shape of You by Ed Sheeran'
Saved 'The Hills by The Weeknd'
Saved 'Bohemian Rhapsody by Queen'
Saved 'God's Plan by Drake'
Saved '​thank u, next by Ariana Grande'
Saved 'Drunk in Love by Beyoncé (Ft. JAY-Z)'
Saved 'Starboy by The Weeknd (Ft. Daft Punk)'
Saved '1-800-273-8255 by Logic (Ft. Alessia Cara & Khalid)'
Saved 'Work by Rihanna (Ft. Drake)'
Saved 'SAD! by XXXTENTACION'
Saved 'Too Good at Goodbyes by Sam Smith'
Saved '7 rings by Ariana Grande'
Saved 'In My Feelings by Drake'
Saved 'Trap Queen by Fetty Wap'
Saved '​​rockstar by Post Malone (Ft. 21 Savage)'
Saved 'Hotline Bling by Drake'
Saved 'Perfect by Ed Sheeran'
Saved 'Closer by The Chainsmokers (Ft. Halsey)'
Saved 'Believer by Imagine Dragons'
Saved '​i​ hate u, i love u by ​gnash (Ft. Olivia O'Brien)'
Saved 'Swimming Pools (Drank) by Kendrick Lamar'
Saved 'Money Trees by Kendrick Lamar (Ft. Jay Rock)'
Saved 'Formation by Beyoncé'
Saved 'One Dan

KeyboardInterrupt: 

In [21]:
# Download lyrics for all the songs

for genre in config["genres"]:

    # Create an output folder
    if not os.path.exists(f"{config['multilabel_data_dir']}/lyrics/{genre}"):
        os.mkdir(f"{config['multilabel_data_dir']}/lyrics/{genre}")

    # Get the song list for one genre
    with open(f"{config['multilabel_data_dir']}/songs/{genre}.yml", "r") as f:
        songs = yaml.safe_load(f)

    # The API call for lyrics often times out
    # We repeat it every time it has timed out
    error = True
    while error:
        error = False 

        try:
            # Check the already saved songs
            saved_songs = os.listdir(config["multilabel_data_dir"] + "/lyrics/" + genre)
            n_saved_songs = 0 if len(saved_songs) == 0 else max([int(song[:4]) for song in saved_songs])

            # Download the song lyrics
            for i, song in enumerate(songs[n_saved_songs:]):
                title: str = song["title"]
                url: str = song["url"]

                # The title sometimes has invalid chars for a file name
                # Remove these here
                scary_chars = "\"/?*|"
                for char in scary_chars:
                    title = title.replace(char, "")
                if len(title) > 150:
                    title = title[:150]

                lyrics: str = genius.lyrics(song_url=url)

                # The downlaoded lyrics have a header in the first line and
                # a number + "Embed" on the last line
                # These need to be removed
                lyrics = "\n".join(lyrics.splitlines()[1:])[:-5]
                
                # Skip emtpy lyrics
                if len(lyrics) == 0:
                    continue

                # Remove "Embed" + number from the end
                if lyrics[-1] == 'K':
                    lyrics = lyrics[:-3]
                while lyrics[-1].isdigit():
                    lyrics = lyrics[:-1]

                with open(f"{config['multilabel_data_dir']}/lyrics/{genre}/{(n_saved_songs + i):04d}_{title}.txt", "w", encoding="utf-8") as f:
                    f.write(lyrics)
                print(f"✅ Saved {genre} {n_saved_songs + i}: {title}")
        except Exception as e:
            print(f"❌ {e}")
            error = True


✅ Saved pop 998: I Wish
✅ Saved rock 999: ​we think too much
✅ Saved r-b 999: WEST DISTRICT
✅ Saved rap 999: All My Friends
✅ Saved blues 998: Hesitation Blues
✅ Saved jazz 998: Midnight Blue
✅ Saved experimental 996: Eat, Sleep, Wake (Nothing But You)
✅ Saved folk 999: Young & Free
✅ Saved classical-music 998: Blackout
✅ Saved electronic 999: Summertime
