In [1]:
from dotenv import load_dotenv
import os
import lyricsgenius
import yaml

In [2]:
# Get credentials for the api calls from the .env file
# Niko has the credentials if you need them
load_dotenv()
DATA_PATH = "../../data"
GENRES = ["pop", "rock", "r-b", "country", "rap", "blues", "jazz", "experimental", "folk", "classical-music", "electronic"]

In [3]:
genius = lyricsgenius.Genius()

In [4]:
# Scraping of all existing secondary tags on genius 
# has been done in src/notebooks/scrape-secondary-tags.ipynb
main_tags = ["country", "pop", "r-b", "rap", "rock"]
with open(DATA_PATH + "/secondary_genres.yaml", "r") as f:
    secondary_tags = yaml.safe_load(f)

main_tags, secondary_tags

(['country', 'pop', 'r-b', 'rap', 'rock'],
 ['abstract-rap',
  'a-cappella',
  'acid-house',
  'acid-jazz',
  'acid-techno',
  'acoustic',
  'adult-alternative',
  'adult-contemporary',
  'afro-arab',
  'afrobeat',
  'afrobeats',
  'afro-hip-hop',
  'afro-house',
  'afro-jazz',
  'afrosoul',
  'afroswing',
  'afro-trap',
  'albanian-folk',
  'album-oriented-rock-aor',
  'algerian-rai',
  'algerian-rap',
  'alternative',
  'alternative-country',
  'alternative-dance',
  'alternative-metal',
  'alternative-pop',
  'alternative-rap',
  'alternative-r-b',
  'alternative-rock',
  'amapiano',
  'ambient',
  'ambient-dub',
  'ambient-pop',
  'americana',
  'american-folk',
  'american-underground',
  'anarcho-punk',
  'anime-lo-fi',
  'anime-rap',
  'anthem',
  'anti-folk',
  'arabic-diss',
  'arabic-instrumental',
  'arabic-pop',
  'arabic-rap',
  'arabic-rock',
  'arrocha',
  'art-pop',
  'art-punk',
  'art-rock',
  'ashiq-music',
  'atmospheric-black-metal',
  'atmospheric-sludge-metal',
 

In [5]:
# Try to find all songs for a specified genre
abstract_rap_songs = genius.tag("abstract-rap")["hits"]

In [6]:
# These are appearently not all songs yet
len(abstract_rap_songs)

20

In [7]:
feel_good_inc = abstract_rap_songs[2]
feel_good_inc

{'url': 'https://genius.com/Gorillaz-feel-good-inc-lyrics',
 'title_with_artists': 'Feel Good Inc. by Gorillaz (Ft. De La Soul)',
 'title': 'Feel Good Inc.',
 'artists': ['Gorillaz'],
 'featured_artists': ['De La Soul']}

In [8]:
# Print the lyrics for a specific song
print(genius.lyrics(song_url=feel_good_inc["url"]))

Timeout: Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)

In [None]:
abstract_rap_songs[:2]

[{'url': 'https://genius.com/Childish-gambino-iv-sweatpants-lyrics',
  'title_with_artists': 'IV. Sweatpants by Childish Gambino',
  'title': 'IV. Sweatpants',
  'artists': ['Childish Gambino'],
  'featured_artists': []},
 {'url': 'https://genius.com/Tyler-the-creator-yonkers-lyrics',
  'title_with_artists': 'Yonkers by Tyler, The Creator',
  'title': 'Yonkers',
  'artists': ['Tyler, The Creator'],
  'featured_artists': []},
 {'url': 'https://genius.com/Gorillaz-feel-good-inc-lyrics',
  'title_with_artists': 'Feel Good Inc. by Gorillaz (Ft. De La Soul)',
  'title': 'Feel Good Inc.',
  'artists': ['Gorillaz'],
  'featured_artists': ['De La Soul']}]

In [None]:
# The page feature can be used to find 20 more songs
genius.tag("abstract-rap", page=2)["hits"][:2]

[{'url': 'https://genius.com/Chance-the-rapper-acid-rain-lyrics',
  'title_with_artists': 'Acid Rain by Chance the Rapper',
  'title': 'Acid Rain',
  'artists': ['Chance the Rapper'],
  'featured_artists': []},
 {'url': 'https://genius.com/A-ap-rocky-phoenix-lyrics',
  'title_with_artists': 'Phoenix by A$AP Rocky',
  'title': 'Phoenix',
  'artists': ['A$AP Rocky'],
  'featured_artists': []}]

In [10]:
# Download 1000 songs for the every genre
for genre in GENRES:

    # Iterate through pages of songs in genius
    titles = []
    lyrics = []
    all_songs = []
    for i in range(1, 51):
        pop_songs = genius.tag(genre, page=i)["hits"]
        for song in pop_songs:
            title = song["title_with_artists"]
            all_songs.append(song)
            print(f"Saved '{title}'")

    with open(f"{DATA_PATH}/songs/{genre}.yml", "w") as f:
        yaml.safe_dump(all_songs, f)
    print(f"Succsefffully saved {genre} songs")

Saved 'Closer by The Chainsmokers (Ft. Halsey)'
Saved 'FRIENDS by Marshmello & Anne-Marie'
Saved 'Get Lucky by Daft Punk (Ft. Nile Rodgers & Pharrell Williams)'
Saved 'Something Just Like This by The Chainsmokers & Coldplay'
Saved 'As It Was by Harry Styles'
Saved 'Die For You by The Weeknd'
Saved '​bad guy by Billie Eilish'
Saved 'Nights by Frank Ocean'
Saved 'Let Me Love You by DJ Snake (Ft. Justin Bieber)'
Saved 'Happier by Marshmello & Bastille'
Saved 'Feeling Myself by Nicki Minaj (Ft. Beyoncé)'
Saved 'Poker Face by Lady Gaga'
Saved 'Collard Greens by ScHoolboy Q (Ft. Kendrick Lamar)'
Saved 'Heathens by ​twenty one pilots'
Saved 'Baby by Justin Bieber (Ft. Ludacris)'
Saved 'Blood On the Leaves by Kanye West'
Saved 'SLOW DANCING IN THE DARK by Joji'
Saved '​hot girl bummer by ​blackbear'
Saved 'Slide by Calvin Harris (Ft. Frank Ocean & Migos)'
Saved 'It Ain't Me by Kygo & Selena Gomez'
Saved '​bury a friend by Billie Eilish'
Saved 'Truffle Butter by Nicki Minaj (Ft. Drake & Lil Way

In [15]:
# Download lyrics for all the songs

for genre in GENRES:

    # Create an output folder
    if not os.path.exists(f"{DATA_PATH}/lyrics/{genre}"):
        os.mkdir(f"{DATA_PATH}/lyrics/{genre}")

    # Get the song list for one genre
    with open(f"{DATA_PATH}/songs/{genre}.yml", "r") as f:
        songs = yaml.safe_load(f)

    # The API call for lyrics often times out
    # We repeat it every time it has timed out
    error = True
    while error:
        error = False 

        try:
            # Check the already saved songs
            saved_songs = os.listdir(DATA_PATH + "/lyrics/" + genre)
            n_saved_songs = 0 if len(saved_songs) == 0 else max([int(song[:4]) for song in saved_songs])

            # Download the song lyrics
            for i, song in enumerate(songs[n_saved_songs:]):
                title, url = song
                title: str

                # A Page "Behind The Lyrics" appeared in the country songs
                # This is not a song and is filtered out
                if title.startswith("Behind The Lyrics"):
                    continue

                # The title sometimes has invalid chars for a file name
                # Remove these here
                scary_chars = "\"/?*|"
                for char in scary_chars:
                    title = title.replace(char, "")
                if len(title) > 150:
                    title = title[:150]

                lyrics: str = genius.lyrics(song_url=url)

                # The downlaoded lyrics have a header in the first line and
                # a number + "Embed" on the last line
                # These need to be removed
                lyrics = "\n".join(lyrics.splitlines()[1:])[:-5]
                
                # Skip emtpy lyrics
                if len(lyrics) == 0:
                    continue

                # Remove "Embed" + number from the end
                if lyrics[-1] == 'K':
                    lyrics = lyrics[:-3]
                while lyrics[-1].isdigit():
                    lyrics = lyrics[:-1]

                with open(f"{DATA_PATH}/lyrics/{genre}/{(n_saved_songs + i):04d}_{title}.txt", "w", encoding="utf-8") as f:
                    f.write(lyrics)
                print(f"✅ Saved {n_saved_songs + i}: {title}")
        except Exception as e:
            print(f"❌ {e}")
            error = True


✅ Saved 998: ​making the bed by Olivia Rodrigo
✅ Saved 999: Civil War by Guns N' Roses
✅ Saved 999: Chivalry Is Dead by Trevor Wesley
✅ Saved 994: Hello Beautiful by Noah Schnacky
✅ Saved 31: Without Me by Eminem
✅ Saved 32: Alexander Hamilton by Leslie Odom, Jr., Anthony Ramos, Daveed Diggs, Okieriete Onaodowan, Lin-Manuel Miranda, Phillipa Soo, Christopher Jackson & Orig
✅ Saved 33: The Real Slim Shady by Eminem
✅ Saved 34: XXX. by Kendrick Lamar (Ft. U2)
❌ Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
✅ Saved 34: XXX. by Kendrick Lamar (Ft. U2)
❌ Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
✅ Saved 34: XXX. by Kendrick Lamar (Ft. U2)
✅ Saved 35: Old Town Road (Remix) by Lil Nas X (Ft. Billy Ray Cyrus)
✅ Saved 36: I'm the One by DJ Khaled (Ft. Chance the Rapper, Justin Bieber, Lil Wayne & Quavo)
✅ Saved 37: This Is America by Childish Gambino
✅ Saved 38: ​goosebumps by Tra