In [1]:
import os #needed for the API for example (to access environment variables)
from dotenv import load_dotenv #needed to load environmet variables to this project
import tensorflow as tf #for the RNN (recurring neural network)
import keras #for the RNN (recurring neural network)
from keras import layers #for the RNN (recurring neural network)
import numpy as np
import os #to interact with the folders of this project
import time #this will allow us to create a time buffer between each lyrics extraction 
import lyricsgenius as lg #(to work with the genius.com API)
import requests #to request data from Genius API (sending http requests)
import json #to read the extracted info from the API (encoding and decoding the info we got)
import pandas as pd
from requests.exceptions import SSLError #to handle exceptions when fetching data from Genius
import re
import logging #for the logbook I created to record the errors while fetching lyrics from the API genius




In [2]:
load_dotenv()

BASE_URL = "https://api.genius.com"
genius_client_id = os.environ.get('CLIENT_ID')
genius_client_secret = os.environ.get('CLIENT_SECRET')
genius_client_token = os.getenv("CLIENT_ACCESS_TOKEN")

In [3]:
print(f"Token: {genius_client_token}")

Token: V7dLhYuBu1ypbz_8VCDW34aX-SfxvIQP4TEAJZrFlFblIpku8xJSONri9_yPPkoD


# Cleaning special characters from artists' names:

In [13]:
 def clean_artist_name(artist_name):
    # Replace special characters with underscores
    cleaned_name = re.sub(r"[^\w\s]", "_", artist_name)
    return cleaned_name

# Setting the request to fetch metadata from lyricsgenius:

In [14]:
BASE_URL = "https://api.genius.com"
DATA_FOLDER = "data"  # we are saving the outputs of this code in our data folder

def _get(path, params=None, headers=None):
    
    """A function to send an http get request to our genius API based on a url that we've built."""
    
    requrl = '/'.join([BASE_URL, path])
    token = "Bearer {}".format(genius_client_token)
    if headers:
        headers['Authorization'] = token
    else:
        headers = {"Authorization": token}

    response = requests.get(url=requrl, params=params, headers=headers)
    response.raise_for_status()

    return response.json()

def clean_artist_name(artist_name):
    
    """Since I was having some issues with names like 2Pac, Lil` Kim and others with special 
    characters in their names, this codes handles that by replacing the spec chars with an "_"
    """
    
    cleaned_name = re.sub(r"[^\w\s]", "_", artist_name)
    return cleaned_name




def get_artist_songs(artist_name):
    
    """A function to identify the artists songs in Genius. Each song as a unique ID"""
    
    print(f"Searching {artist_name}'s artist id.\n")

    find_id = _get("search", {'q': artist_name})
    print("Search response:", find_id)  # Add this line to print the response

    for hit in find_id["response"]["hits"]:
        if hit["result"]["primary_artist"]["name"] == artist_name:
            artist_id = hit["result"]["primary_artist"]["id"]
            break

    print(f"-> {artist_name}'s id is {artist_id}\n")
    print("Getting song ids.\n")

    current_page = 1
    next_page = True
    songs = []

    while next_page:
        path = f"artists/{artist_id}/songs/"
        params = {'page': current_page}
        data = _get(path=path, params=params)

        page_songs = data['response']['songs']

        if page_songs:
            songs += page_songs
            current_page += 1
        else:
            next_page = False

    songs = [song["id"] for song in songs if song["primary_artist"]["id"] == artist_id]

    return songs


def get_song_information(song_ids, artist_name):
    
    """Retrieves information about each song given a list of song IDs."""
    
    print("Getting meta data of each song.\n")
    
    song_list = {}

    for i, song_id in enumerate(song_ids):
        print(f"id:{song_id} start. ->")

        path = f"songs/{song_id}"
        data = _get(path=path)["response"]["song"]

        song_list.update({
            i: {
                "title": data["title"],
                "album": data["album"]["name"] if data["album"] else "<single>",
                "release_date": data["release_date"] if data["release_date"] else "unidentified",
                "featured_artists": [feat["name"] for feat in data["featured_artists"]],
                "producer_artists": [feat["name"] for feat in data["producer_artists"]],
                "writer_artists": [feat["name"] for feat in data["writer_artists"]],
                "genius_track_id": song_id,
                "genius_album_id": data["album"]["id"] if data["album"] else "none"
            }
        })

        print(f"-> id:{song_id} is finished.\n")

    return song_list

# Storing the metadata into a JSON in the data folder:

In [15]:
def get_tracks_info_for_artist(artist_name):
    
    """
    A function that compiles the http request, artists and songs ID's and song's metadata retrieval
    """
    
    cleaned_artist_name = clean_artist_name(artist_name)
    
    song_ids = get_artist_songs(artist_name)

    data_folder_path = os.path.join(os.getcwd(), DATA_FOLDER)
    os.makedirs(data_folder_path, exist_ok=True)

    song_ids_filename = f"{cleaned_artist_name} Genius Song IDs.txt"
    song_ids_filepath = os.path.join(data_folder_path, song_ids_filename)

    with open(song_ids_filepath, "w") as f:
        f.write(str(song_ids))

    print(song_ids)
    print("\n-> Got all the song ids. Take a break for a while.\n")

    time.sleep(30)

    full_list_of_songs = get_song_information(song_ids, artist_name)

    songs_filename = f"{cleaned_artist_name} Songs.json"
    songs_filepath = os.path.join(data_folder_path, songs_filename)

    with open(songs_filepath, "w") as f:
        json.dump(full_list_of_songs, f)

    print("-> Finished! Data stored in JSON and txt files in the 'data' folder.\n")
    print(f"-> Check out the '{cleaned_artist_name} Songs.json' file and '{cleaned_artist_name} Genius Song IDs.txt' file.")


# Loading the JSON into the metadata df

### Creating the dataframe that will be the container for all the metadata:

In [16]:
songs_metadata_df = pd.DataFrame(columns=[
    "title", "album", "release_date", "featured_artists",
    "producer_artists", "writer_artists", "genius_track_id", "genius_album_id","artist"])
songs_metadata_df

Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id,artist


### List with all the dumped JSON

In [17]:
#we've got a list of all the extracted metadata:
artists_songs_metadata = [
    "2Pac Songs", "21 Savage Songs", "50 Cent Songs", "A Tribe Called Quest Songs",
    "A_AP Rocky Songs", "Aesop Rock Songs", "Atmosphere Songs", "Azealia Banks Songs",
    "Baby Keem Songs", "Big Daddy Kane Songs", "Big L Songs", "Big Pun Songs",
    "Big Sean Songs", "Cardi B Songs", "Chance the Rapper Songs", "Chief Keef Songs",
    "Childish Gambino Songs", "Common Songs", "Cordae Songs", "Cypress Hill Songs",
    "Danny Brown Songs", "De La Soul Songs", "DMX Songs", "Doja Cat Songs",
    "Dr_ Dre Songs", "Drake Songs", "Earl Sweatshirt Songs", "Eminem Songs",
    "Eric B_ _ Rakim Songs", "Future Songs", "Ghostface Killah Songs", "Gucci Mane Songs",
    "Hopsin Songs", "Ice Cube Songs", "Ice Spice Songs", "J. Cole Songs",
    "Jack Harlow Songs", "JAY_Z Songs", "Jeezy Songs", "Joey Bada__ Songs",
    "Joyner Lucas Songs", "Juice WRLD Songs", "Kanye West Songs", "Kendrick Lamar Songs",
    "Kid Cudi Songs", "KMD Songs", "Lauryn Hill Songs", "Lil Nas X Songs",
    "Lil Uzi Vert Songs", "Lil Wayne Songs", "Lil Yachty Songs", "Lil_ Kim Songs",
    "Logic Songs", "Lupe Fiasco Songs", "Mac Miller Songs", "Masta Ace Songs",
    "Megan Thee Stallion Songs", "MF DOOM Songs", "Migos Songs", "Missy Elliott Songs",
    "Mobb Deep Songs", "Nas Songs", "Nipsey Hussle Songs", "OutKast Songs",
    "Playboi Carti Songs", "Pop Smoke Songs", "Public Enemy Songs", "Pusha T Songs",
    "Queen Latifah Songs", "Run_DMC Songs", "Scarface Songs", "Slick Rick Songs",
    "Snoop Dogg Songs", "Talib Kweli Songs", "The Notorious B.I.G. Songs", "The Pharcyde Songs",
    "Travis Scott Songs", "Vince Staples Songs", "Wu_Tang Clan Songs", "Yasiin Bey Songs",
    "Young Thug Songs"]


print(f"it's got {len(artists_songs_metadata)} artists!")


it's got 81 artists!


### Automating a function that loads the artist JSON into a Dataframe and perfoms some initial cleaning

In [18]:
def load_artist_json(artist_name):
    # Load the JSON from our data folder
    json_file_path = f"data/{artist_name}.json"
    with open(json_file_path, "r", encoding="utf-8") as file:
        json_data = json.load(file)

    # Convert the loaded JSON data to a DataFrame and transpose it
    df = pd.DataFrame(json_data).transpose()  # transpose because columns were rows and vice versa

    # adding a new column "artist" filled with the artist's name
    df['artist'] = artist_name

    # dropping all the rows where the "album" column contains "<single>"
    df = df[df['album'] != '<single>']

    # calculating the count of each album
    album_counts = df['album'].value_counts()

    # filtering out rows where the album count is less than 6
    df = df[df['album'].map(album_counts) >= 6]

    # converting the release_date column to year only
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').dt.year
    df['release_date'] = df['release_date'].replace(np.nan, '').astype(str).str.split('.').str[0].replace('unidentified', '')

    return df


### Automating a function that adds (concatenates) the artist dataframe to the main songs_metadata_df and cleans some additional rows

In [19]:
for artist_name in artists_songs_metadata:
    # Load and clean data for each artist
    result_df = load_artist_json(artist_name)
    
    # Append the cleaned dataframe to songs_metadata_df
    songs_metadata_df = pd.concat([songs_metadata_df, result_df], ignore_index=True)
    

    #big chunk of code incoming: essentially I want to drop the " Songs" part from the artist name
    #I also want just the release dates that actually have a year an want them in integers, not strings
    #but will do a caroussel to first delete the spaces and then back to integers:
# Remove the " Songs" from the artist column:
songs_metadata_df['artist'] = songs_metadata_df['artist'].str.replace(' Songs', '')

#removing the rows where no release date:
songs_metadata_df = songs_metadata_df.dropna(subset=['release_date'])

#removingrows where release_date is empty or contains only whitespace
songs_metadata_df = songs_metadata_df[songs_metadata_df['release_date'].str.strip() != '']

#convert 'release_date' column to datetime
songs_metadata_df['release_date'] = pd.to_datetime(songs_metadata_df['release_date'], errors='coerce')

#to extract the year from the 'release_date' column
songs_metadata_df['release_date'] = songs_metadata_df['release_date'].dt.year

#let's drop the rows with NaN in 'release_date'
songs_metadata_df.dropna(subset=['release_date'], inplace=True)

#it'll help me later on to sort it by artist:
songs_metadata_df = songs_metadata_df.sort_values(by='artist') 

#and finally convert it back to 'release_date' to integers
songs_metadata_df['release_date'] = songs_metadata_df['release_date'].astype('Int64')

#reseting the index:
songs_metadata_df.reset_index(drop=True, inplace=True)

FileNotFoundError: [Errno 2] No such file or directory: 'data/2Pac Songs.json'

In [10]:
songs_metadata_df

Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id,artist


## Some extra ad-hoc human based cleaning:

In [54]:
# I want to print lists for each artist with their unique albums so that I can, for each list, decide what albums to keep:
unique_artists = songs_metadata_df['artist'].unique()
artist_albums = {artist: [] for artist in unique_artists}
for _, row in songs_metadata_df.iterrows():
    artist = row['artist']
    album = row['album']
    
    if album not in artist_albums[artist]:
        artist_albums[artist].append(album)
for artist, albums in artist_albums.items():
    print(f"{artist}_albums: {albums}\n")

2Pac_albums: ['R U Still Down? (Remember Me)', 'All Eyez On Me', '2Pac Live', 'Live at the House Of Blues', 'Nu-Mixx Klazzics', 'Strictly 4 My N.I.G.G.A.Z...', 'The Don Killumanati: The 3 Day Theory', 'Until the End of Time', 'Immortal', 'The Rose, Vol. 2: Music Inspired by Tupac’s Poetry', 'The Rose That Grew From Concrete, Vol. 1', 'Better Dayz', 'Loyal To The Game', 'The Don Killuminati: The 7 Day Theory', '2Pacalypse Now', 'Troublesome ’21', 'Me Against The World', 'Greatest Hits', 'Tales Of A 90’s N.I.G.G.A.', 'In His Own Words', 'Pac’s Life', 'Out on Bail', 'Resurrection', 'Nu Mixx Klazzics, Vol. 2', 'The Rose That Grew From Concrete', 'One Nation: Volume 1 (7 Dayz) (Unreleased)', 'Beginnings: The Lost Tapes 1988-1991']

21 Savage_albums: ['Free Guwop EP', 'i am﹥i was', 'Issa Album', 'Slaughter King', '21S3*', 'The Slaughter Tape']

50 Cent_albums: ['Get Rich Or Die Tryin’', 'Guess Who’s Back? Again', 'Guess Who’s Back?', 'Before I Self Destruct (2008)', 'The Massacre', 'Greatest

### Following is a sequence of the final lists of albums I want to keep for each artist. These lists where defined manually by observation of what albums make sense to keep.

In [55]:
_2Pac = ['R U Still Down? (Remember Me)', 'All Eyez On Me', '2Pac Live', 'Nu-Mixx Klazzics', 'Strictly 4 My N.I.G.G.A.Z...', 'The Don Killumanati: The 3 Day Theory', 'Greatest Hits', 'Until the End of Time', 'Better Dayz', 'Loyal To The Game', 'The Don Killuminati: The 7 Day Theory', '2Pacalypse Now', 'Troublesome ’21', 'Me Against The World', 'Resurrection']

In [56]:
_21Savage = ['i am﹥i was', 'Issa Album', 'Slaughter King', 'The Slaughter Tape']

In [57]:
_50Cent = ['Get Rich Or Die Tryin’', 'Guess Who’s Back?', '50 Cent Is The Future', 'The Massacre', 'Curtis', 'Before I Self Destruct', 'The Lost Tape', 'Power of the Dollar', 'War Angel LP', 'The Kanan Tape', 'The Big 10', 'No Mercy No Fear', 'Bulletproof: The Mixtape', 'Elephant in the Sand (G Unit Volume II)', 'Forever King']

In [58]:
_ATribeCalledQuest = ['Beats, Rhymes, and Life', 'The Love Movement', 'Midnight Marauders', 'People’s Instinctive Travels and the Paths of Rhythm', 'We got it from Here... Thank You 4 Your service', 'The Low End Theory', 'Revised Quest For The Seasoned Traveller', 'The Lost Tribes']

In [59]:
_A_APRocky = ['LONG.LIVE.A$AP (Deluxe)', 'TESTING', 'LIVE.LOVE.A$AP', 'AT.LONG.LAST.A$AP']

In [60]:
_AesopRock = ['Skelethon', 'Bazooka Tooth', 'Spirit World Field Guide', 'None Shall Pass', 'Float', 'Labor Days', 'Daylight - EP', 'The Impossible Kid', 'Fast Cars, Danger, Fire and Knives - EP']

In [61]:
_Atmosphere = ['Overcast!', 'Satan Hates Beauty', 'Sad Clown Bad Year (#9-#12 Collection)', 'God Loves Ugly', 'The Family Sign', 'Fishing Blues', 'Seven’s Travels', 'You Can’t Imagine How Much Fun We’re Having', 'Mi Vida Local', 'Southsiders', 'Whenever', 'When Life Gives You Lemons, You Paint That Shit Gold']
_AzealiaBanks = ['Broke with Expensive Taste', 'Slay-Z', 'Fantasea II: The Second Wave', 'Fantasea']
_BabyKeem = ['The Melodic Blue', 'DIE FOR MY BITCH', 'Oct']
_BigDaddyKane = ['Veteranz Day', 'Daddy’s Home', 'Long Live the Kane', 'It’s a Big Daddy Thing', 'Taste of Chocolate', 'Prince of Darkness', 'Looks Like a Job For...']
_BigL = ['Lifestylez ov da Poor & Dangerous', 'The Danger Zone', 'The Big Picture']
_BigPun = ['Yeeeah Baby', 'Capital Punishment', 'Endangered Species']
_BigSean = ['Detroit', 'Finally Famous (Super Deluxe Edition)', 'Hall of Fame (Deluxe)', 'Dark Sky Paradise (Deluxe)', 'I Decided.', 'Detroit 2']
_CardiB = ['Invasion of Privacy']
_ChanceTheRapper = ['10 Day', 'The Big Day', 'Acid Rap', 'Coloring Book']
_ChiefKeef = ['Finally Rich (Exclusive Edition)', 'Back from the Dead', 'Bang 3', 'Finally Rollin 2 (Glo’d Up Deluxe Edition)', 'Thot Breaker', 'The Cozart', 'Dedication', 'Mansion Musick']
_ChildishGambino = ['3.15.20', 'Camp (Deluxe Version)', 'STN MTN / Kauai', '“Awaken, My Love!”', 'Because the Internet', 'Guava Island']

In [62]:
_Common = ['One Day It’ll All Make Sense', 'Nobody’s Smiling', 'A Beautiful Revolution, Pt. 2', 'A Beautiful Revolution Pt 1', 'Black America Again', 'Like Water for Chocolate', 'Universal Mind Control', 'Resurrection', 'Can I Borrow a Dollar?', 'Electric Circus', 'Be', 'The Dreamer/The Believer', 'Finding Forever', 'Let Love']
_Cordae = ['The Lost Boy', 'From a Birds Eye View']
_CypressHill = ['Cypress Hill IV', 'Black Sunday', 'Skull & Bones', 'Stoned Raiders', 'Till Death Do Us Part', 'Rise Up', 'Elephants on Acid', 'Cypress Hill III: Temples of Boom', 'Unreleased & Revamped', 'Cypress Hill', 'Back in Black', 'Los Grandes Éxitos en Español']
_DannyBrown = ['Old', 'XXX', 'uknowhatimsayin¿', 'Atrocity Exhibition']
_DeLaSoul = ['Buhloone Mindstate', 'Stakes Is High', 'De La Soul is Dead', '3 Feet High and Rising', 'Art Official Intelligence: Mosaic Thump', 'AOI: Bionix', 'The Impossible: Mission (TV Series: Pt. 1)', 'Breakadawn', 'and the Anonymous Nobody...', 'The Grind Date']
_DMX = ['Flesh of My Flesh, Blood of My Blood', '...And Then There Was X', 'The Great Depression', 'It’s Dark and Hell is Hot', 'Grand Champ', 'Year of the Dog... Again', 'Exodus']
_DojaCat = ['Hot Pink (Apple Music Deluxe)', 'Planet Her', 'Amala']
_DrDre = ['2001', 'Compton', 'The Chronic']
_Drake = ['If You’re Reading This It’s Too Late', 'Nothing Was the Same', 'More Life', 'Certified Lover Boy', 'Scorpion', 'Views', 'Thank Me Later', 'So Far Gone', 'Take Care', 'Dark Lane Demo Tapes']
_EarlSweatshirt = ['Doris', 'FEET OF CLAY', 'I Don’t Like Shit, I Don’t Go Outside: An Album by Earl Sweatshirt', 'Some Rap Songs', 'Earl']
_Eminem = ['Recovery', 'Infinite', 'Relapse', 'Music to Be Murdered By', 'The Slim Shady LP', 'The Marshall Mathers LP', 'Revival', 'The Marshall Mathers LP 2 (Deluxe Edition)', 'Encore', 'Kamikaze', 'The Eminem Show']
_EricBRakim = ['Paid in Full', 'Follow the Leader', 'Don’t Sweat the Technique']

In [63]:
_Future = ['High Off Life', 'Monster', 'BEASTMODE 2', 'Dirty Sprite', '56 Nights', 'I NEVER LIKED YOU', 'Astronaut Status', '1000', 'Beast Mode', 'Future Hndrxx Presents: The WIZRD', 'EVOL', 'Purple Reign', 'Pluto', 'SUPERFLY (Original Motion Picture Soundtrack)', 'Streetz Calling', 'Honest', 'True Story', 'Kno Mercy', 'DS2 (Deluxe)', 'Black Woodstock: The Soundtrack', 'F.B.G: The Movie', 'No Sleep', 'HNDRXX', 'FUTURE', 'SAVE ME']
_GhostfaceKillah = ['Ironman', 'Apollo Kids', 'Fishscale', 'More Fish', 'Supreme Clientele', 'Ghostdini: Wizard of Poetry in Emerald City', '36 Seasons', 'The Pretty Toney Album', 'GhostDeini the Great', 'Hidden Darts: Special Edition', 'The Lost Tapes', 'Ghostface Killahs', 'Bulletproof Wallets', 'The Big Doe Rehab']
_GucciMane = ['Trap-Tacular', 'Writing On The Wall', 'So Icy Gang, Vol. 1', '1017 Mafia', 'La Flare', 'East Atlanta Santa 3', 'Back to the Traphouse', 'No Pad, No Pencil', 'The Return of Mr. Zone 6', 'Wilt Chamberlain, Pt. 2', 'Trap-A-Thon', 'Goochsomnia: Addicted to Gucci', 'Chicken Talk 2', 'East Atlanta Santa', 'Who Framed Radric Davis ', 'So Icy Boyz: The Finale', 'Droptopwop', 'Trap God 3', 'Chicken Talk', 'Burrrprint (2) HD', 'World War 3, Vol. 1: Lean', 'Trap God', 'WOPTOBER', 'The Movie (Gangsta Grillz)', 'Brick Factory: Volume 1', 'Brick Factory: Volume 3', 'Breakfast ', 'So Icy Christmas', 'The State vs. Radric Davis (Deluxe)', 'The State vs. Radric Davis (Instrumentals)', 'Hard to Kill', 'Trapology', 'Gucci Sosa', 'Diary of a Trap God', 'Gangsta Grillz: The Movie Part 2 (The Sequel)', 'Trap House 4', 'The Return of Mr. Perfect', 'All Things White', 'Views from Zone 6', 'The State vs Radric Davis 2: The Caged Bird Sings ', 'I’m Up', 'Lebron Gucci James', 'Everybody Looking', 'World War 3, Vol. 2: Molly', 'Trap Back', 'Mr. Davis', 'Hood Classics 3', 'Wilt Chamberlain, Pt. 6', 'Delusions of Grandeur', 'World War 3, Vol. 3: Gas ', 'Brick Factory, Vol. 2', 'The Return of East Atlanta Santa', 'The Spot Soundtrack', 'Trust God Fuck 12', 'Ice Attack ', 'Ferrari Music', 'Woptober II', 'Ice Attack 2', 'Trap God 2', 'Evil Genius', 'The Oddfather', 'Trap House', 'Murder Was the Case', 'So Icy Gang: The ReUp', 'Hood Classics 2', 'From Zone 6 To Duval', 'Jewelry Selection', 'Gucci vs. Guwop', 'Gucci Mane Presents: So Icy Summer', 'Gucci 2 Time', 'EA Sportscenter', 'Mr Clean, The Middle Man', 'Ice Daddy', 'Writing on the Wall 2', 'The Burrprint (The Movie 3D)', 'Trap House III', 'Trap Back 2 ', '1017 Up Next ', 'Dinner ', 'Lunch', 'Trap House 5: The Final Chapter', 'Gucci 3D', 'Mr. Zone 6', 'Wilt Chamberlain', 'Bird Flu ', 'El Gato: The Human Glacier', '80 Gz', 'East Atlanta Santa 2: The Night GuWop Stole X-Mas', 'Brick Squad Mafia', 'Mr. Perfect', 'King Gucci', 'Atlanta Gave Me Vision', 'The Appeal: Georgia’s Most Wanted (Deluxe)', 'Sinden Presents: Free Gucci II: The Burrrtish Edition']
_Hopsin = ['Emurge', 'L.A.U.S.D. Result', 'No Shame', 'Raw', 'Knock Madness', 'Gazing at the Moonlight', 'Pound Syndrome', 'Ill Mind of Hopsin Saga']
_IceCube = ['War & Peace Vol. 2 (The Peace Disc)', 'Bootlegs & B-Sides', 'Laugh Now, Cry Later', 'Death Certificate', 'I Am the West', 'AmeriKKKa’s Most Wanted', 'Everythang’s Corrupt', 'Remain Calm: The Preload to Everythang’s Corrupt', 'War & Peace Vol. 1 (The War Disc)', 'Raw Footage', 'DJ Drank’s Greatest Malt Liquor Hits', 'Lethal Injection', 'The Predator', 'Kill At Will']
_IceSpice = ['Like..? (Deluxe)']
_JCole = ['2014 Forest Hills Drive', 'Forest Hills Drive: Live from Fayetteville, NC', 'KOD', 'Friday Night Lights', '4 Your Eyez Only', 'The Off-Season', 'Born Sinner', 'Cole World: The Sideline Story', 'The Warm Up', 'The Come Up']
_JackHarlow = ['Thats What They All Say', 'Sweet Action', 'Extra Credit', 'Loose', 'Finally Handsome', 'Jackman.', 'CONFETTI', 'Come Home The Kids Miss You', 'Gazebo', '18']
_JAYZ = ['The Blueprint² The Gift & The Curse', 'Reasonable Doubt', 'Kingdom Come', '4:44', 'The Black Album', 'The Blueprint', 'The Blueprint 3', 'In My Lifetime, Vol. 1', 'American Gangster', 'Vol.\xa03… Life and Times of S.\xa0Carter', 'Vol.\xa02… Hard Knock Life', 'The S. Carter Collection', 'Magna Carta... Holy Grail', 'Demo Tape', 'The Dynasty: Roc La Familia']
_Jeezy = ['TM104: The Legend of the Snowman', 'Seen It All: The Autobiography', 'TM:103 Hustlerz Ambition', 'The Inspiration', 'Can’t Ban the Snowman', 'Let’s Get It: Thug Motivation 101', 'The Real Is Back', 'Trap or Die 3', 'It’s Tha World', 'The Real Is Back 2', 'The Last Laugh', 'The Recession 2', 'The Recession', 'PRESSURE', 'Life Of A Trapstar (Mixtape)', 'Twenty/20 Pyrex Vision', 'Gangsta Party', 'Trap Or Die 2: By Any Means Necessary', '1000 Grams', 'Church In These Streets', 'Trap Or Die', 'I Might Forgive... But I Don’t Forget', 'Thuggin Under The Influence', 'Snowman - Mo Icey']
_JoeyBadass = ['Summer Knights', 'ALL-AMERIKKKAN BADA$$', 'B4.DA.$$', 'Rejex', '1999']
_JoynerLucas = ['ADHD', 'Listen To Me', 'Along Came Joyner', 'Backwords', 'LFO’s (Low Frequency Oscillators)', 'EVOLUTION', '508-507-2209']
_JuiceWRLD = ['Death Race for Love', 'Goodbye & Good Riddance', 'Legends Never Die', 'The Party Never Ends']

In [64]:
_KanyeWest = ['I’m Good...', 'Kanye West’s Visionary Streams of Consciousness', 'Donda', 'The Life of Pablo', 'Late Registration', 'The College Dropout', '“College Dropout” Instrumentals', 'Late Orchestration', 'ye', 'My Beautiful Dark Twisted Fantasy', '808s & Heartbreak', 'VH1 Storytellers', 'Get Well Soon...', 'Graduation', "97' Beat Tape", 'Yandhi', 'Yeezus', 'Donda 2', 'Can’t Tell Me Nothing', 'G.O.O.D. Fridays', 'JESUS IS KING', 'Donda (Deluxe)', 'The College Dropout: Video Anthology']
_KendrickLamar = ['Section.80', 'O(verly) D(edicated)', 'To Pimp a Butterfly', 'C4', 'Training Day', 'Mr. Morale & The Big Steppers', 'good kid, m.A.A.d city (Deluxe Version)', 'Y.H.N.I.C. (Hub City Threat: Minor of the Year)', 'DAMN.', 'Kendrick Lamar', 'Compton State of Mind', 'untitled unmastered.']
_KidCudi = ['Dat Kid From Cleveland', 'Man on the Moon III: The Chosen', 'A Kid Named Cudi', 'Speedin’ Bullet 2 Heaven', 'Indicud', 'Man on the Moon: The End of Day', 'Man on the Moon II: The Legend of Mr. Rager', 'Passion, Pain & Demon Slayin’', 'Entergalactic', 'KiD CuDi presents SATELLITE FLIGHT: The journey to Mother Moon', 'CUDDER IS BACK', 'Rap Hard']
_KMD = ['Mr. Hood', 'BL_CK B_ST_RDS', 'BL_CK B_ST_RDS (Deluxe Edition)', 'Black Bastards Ruffs + Rares']
_LaurynHill = ['J. Period Presents... Best of Lauryn Hill, Vol. 1: Fire ', 'MTV Unplugged No. 2.0', 'Nina Revisited: A Tribute to Nina Simone', 'The Miseducation of Lauryn Hill']
_LilNasX = ['MONTERO', 'LNX2*', '7', 'NASARATI']
_LilUziVert = ['Luv Is Rage 3', 'Forever Young', 'Luv Is Rage 2', 'Luv Is Rage', 'Pink Tape', 'DP vs. The World', 'The Real Uzi', 'Lil Uzi Vert vs. The World', 'Eternal Atake', 'Barter 16', 'Eternal Atake (Deluxe) - LUV vs. The World 2', 'RED & WHITE', 'Purple Thoughtz EP, Vol. 1', 'The Perfect LUV Tape']
_LilWayne = ['The Dedication', 'The Drought is Over 4: Return to the Carter 3 Sessions', 'No Ceilings 3 [A-Side]', 'Dedication 6: Reloaded', 'Tha Carter III (Deluxe)', 'The Carter 2 Part 2: Like Father, Like Son', '500 Degreez', 'Dedication 6', 'Tha Carter IV', 'The Suffix ', 'Lights Out', 'Dedication 4', 'Dedication 3', 'Sorry 4 the Wait 2', 'I Can’t Feel My Face-Off (Unofficial)', 'The W. Carter Collection', 'Tha Carter', 'Dedication 5', 'Young Money: The Mixtape, Vol. 1', 'Funeral (Deluxe)', 'The W. Carter Collection 2 ', 'Lil Weezy Ana', 'Rebirth', 'The Drought is Over 5: Grand Closing', 'No Ceilings 2', 'Da Drought 3', 'I Am Not a Human Being II', 'No Ceilings 3 [B-Side] ', 'Funeral', 'No Ceilings', 'Tha Carter V (2014 Version)', 'Tha Carter II', 'The Drought is Over 6: The Reincarnation', 'I Am Not a Human Being', 'Tha Carter Screwed And Chopped', 'Dedication 2', 'Tha Carter V', 'The Drought is Over 2: The Carter 3 Sessions', 'Tha Block Is Hot', 'Da Drought ', 'My Face Can’t Be Felt (Unofficial)', 'The Prefix', 'Tha Carter V (Deluxe)', 'FWA', 'Sorry 4 the Wait', 'It’s Weezy Baby']
_LilYachty = ['LIL BOAT’S BIRTHDAY MIX', 'BIRTHDAY MIX 6', 'Lil Boat 2', 'Birthday Mix 3', 'Teenage Emotions', 'BIRTHDAY MIX 4', 'Summer Songs 2', 'Birthday Mix 5', 'Lil Boat 3', 'Michigan Boy Boat', 'Birthday Mix 2.0', 'Let’s Start Here.', 'Nuthin’ 2 Prove', 'Lil Boat ']
_LilKim = ['Ms. G.O.A.T.', 'Black Friday', 'The Naked Truth', 'The Notorious KIM', '9', 'The Notorious K.I.M. (The Lost Tapes)', 'Hard Core', 'Lil Kim Season', 'La Bella Mafia', 'Hard Core 2K14']
_Logic = ['YSIV', 'Everybody', 'Inglorious Basterd', 'Young Sinatra', 'College Park', 'Bobby Tarantino', 'CyberPunk Prince', 'Bobby Tarantino II', 'Young Sinatra: Undeniable', 'Young Sinatra: Welcome to Forever', 'No Pressure', 'Psychological: The Mixtape', 'Vinyl Days', 'Under Pressure (Deluxe Version)', 'Under Pressure (Commentary Version)', 'Young, Broke, and Infamous', 'PEANUTS.', 'The Incredible True Story', 'Supermarket (Soundtrack)', 'Confessions of a Dangerous Mind', 'Bobby Tarantino III', 'Ultra 85', 'Louis Vuitton Briefcase']
_LupeFiasco = ['Fahrenheit 1/15 Part III: A Rhyming Ape', 'Lupe the Jedi', 'Lupe Fiasco: Before There Were Lasers', 'Tetsuo & Youth', 'Extra Lasers', 'DROGAS WAVE', 'Lasers', 'Lupe Fiasco’s Food & Liquor (Advance/Pre-Release Leak)', 'Enemy of the State: A Love Story', 'Food & Liquor II: The Great American Rap Album, Pt. 1', 'DRILL MUSIC IN ZION', 'Lupe Fiasco’s The Cool', 'Fahrenheit 1/15: The Truth is Among Us', 'DROGAS Light', 'Fahrenheit 1/15 Part II: Revenge of the Nerds', 'Lupe Fiasco’s Food & Liquor', 'Friend of the People: I Fight Evil', 'Pharaoh Height 2/30  ', 'Teriyaki Joe: Neo-Harlem Detective']
_MacMiller = ['GO:OD AM ', 'Macadelic', 'Swimming', 'Faces', 'The High Life', 'Best Day Ever', 'K.I.D.S.', 'I Love Life, Thank You', 'The Jukebox: Prelude to Class Clown', 'Watching Movies with the Sound Off', 'But My Mackin Ain’t Easy', 'Live from Space', 'Live from London', 'Blue Slide Park', 'Circles', 'Sour Hour', 'Balloonerism*', 'Black Friday', 'The Divine Feminine', 'But My Mackin’ Ain’t Easy (Original Version)']
_MastaAce = ['The Falling Season', 'Take a Look Around', 'Disposable Arts', 'Richmond Hill', 'Arts and Entertainment', 'A Long Hot Summer', 'Hits U Missed', 'MA_DOOM: Son of Yvonne', 'Son of Yvonne (Remix Intrumentals)']
_MeganTheeStallion = ['Suga', 'Suga (Chopnotslop Remix)', 'Something For Thee Hotties', 'Traumazine', 'Fever', 'Tina Snow – EP', 'Rich Ratchet', 'Good News', 'Make It Hot', 'Make It Hot (ChopNotSlop Remix) - EP']
_MFDOOM = ['BORN LIKE THIS', 'Metal Fingers Presents: Special Herbs, Vol. 3', 'Mm..LeftOvers', 'Metal Fingers Presents: Special Herbs, Vol. 1 & 2', 'Special Blends Volume 1 & 2', 'Metal Fingers Presents: Special Herbs, Vol. 1', 'Operation: Doomsday', 'MM..FOOD', 'Metal Fingers Presents: Special Herbs, Vol. 9 & 0', 'Metalfingers Presents: Special Herbs, Vols. 4, 5 & 6', 'Metal Fingers Presents: Special Herbs, Vol. 3 & 4', 'Metal Fingers Presents: Special Herbs, Vol. 7 & 8', 'Metal Fingers Presents: Special Herbs, The Box Set Vol. 0-9 (Disc 3)', 'Operation: Doomsday (Complete)', 'Gazzillion Ear']
_Migos = ['No Label', 'No Label II', 'Young Rich Niggas', 'Still on Lock', 'Rich Nigga Timeline', 'Culture', 'Culture III', 'Back to the Bando', 'Culture II ', 'YRN 2 (Young Rich Niggas 2)', 'Streets on Lock', 'Yung Rich Nation', 'Juug Season', 'Streets On Lock 4', 'Streets on Lock 3']
_MissyElliott = ['The Cookbook', 'Miss E ...So Addictive', 'Misdemeanor', 'Under Construction', 'Da Real World', 'This Is Not a Test!', 'Supa Dupa Fly ']
_MobbDeep = ['The Mobb Files', 'Murda Muzik', 'The Infamous Mobb Deep', 'The Infamous Archives', 'Amerikaz Nightmare', 'Hell On Earth', 'Blood Money', 'White Cocaine 2', 'Juvenile Hell', 'Black Cocaine', 'Infamy', 'Free Agents: The Murda Mixtape', 'The Safe Is Cracked', 'The Infamous', 'The Infamous (Demo Tape)']
_Nas = ['King’s Disease', 'Magic 3', 'Stillmatic', 'King’s Disease III', 'Magic', 'Magic (Instrumentals)', 'King’s Disease II', 'Magic 2', 'Life Is Good', 'NASIR', 'The Lost Tapes 2', 'It Was Written', 'I Am... The Autobiography', 'I Am...', 'Untitled', 'Street’s Disciple', 'The Nigger Tape', 'Nastradamus', 'Hip Hop Is Dead', 'The Lost Tapes', 'God’s Son', 'It Was... Remixed (Rare Unreleased & Remixed)', 'Death Of Escobar', 'Illmatic', 'Illmatic XX', 'Illmatic: Live from the Kennedy Center with the National Symphony Orchestra', 'Illmatic 10th Anniversary Platinum Edition']
_NipseyHussle = ['The Marathon Continues', 'The Leaks, Vol. 1', 'Crenshaw', 'Mailbox Money', 'The Marathon', 'Bullets Ain’t Got No Name, Vol. 1', 'Slauson Boy 2', 'Bullets Ain’t Got No Name, Vol. II', 'Bullets Ain’t Got No Name, Vol. 3', 'The Marathon Continues: X-Tra Laps', 'Victory Lap', 'Slauson Boy, Vol. 1', 'NH2*']
_OutKast = ['Stankonia', 'ATLiens', 'Idlewild', 'Southernplayalisticadillacmuzik', 'Speakerboxxx / The Love Below', 'Aquemini', 'ATLiens / Wheelz of Steel', 'ATLiens (25th Anniversary Deluxe Edition)', 'Elevators (Me & You)', 'Jazzy Belle', 'Player’s Ball']

In [65]:
_PlayboiCarti = ['Young Mi\u200b$\u200bfit', 'PC3*', 'Whole Lotta Red', 'Die Lit', 'Playboi Carti']
_PopSmoke = ['PS3*', 'Faith', 'Shoot for the Stars Aim for the Moon', 'Meet The Woo 2', 'Shoot for the Stars Aim for the Moon (Deluxe)', 'Faith (Deluxe)', 'Meet the Woo']
_PublicEnemy = ['Apocalypse 91... The Enemy Strikes Black', 'The Evil Empire of Everything', 'There’s a Poison Goin’ On', 'Revolverlution', 'Fear of a Black Planet', 'Muse Sick-N-Hour Mess Age', 'Greatest Misses', 'How You Sell Soul to a Soulless People Who Sold Their Soul???', 'New Whirl Odor', 'Nothing Is Quick In The Desert', 'It Takes a Nation of Millions to Hold Us Back', 'Bring The Noise Remix', 'Rebirth of a Nation', 'Most of My Heroes Still Don’t Appear on No Stamp', 'Man Plans God Laughs', 'What You Gonna Do When the Grid Goes Down?', 'He Got Game', 'Yo! Bum Rush the Show']
_PushaT = ['My Name Is My Name', 'Fear of God', 'Fear of God II: Let Us Pray', 'Wrath of Caine', 'It’s Almost Dry', 'DAYTONA', 'King Push – Darkest Before Dawn: The Prelude', 'Spotify Sessions']
_QueenLatifah = ['Black Reign', 'All Hail the Queen', 'The Dana Owens Album', 'Order In The Court', 'Trav’lin’ Light', 'Nature of a Sista']
_RunDMC = ['Run–D.M.C.', 'Down With the King', 'Crown Royal', 'Back from Hell', 'Tougher Than Leather', 'King of Rock', 'Raising Hell']
_Scarface = ['The Last of a Dying Breed', 'My Homies', 'Deeply Rooted', 'Mr. Scarface is Back', 'Made', 'Balls And My Word', 'Deeply Rooted: The Lost Files ', 'Emeritus', 'The World is Yours', 'The Untouchable', 'The Fix', 'The Diary', 'My Homies Part 2']
_SlickRick = ['The Art of Storytelling', 'Behind Bars', 'The Great Adventures of Slick Rick (Deluxe Edition)', 'The Ruler’s Back', 'The Great Adventures of Slick Rick']
_SnoopDogg = ['Tha Blue Carpet Treatment', 'Malice ’N Wonderland', 'Stoner’s EP', 'Tha Doggfather', 'Da Game Is to Be Sold, Not to Be Told', 'No Limit Top Dogg', '220 - EP', 'Make America Crip Again - EP', 'Neva Left', 'Coolaid', 'That’s My Work 3', 'West Coast Ridah', 'Doggystyle', 'That’s My Work 2', 'Bible of Love', 'Paid Tha Cost To Be Da Bo$$', 'welcome 2 tha chuuch...vol. 3', 'BUSH', 'Ego Trippin’', 'Tha Last Meal', 'Tha Blue Carpet Treatment Mixtape', 'R&G (Rhythm & Gangsta): The Masterpiece', 'Metaverse: The NFT Drop, Vol. 2', 'I Wanna Thank Me', 'Doggumentary', 'BODR', 'From tha Streets 2 tha Suites', 'Smokefest Underground', 'Dead Man Walkin', 'Death Row’s Snoop Doggy Dogg: Greatest Hits', 'Death Row: The Lost Sessions Vol. 1', 'More Malice', 'Snoop Dogg Presents Algorithm', 'Sensual Seduction']
_TalibKweli = ['We Run This, Vol. 7', 'The Beautiful Mix CD', 'Gutter Rainbows', 'The Beautiful Struggle', 'Radio Silence', 'Javotti Media Presents: The Cathedral', 'Gravitas', 'Gravitas (DJ Mix)', 'Fuck the Money', 'Prisoner of Conscious', 'Eardrum', 'Right About Now: The Official Sucka Free Mix CD', 'The Beautiful Mixtape Vol. 2: The Struggle Continues', 'Quality']
_TheNotoriousBIG = ['Duets: The Final Chapter', 'Life After Death', 'Born Again', 'Ready To Die', 'Life After Death (25th Anniversary Super Deluxe Edition)', 'The Notorious B.I.G. Sampler (Teaser)']
_ThePharcyde = ['Bizarre Ride II the Pharcyde', 'Bizarre Ride II the Pharcyde (25th Anniversary Edition)', 'Labcabincalifornia (Deluxe Edition)', 'Plain Rap']
_TravisScott = ['Owl Pharaoh', 'Rodeo (Expanded Edition)', 'ASTROWORLD', 'Days Before Rodeo', 'Birds in the Trap Sing McKnight', 'UTOPIA', 'TS5*']
_VinceStaples = ['Summertime ’06', 'Shyne Coldchain Vol. 1', 'Shyne Coldchain II', 'FM!', 'Hell Can Wait', 'Big Fish Theory', 'Vince Staples', 'RAMONA PARK BROKE MY HEART', 'Generic', 'Prima Donna', 'Winter in Prague']
_WuTangClan = ['8 Diagrams', 'A Better Tomorrow', 'Wu-Tang Forever', 'Iron Flag', 'The Saga Continues', 'Enter the Wu-Tang (36 Chambers)', 'Wu-Tang Demo Tape', 'The W', 'Of Mics and Men (Music from the Showtime Documentary Series)', 'Protect Ya Neck – Single']
_YasiinBey = ['The Ecstatic', 'The New Danger', 'Negus', 'Black on Both Sides', 'True Magic']
_YoungThug = ['1017 Thug 2', '1017 Thug', 'YT&G1*', 'Rich Gang: Tha Tour, Pt. 1', '1017 Thug 3: The Finale', 'I Came from Nothing 2', 'BUSINESS IS BUSINESS', 'I Came from Nothing', 'I Came from Nothing 3', 'Slime Season', 'Tha Tour Pt. 2', 'Barter 6', 'So Much Fun', 'Slime Season 2', 'I’m Up', 'Punk', 'Purple Album', 'On the Rvn', 'BEAUTIFUL THUGGER GIRLS', 'Slime Season 3', 'JEFFERY']

In [66]:
list_albums = (_21Savage + _50Cent + _ATribeCalledQuest + _A_APRocky + _AesopRock +
               _Atmosphere + _AzealiaBanks + _BabyKeem + _BigDaddyKane + _BigL +
               _BigPun + _BigSean + _CardiB + _ChanceTheRapper + _ChiefKeef +
               _ChildishGambino + _Common + _Cordae + _CypressHill + _DannyBrown +
               _DeLaSoul + _DMX + _DojaCat + _DrDre + _Drake + _EarlSweatshirt +
               _Eminem + _EricBRakim + _Future + _GhostfaceKillah + _GucciMane +
               _Hopsin + _IceCube + _IceSpice + _JCole + _JackHarlow + _JAYZ +
               _Jeezy + _JoeyBadass + _JoynerLucas + _JuiceWRLD + _KanyeWest +
               _KendrickLamar + _KidCudi + _KMD + _LaurynHill + _LilNasX +
               _LilUziVert + _LilWayne + _LilYachty + _LilKim + _Logic +
               _LupeFiasco + _MacMiller + _MastaAce + _MeganTheeStallion +
               _MFDOOM + _Migos + _MissyElliott + _MobbDeep + _Nas + _NipseyHussle +
               _OutKast + _PlayboiCarti + _PopSmoke + _PublicEnemy + _PushaT +
               _QueenLatifah + _RunDMC + _Scarface + _SlickRick + _SnoopDogg +
               _TalibKweli + _TheNotoriousBIG + _ThePharcyde + _TravisScott +
               _VinceStaples + _WuTangClan + _YasiinBey + _YoungThug)

### Final list of prevailing albums:

In [67]:
songs_metadata_df = songs_metadata_df[songs_metadata_df['album'].isin(list_albums)]
songs_metadata_df

Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id,artist
116,Ghost,Resurrection,2003,[],"[DJ Pooh, Eminem]","[2Pac, Eminem, Luis Resto, DJ Pooh]",5851,49,2Pac
186,Intro (Tupac: Resurrection),Resurrection,2003,[],[2Pac],[2Pac],4064280,49,2Pac
261,One Day At A Time [Em’s Version],Resurrection,2003,"[Eminem, Outlawz]","[Hen-Gee, Eminem]","[2Pac, Eminem, Luis Resto, Kastro, Hen-Gee, Yo...",869,49,2Pac
296,Runnin’ (Dying to Live),Resurrection,2003,[The Notorious B.I.G.],"[Eminem, Easy Mo Bee]","[2Pac, The Notorious B.I.G., Easy Mo Bee, Luis...",367,49,2Pac
355,The Realist Killaz,Resurrection,2003,[50 Cent],[Red Spyda],"[2Pac, 50 Cent]",3105,49,2Pac
...,...,...,...,...,...,...,...,...,...
15926,Yea Yea Yea,Punk,2021,[],"[Hitmaka Millz, Charlie Handsome, Wheezy, Rex ...","[Rex Kudo, Wheezy, Charlie Handsome, Hitmaka M...",7290665,764670,Young Thug
15927,You Said,BEAUTIFUL THUGGER GIRLS,2017,[Quavo],[Wheezy],"[Zhonzell Watson, Quavo, Wheezy, Young Thug]",3121328,336817,Young Thug
15928,You the Best,Purple Album,2014,[Skooly],[Tino Burna],"[Skooly, Young Thug]",455618,105179,Young Thug
15929,You the World,1017 Thug 3: The Finale,2014,[],[C4Bombs],"[C4Bombs, Young Thug]",507566,111193,Young Thug


Exporting the DF

In [115]:
# csv_file_path = "data/songs_metadata.csv"
# songs_metadata_df.to_csv(csv_file_path, index=False)

 ## Fetching lyrics and store them in a txt in the data folder

In [None]:
#ok so basically this function will clean the names as I was having troubles loading/saving files that
#do not start with a letter or "_", then it will iterate through all the unique ID rows in the songs_metadata_df
#to fetch the lyrics for each artist by giving a sleep time in between each fetch to avoid overrequesting.
#between each 60 fetches it will take a deep breath of 5 seconds cause my laptop was burning and again, overrequesting.
#I also tried, half-successfully to work around timeout issues by waiting 15 minutes whenever I would get a 4XX error.
#for each song, the fetch will perform 3 attempts until giving up and jumping on to the next row
#at each instance the function will store the index of the dataframe it left off at in a "chekpoint.txt" file in the main folder
#so that if I get an error and rerun the code, it will start from that index

def sanitize_title(title):
    # Replace special characters with "_"
    special_characters = '*;:!%’][=)(?/\}{"@£§$€'
    for char in special_characters:
        title = title.replace(char, '_')
    return title

api_key = genius_client_token
genius_api = lg.Genius(api_key, skip_non_songs=True, excluded_terms=["(Remix)", "(Live)"], remove_section_headers=True)

# Number of retries before giving up on a row
max_retries = 3

# Sleep time after processing a certain number of rows
rows_between_sleep = 60

# Time to wait (in seconds) after encountering a 4XX error
api_limit_wait_time = 900  # 15 minutes

# Load the last successfully processed index from a file (if exists)
checkpoint_file_path = 'checkpoint.txt'
if os.path.exists(checkpoint_file_path):
    with open(checkpoint_file_path, 'r') as checkpoint_file:
        last_processed_index = int(checkpoint_file.read())
else:
    last_processed_index = 0

# Iterate over rows starting from the last successfully processed index
for index, row in songs_metadata_df.iloc[last_processed_index:].iterrows():
    artist_name = row['artist']
    song_title = row['title']
    genius_track_id = row['genius_track_id']

    # Instantiate the retries
    retries = 0

    while retries < max_retries:
        try:
            search_result = genius_api.search_song(song_title, artist_name)
        except Timeout as e:
            # Handle timeout exception
            print(f"\n❌---------------Request timed out for {song_title} - {genius_track_id}, retrying ({retries}/{max_retries})---------------❌\n")
            retries += 1
            time.sleep(1)  # Sleep for 1 second between retries to avoid overrequesting

            # If max_retries reached, sleep for api_limit_wait_time before continuing
            if retries == max_retries:
                print(f"\n❌---------------Reached max retries for {song_title} - {genius_track_id}, waiting {api_limit_wait_time} seconds before continuing---------------❌\n")
                time.sleep(api_limit_wait_time)

        except lg.GeniusException as e:
            # Handle other Genius exceptions [this is the part that was half-successful, once again: READ THE DOCUMENTATION]
            print(f"\n❌---------------An error occurred for {song_title} - {genius_track_id}: {e}, retrying ({retries}/{max_retries})---------------❌\n")
            retries += 1
            time.sleep(1)  # Sleep for 1 second between retries

        #to checl if there are search results
        if search_result:
            # Get the lyrics
            lyrics = search_result.lyrics

            # title cleaning once again, example A$AP ROCKY = A_AP ROCKY
            sanitized_title = sanitize_title(song_title)

            #here I'm basically saying what the name of the .txt should be "_songID_title_artist"
            file_path = os.path.join('data', f"__{genius_track_id}_{sanitized_title}_{artist_name}.txt")

            #WRITING the lyrics to the .txt file
            try:
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(lyrics)
                print(f"\n✔️---------------{song_title} - {genius_track_id} printed successfully---------------✔️")
                print(f"✔️---------------File created: {file_path}---------------✔️\n")
            except Exception as e:
                print(f"\n❌---------------Error writing file for {song_title} - {genius_track_id}: {e}---------------❌\n")

            # Break out of the retry loop if successful
            break
        else:
            retries += 1
            time.sleep(1)  # Sleep for 1 second between retries (OVERREACTING!!)

    time.sleep(1)  # Sleep for 1 second between rows (OVERREACTING!!)

    # Check if we reached the specified number of rows between longer sleeps
    if (index + 1) % rows_between_sleep == 0:
        print(f"\n---------------Sleeping for 5 seconds after processing {index + 1} rows---------------\n")
        time.sleep(5)  # Sleep for 5 seconds after processing a certain number of rows

    # Save the current index as the last successfully processed index - this for me to 
    # be able to restart from where I left in terms of row index instead of starting from scratch whenever I have a bug/error
    with open(checkpoint_file_path, 'w') as checkpoint_file:
        checkpoint_file.write(str(index))


Searching for "DAS CAP" by Lil Yachty...
Done.

✔️---------------DAS CAP - 3565033 printed successfully---------------✔️
✔️---------------File created: data\__3565033_DAS CAP_Lil Yachty.txt---------------✔️


---------------Sleeping for 5 seconds after processing 9360 rows---------------

Searching for "Demon Time" by Lil Yachty...
Done.

✔️---------------Demon Time - 5614122 printed successfully---------------✔️
✔️---------------File created: data\__5614122_Demon Time_Lil Yachty.txt---------------✔️

Searching for "Dinner’s Ready" by Lil Yachty...
Done.

✔️---------------Dinner’s Ready - 3920507 printed successfully---------------✔️
✔️---------------File created: data\__3920507_Dinner_s Ready_Lil Yachty.txt---------------✔️

Searching for "DipSet" by Lil Yachty...
Done.

✔️---------------DipSet - 2823617 printed successfully---------------✔️
✔️---------------File created: data\__2823617_DipSet_Lil Yachty.txt---------------✔️

Searching for "Dirty Mouth" by Lil Yachty...
Done.

✔️-----