In [1]:
import pandas as pd
import numpy as np
import gc
import os
from transformers import pipeline

# Hugging Face checkpoint used for language identification.
model_ckpt = "papluca/xlm-roberta-base-language-detection"
# Text-classification pipeline used by the language detection helper below;
# device=-1 keeps inference on the CPU.
pipe = pipeline(task="text-classification", model=model_ckpt, device=-1)

In [3]:
# this code identifies the language of the lyrics
def identify_language(lyrics: str) -> str|np.nan:
    res = pipe([lyrics], truncation=True, max_length=128)
    return res[0]['label'] if res[0]['score'] > 0.5 else np.nan

In [None]:
# Clean the raw lyrics dump chunk by chunk and append the English songs to the
# processed CSV.
# NOTE: this can take more than 12 hours to run. The output file is opened in
# append mode, so remove any existing output before re-running this cell,
# otherwise the same rows are written again.
processed_path = "../data/processed/lyrics_processed.csv"
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

with pd.read_csv(
    "../data/raw/song_lyrics.csv",
    chunksize=5 * 10**4,
    usecols=["title", "artist", "year", "tag", "views", "lyrics"],
    dtype={"year": np.int16, "views": np.int32}
) as chunks:

    for idx, chunk in enumerate(chunks):
        print(f"Processing chunk {idx}")

        # drop N.A. lyrics
        chunk = chunk.dropna(subset=["lyrics"])

        # drop romanizations
        chunk = chunk[chunk["artist"] != "Genius Romanizations"]
        chunk = chunk[~chunk["title"].str.contains(r"\(?romanized\)?", regex=True, na=False, case=False)]

        # remove invalid years
        chunk = chunk[chunk["year"] < 2023]

        # remove duplicated entries (duplicates are only detected within a chunk);
        # .copy() gives an independent frame so the column assignments below do
        # not trigger SettingWithCopyWarning
        chunk = chunk.drop_duplicates(subset=["title", "artist", "year"]).copy()

        # remove bracketed marker lines such as "[Chorus]" from lyrics
        pattern = r"(?m)^\[.*?\]$"
        chunk["lyrics"] = chunk["lyrics"].str.replace(pattern, "", regex=True)

        # collapse line breaks (including the blank lines left by the step
        # above) into a single space; the previous pattern r"\n|\n\n" could
        # never match its second alternative and produced runs of spaces
        pattern = r"\n+"
        chunk["lyrics"] = chunk["lyrics"].str.replace(pattern, " ", regex=True).str.strip()

        # drop lyrics that are too short or too long
        chunk = chunk[chunk["lyrics"].str.len().between(10**2, 10**5)].copy()

        # analyze language
        chunk["language"] = chunk["lyrics"].apply(identify_language)
        # count rows on the column created above ("language_cld3" is never loaded)
        print(f'{int(chunk["language"].isna().sum())} not identified lyrics using by the language detection model.')

        # drop non-english lyrics; the result must be bound to `chunk` (it was
        # previously assigned to a misspelled name and discarded). All columns
        # are kept because later cells still use "year", "views" and "title".
        chunk = chunk[chunk["language"] == "en"]

        # save processed data, writing the header only when the file is created
        chunk.to_csv(processed_path, mode="a", header=not os.path.exists(processed_path), index=False)

        del chunk
        gc.collect()

In [2]:
# Read the semi-processed data.
# The file is written with index=False, so it contains no index column to
# restore. Passing index_col=0 would turn the first data column into the index
# and hide it from the column-wise drop/rename steps that follow.
semi_processed = pd.read_csv("../data/processed/lyrics_processed.csv")

In [244]:
# Remove the "views" column, then rename "tag" -> "genre" and "title" -> "song"
column_names = {"tag": "genre", "title": "song"}
semi_processed_2 = (
    semi_processed
    .drop(columns=["views"])
    .rename(columns=column_names)
)

In [40]:
# Running state filled in by artister(): last-song years, first-song years,
# and the artists accepted so far (three independent, initially empty sets)
maxes, mins, best_100_artists = set(), set(), set()

In [68]:
# Helper used while hand-picking artists: it reports an artist's first and last
# song year and song count, and records the artist in the running shortlist.
def artister(artist):
    """Add ``artist`` to the shortlist and print a summary of their songs.

    Prints "No Data" and changes nothing when the artist is already in
    ``best_100_artists`` or does not occur in ``semi_processed_2["artist"]``.
    Otherwise the artist's earliest and latest song years are added to
    ``mins`` / ``maxes``, the artist is added to ``best_100_artists``, and the
    artist's figures plus the running totals are printed.
    """
    if artist in best_100_artists or artist not in semi_processed_2["artist"].unique():
        print("No Data")
        return

    songs = semi_processed_2[semi_processed_2["artist"] == artist]
    first_year = songs.year.min()
    last_year = songs.year.max()

    # Update the shared state before reporting so the totals include this artist
    maxes.add(last_year)
    mins.add(first_year)
    best_100_artists.add(artist)

    report = [
        f"Artist: {artist}",
        f"First Song Released In: {first_year}",
        f"Last Song Released In: {last_year}",
        f"Number of Songs Released: {len(songs)}",
        f"So far: {best_100_artists}",
        f"So far this many: {len(best_100_artists)}",
        f"First Song Years: {mins}",
        f"First First Song Year: {min(mins)}",
        f"Last Song Years: {maxes}",
        f"Last Last Song Year: {max(maxes)}",
    ]
    for line in report:
        print(line)

In [None]:
# Final hand-picked list of 100 artists, gathered from my own personal tastes, some websites and some artists I knew were popular.
# This assignment rebinds best_100_artists to a fixed literal, replacing whatever was accumulated through artister().
# NOTE(review): 'Tyler' may be a truncated form of a longer artist name (e.g. one containing a comma) — verify against the "artist" column.
best_100_artists = {'Led Zeppelin', 'LSD', 'Jon Hopkins', 'Dolly Parton', 'Billy Joel', 'Backstreet Boys', 'Muze Sikk', 'Kanye West', 'SALES', 'Ray Charles', 'Gucci Mane', 'Rihanna', 'Aerosmith', 'Juice WRLD', 'Soulja Boy', 'Snoop Dogg', 'Bon Jovi', 'twenty one pilots', 'Alabama', 'Kid Cudi', 'The Drums', 'Eminem', 'alt-J', 'AC/DC', 'Of Monsters and Men', 'Madonna', 'James Brown', 'Britney Spears', 'Johnny Cash', 'Guided by Voices', 'Bruce Springsteen', 'The White Stripes', 'The Rolling Stones', 'Tyler', 'Arctic Monkeys', 'Shania Twain', '50 Cent', 'Eagles', 'The Game', 'Maroon 5', 'Caravan Palace', 'Emily Dickinson', 'LMFAO', 'Rod Stewart', 'Imagine Dragons', 'june', 'Sia', 'Daft Punk', 'Kendrick Lamar', 'Matthew Mole', 'Shakira', 'Jack Stauber', 'U2', 'Katy Perry', 'Pink Floyd', 'Glee Cast', 'Taylor Swift', 'Radiohead', 'Frank Zappa', 'Whitney Houston', 'Bob Dylan', 'Abraham Lincoln', 'The Beatles', 'Stevie Wonder', 'Lil B', 'Marshmello', 'Queen', 'Mariah Carey', 'Metallica', 'JP Saxe', 'Elton John', 'Noah Kahan', 'Macklemore', 'Harry Styles', 'Prince', 'Frank Sinatra', 'Ed Sheeran', 'J. Cole', 'Burial', 'Michael Jackson', '2Pac', 'Ella Fitzgerald', 'The Weeknd', 'Joji', 'The Grateful Dead', 'Bruno Mars', 'Shawn Mendes', 'Miley Cyrus', 'Lil Wayne', 'Adele', 'Nirvana', 'Clean Bandit', 'Avril Lavigne', "Guns N' Roses", 'Drake', 'Coldplay', 'Black Eyed Peas', 'Van Morrison', 'AURORA', 'Elvis Presley'}

In [245]:
# Keep only the songs whose artist is in the hand-picked shortlist
artist_is_selected = semi_processed_2["artist"].isin(best_100_artists)
semi_processed_3 = semi_processed_2[artist_is_selected]

In [253]:
# Keep only songs released after 1950 and store the year as a 16-bit integer
released_after_1950 = semi_processed_3["year"] > 1950
semi_processed_4 = semi_processed_3[released_after_1950].astype({"year": np.int16})

In [260]:
# Exclude every song whose genre is "misc"
is_misc = semi_processed_4["genre"] == "misc"
semi_processed_5 = semi_processed_4[~is_misc]

In [266]:
# Row count after each filtering stage, one per line
for stage_frame in (semi_processed_2, semi_processed_3, semi_processed_4, semi_processed_5):
    print(len(stage_frame))

3373529
46575
45993
42803


In [269]:
# Number of songs per genre in the final dataset
semi_processed_5["genre"].value_counts()

pop        13427
rock       12694
rap        11724
country     2713
rb          2245
Name: genre, dtype: int64

In [None]:
# In summary, the final dataset has 100 artists, 5 genres (pop, rock, rap, country, rb), 72 years (1951 to 2022) and 42,803 songs

In [272]:
# Save the final dataset without the index.
# NOTE: this is the same path that the chunk-processing cell appends to and that
# the "semi-processed" read cell loads, so this write replaces the intermediate file.
final_output_path = "../data/processed/lyrics_processed.csv"
semi_processed_5.to_csv(final_output_path, index=False)