In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import gc
from transformers import pipeline

# Apply seaborn's default theme for any plots produced later in the notebook.
sns.set_theme()

# Hugging Face checkpoint used to classify the language of each lyric.
model_ckpt = "papluca/xlm-roberta-base-language-detection"

# Shared text-classification pipeline, reused by identify_language().
# NOTE(review): device=-1 is understood to select the CPU in transformers —
# confirm against the installed version if GPU inference is wanted.
pipe = pipeline(
    task="text-classification",
    model=model_ckpt,
    device=-1,
)

In [3]:
def identify_language(lyrics: str) -> str|np.nan:
    res = pipe([lyrics], truncation=True, max_length=128)
    return res[0]['label'] if res[0]['score'] > 0.5 else np.nan

In [None]:
# Stream the raw lyrics CSV in chunks, clean each chunk, keep only English
# lyrics, and write the result incrementally to the processed CSV.
output_path = "../data/processed/lyrics_processed.csv"

with pd.read_csv(
    "../data/raw/song_lyrics.csv",
    chunksize=5 * 10**4,
    usecols=["title", "artist", "year", "tag", "views", "lyrics"],
    dtype={"year": np.int16, "views": np.int32},
) as chunks:

    for idx, chunk in enumerate(chunks):
        print(f"Processing chunk {idx}")

        # drop N.A. lyrics
        chunk = chunk.dropna(subset=["lyrics"])

        # drop romanizations (dedicated artist page or "romanized" in the title)
        is_romanized_title = chunk["title"].str.contains(
            r"\(?romanized\)?", regex=True, na=False, case=False
        )
        chunk = chunk[(chunk["artist"] != "Genius Romanizations") & ~is_romanized_title]

        # remove invalid years
        chunk = chunk[chunk["year"] < 2023]

        # remove duplicated entries; this only deduplicates within the current
        # chunk, duplicates that span two chunks are not detected.
        # .copy() so the column assignments below operate on an independent
        # frame instead of a slice (avoids SettingWithCopyWarning).
        chunk = chunk.drop_duplicates(subset=["title", "artist", "year"]).copy()

        # Strip bracketed lines such as "[Chorus]", then collapse every run of
        # newlines into a single space. The previous pattern r"\n|\n\n" could
        # never match its second alternative, so blank lines became repeated
        # spaces; r"\n+" handles both cases, and strip() removes the leading or
        # trailing space left where a header line used to be.
        chunk["lyrics"] = (
            chunk["lyrics"]
            .str.replace(r"(?m)^\[.*?\]$", "", regex=True)
            .str.replace(r"\n+", " ", regex=True)
            .str.strip()
        )

        # drop lyrics that are too short or too long
        chunk = chunk[chunk["lyrics"].str.len().between(10**2, 10**5)].copy()

        # analyze language
        chunk["language"] = chunk["lyrics"].apply(identify_language)
        # The column created above is "language" (not "language_cld3").
        n_unidentified = int(chunk["language"].isna().sum())
        print(f'{n_unidentified} not identified lyrics using by the language detection model.')

        # drop non-english lyrics and keep only the columns needed downstream.
        # The result must be bound back to `chunk`, otherwise the unfiltered
        # frame is what gets written below.
        chunk = chunk.loc[chunk["language"] == "en", ["artist", "tag", "lyrics"]]

        # save processed data: the first chunk overwrites any file left by a
        # previous run and writes the header; later chunks append without it.
        # This keeps a full re-run from duplicating rows in the output.
        is_first_chunk = idx == 0
        chunk.to_csv(
            output_path,
            mode="w" if is_first_chunk else "a",
            header=is_first_chunk,
            index=False,
        )

        # release the chunk before reading the next one
        del chunk
        gc.collect()