# Clean Lyrics Pipeline

This notebook provides a reusable, well-documented pipeline for:
1. Loading and cleaning lyrics JSON exports.
2. Batch-processing all lyric files into a unified Excel file.
3. (New) Merging cleaned lyrics with an existing Excel dataset.

In [1]:
# 1. Import libraries
import json
import re
from pathlib import Path
import pandas as pd

## 2. Define helper functions

In [2]:
def load_json(file_path: Path) -> dict:
    """
    Read a UTF-8 encoded JSON file and return its decoded content.

    Parameters
    ----------
    file_path : Path
        Location of the JSON file on disk.

    Returns
    -------
    dict
        The parsed JSON document.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        # json.load reads the whole stream and decodes it in one step.
        return json.load(handle)

In [17]:
def extract_lyrics(data: dict) -> str:
    """
    Extract the lyrics portion from the raw JSON data.

    Everything up to and including the first 'Lyrics' header is dropped; if
    the header is not present, the entire text is kept. If the remaining text
    contains a 'Read More' marker (an intro blurb before the lyrics),
    everything up to and including the first marker is dropped as well.

    Parameters
    ----------
    data : dict
        Dictionary containing at least a 'lyrics' key holding a string.

    Returns
    -------
    str
        Raw lyrics string (potentially with line breaks and tags).

    Raises
    ------
    KeyError
        If `data` has no 'lyrics' key.
    """
    text = data["lyrics"]

    # partition() cuts at the FIRST occurrence only, so a later "Lyrics" inside
    # the song text is preserved, and a missing header no longer raises
    # IndexError (the full text is returned instead, as documented).
    _, header, after_header = text.partition("Lyrics")
    lyrics = after_header if header else text

    # If there is an intro before the lyrics, remove it.
    _, marker, after_intro = lyrics.partition("Read More")
    return after_intro if marker else lyrics

In [12]:
def clean_lyrics(text: str) -> str:
    """
    Normalise a raw lyrics string into one lowercase line.

    Steps, in order:
      - Parenthesised content (e.g., '(Yeah)') is replaced by a space.
      - Newline characters are replaced by spaces.
      - Every run of whitespace is collapsed to a single space.
      - Leading/trailing whitespace is stripped and the text is lowercased.

    Parameters
    ----------
    text : str
        Raw lyrics text.

    Returns
    -------
    str
        Cleaned, single-line lyrics string.
    """
    # Swap each "(...)" group for a space so the neighbouring words stay apart.
    without_asides = re.sub(r"\([^)]*\)", " ", text)
    # Flatten line breaks, then squeeze every whitespace run down to one space.
    flattened = re.sub(r"\s+", " ", without_asides.replace("\n", " "))
    return flattened.strip().lower()

## 3. File-level processing

In [5]:
def process_lyric_file(file_path: Path) -> str:
    """
    Process one JSON lyric file into a cleaned lyrics string.

    Runs the three pipeline stages in order: load the JSON, extract the
    lyrics portion, and clean it.

    Parameters
    ----------
    file_path : Path
        Path to the raw JSON file.

    Returns
    -------
    str
        The cleaned, single-line, lowercased lyrics (not a dict).
    """
    data = load_json(file_path)
    raw_lyrics = extract_lyrics(data)
    return clean_lyrics(raw_lyrics)

In [6]:
# Smoke-test the pipeline on one sample file

dir_path = Path("../data/raw/lyrics")

files = sorted(dir_path.glob("*.json"))
if not files:
    raise FileNotFoundError(f"No lyric JSON files found in {dir_path}")

# Index 5 is an arbitrary sample (the sixth file in sorted order, not the
# first). Clamp it so the cell still runs when fewer than six files exist.
# The variable keeps its original name in case later cells refer to it.
first_file = files[min(5, len(files) - 1)]
print("Sample file is:", first_file)

print(process_lyric_file(first_file))

First file is: ..\data\raw\lyrics\2_hands__tate_mcrae.json


## 4. Import song dataset

In [19]:
# Read the Billboard chart export (one row per chart entry) into a DataFrame.
billboard_dataset_path = Path("../data/raw/billboard") / "billboard_dataset_2024_with_id.xlsx"
df_songs = pd.read_excel(billboard_dataset_path)

# Bare last expression: rich display of the loaded frame.
df_songs

Unnamed: 0,date,rank,title,artist,image,peakPos,lastpos,weeks,isNew,song_id
0,2024-12-28,1,All I Want For Christmas Is You,Mariah Carey,https://charts-static.billboard.com/img/1994/1...,1,1,70,False,all_i_want_for_christmas_is_you__mariah_carey
1,2024-12-28,2,Rockin' Around The Christmas Tree,Brenda Lee,https://charts-static.billboard.com/img/1960/1...,1,2,63,False,rockin_around_the_christmas_tree__brenda_lee
2,2024-12-28,3,Last Christmas,Wham!,https://charts-static.billboard.com/img/1998/0...,3,4,44,False,last_christmas__wham
3,2024-12-28,4,Jingle Bell Rock,Bobby Helms,https://charts-static.billboard.com/img/1958/1...,3,3,60,False,jingle_bell_rock__bobby_helms
4,2024-12-28,5,A Holly Jolly Christmas,Burl Ives,https://charts-static.billboard.com/img/1998/0...,4,5,44,False,a_holly_jolly_christmas__burl_ives
...,...,...,...,...,...,...,...,...,...,...
5195,2024-01-06,96,El Amor de Su Vida,Grupo Frontera & Grupo Firme,https://charts-static.billboard.com/img/2023/0...,68,0,16,False,el_amor_de_su_vida__grupo_frontera__grupo_firme
5196,2024-01-06,97,Standing Next To You,Jung Kook,https://charts-static.billboard.com/img/2023/1...,5,79,8,False,standing_next_to_you__jung_kook
5197,2024-01-06,98,Man Made A Bar,Morgan Wallen Featuring Eric Church,https://charts-static.billboard.com/img/2023/0...,15,0,14,False,man_made_a_bar__morgan_wallen_featuring_eric_c...
5198,2024-01-06,99,Que Onda,Calle 24 x Chino Pacas x Fuerza Regida,https://charts-static.billboard.com/img/2023/0...,61,98,13,False,que_onda__calle_24_x_chino_pacas_x_fuerza_regida


## 5. Extract the lyrics for each song and add them as a new `lyrics` column

In [21]:
def _get_cleaned(song_id: str) -> str:
    """
    Return the processed lyrics for the song identified by `song_id`.

    The lyrics are read from ``../data/raw/lyrics/<song_id>.json`` and passed
    through `process_lyric_file`.
    """
    return process_lyric_file(Path(f"../data/raw/lyrics/{song_id}.json"))

In [24]:
# The same song_id can appear on many chart rows, so read and clean each lyric
# file once and map the result back onto every row, rather than re-processing
# the same JSON for every row.
unique_song_ids = df_songs["song_id"].unique()
lyrics_by_id = {song_id: _get_cleaned(song_id) for song_id in unique_song_ids}

# Work on a copy so df_songs stays as loaded.
df = df_songs.copy()
df["lyrics"] = df["song_id"].map(lyrics_by_id)

In [25]:
# Display the frame with the newly added "lyrics" column.
df

Unnamed: 0,date,rank,title,artist,image,peakPos,lastpos,weeks,isNew,song_id,lyrics
0,2024-12-28,1,All I Want For Christmas Is You,Mariah Carey,https://charts-static.billboard.com/img/1994/1...,1,1,70,False,all_i_want_for_christmas_is_you__mariah_carey,i don't want a lot for christmas there is just...
1,2024-12-28,2,Rockin' Around The Christmas Tree,Brenda Lee,https://charts-static.billboard.com/img/1960/1...,1,2,63,False,rockin_around_the_christmas_tree__brenda_lee,rockin' around the christmas tree at the chris...
2,2024-12-28,3,Last Christmas,Wham!,https://charts-static.billboard.com/img/1998/0...,3,4,44,False,last_christmas__wham,"ah, ah-ah ooh-woah oh-oh last christmas, i gav..."
3,2024-12-28,4,Jingle Bell Rock,Bobby Helms,https://charts-static.billboard.com/img/1958/1...,3,3,60,False,jingle_bell_rock__bobby_helms,"jingle bell, jingle bell, jingle bell rock jin..."
4,2024-12-28,5,A Holly Jolly Christmas,Burl Ives,https://charts-static.billboard.com/img/1998/0...,4,5,44,False,a_holly_jolly_christmas__burl_ives,ding-dong-ding ding-dong-ding have a holly jol...
...,...,...,...,...,...,...,...,...,...,...,...
5195,2024-01-06,96,El Amor de Su Vida,Grupo Frontera & Grupo Firme,https://charts-static.billboard.com/img/2023/0...,68,0,16,False,el_amor_de_su_vida__grupo_frontera__grupo_firme,si estoy tomando es porque estoy echando alcoh...
5196,2024-01-06,97,Standing Next To You,Jung Kook,https://charts-static.billboard.com/img/2023/1...,5,79,8,False,standing_next_to_you__jung_kook,standing next to you play me slow push up on t...
5197,2024-01-06,98,Man Made A Bar,Morgan Wallen Featuring Eric Church,https://charts-static.billboard.com/img/2023/0...,15,0,14,False,man_made_a_bar__morgan_wallen_featuring_eric_c...,"i sat down on a barstool, like a dern fool 'ca..."
5198,2024-01-06,99,Que Onda,Calle 24 x Chino Pacas x Fuerza Regida,https://charts-static.billboard.com/img/2023/0...,61,98,13,False,que_onda__calle_24_x_chino_pacas_x_fuerza_regida,"baby, me vuelves loco no se esperó al hotel y ..."


In [27]:
out_path = "../data/processed/dataset.xlsx"

# Make sure the output folder exists before writing, so a fresh checkout
# without ../data/processed does not fail at the save step.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)

df.to_excel(out_path, index=False, engine="openpyxl")

print(f"Saved upgraded Excel to {out_path}")

Saved upgraded Excel to ../data/processed/dataset.xlsx


In [41]:
# List the title/artist of every row whose lyrics string is empty.
empty_lyrics_mask = df["lyrics"] == ""
df.loc[empty_lyrics_mask, ["title", "artist"]]

Unnamed: 0,title,artist
33,Sticky,"Tyler, The Creator Featuring GloRilla, Sexyy R..."
59,Hey Now,Kendrick Lamar Featuring Dody6
135,Sticky,"Tyler, The Creator Featuring GloRilla, Sexyy R..."
151,Hey Now,Kendrick Lamar Featuring Dody6
195,GNX,"Kendrick Lamar Featuring Hitta J3, YoungThreat..."
231,Hey Now,Kendrick Lamar Featuring Dody6
232,Sticky,"Tyler, The Creator Featuring GloRilla, Sexyy R..."
266,GNX,"Kendrick Lamar Featuring Hitta J3, YoungThreat..."
304,Hey Now,Kendrick Lamar Featuring Dody6
323,GNX,"Kendrick Lamar Featuring Hitta J3, YoungThreat..."
