In [1]:
# ──────────────────────────────────────────────────────────────
# 0. Imports & setup
# ──────────────────────────────────────────────────────────────
import re, string, pickle, pathlib
from collections import Counter, defaultdict
from itertools import islice
import pandas as pd
import networkx as nx
import spacy
from tqdm.auto import tqdm   # progress bars

# Load spaCy with everything off except tagger+lemmatizer
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [2]:
DATA_DIR   = pathlib.Path("../data/processed/")
IN_FILE    = DATA_DIR / "hot100_translated.xlsx"
OUT_DF     = DATA_DIR / "billboard_2024_clean.pkl"
WINDOW_SIZES = [3, 5, 10]    # tweak as you like

In [11]:
df = pd.read_excel(IN_FILE)
print(f"Loaded {len(df):,} songs")

Loaded 758 songs


In [5]:
full_df = pd.read_excel(DATA_DIR / "dataset.xlsx")
print(f"Loaded {len(full_df):,} songs")

Loaded 5,200 songs


In [7]:
# ── 2. Count how many times each song appears ──
freq = (
    full_df.groupby("song_id")
        .size()           # Series: song_id → count
        .rename("dup_count")
        .reset_index()
)

freq

Unnamed: 0,song_id,dup_count
0,020299__that_mexican_ot,3
1,16_carriages__beyonce,6
2,23__chayce_beckham,20
3,25__rod_wave,10
4,28__zach_bryan,20
...,...,...
756,your_place__ashley_cooke,4
757,youre_a_mean_one_mr_grinch__thurl_ravenscroft,1
758,youre_gonna_go_far__noah_kahan,1
759,youre_losing_me_from_the_vault__taylor_swift,3


In [12]:
df

Unnamed: 0,date,rank,title,artist,image,peakPos,lastpos,weeks,isNew,song_id,lyrics,orig_lang,lyrics_en
0,2024-12-28,1,All I Want For Christmas Is You,Mariah Carey,https://charts-static.billboard.com/img/1994/1...,1,1,70,False,all_i_want_for_christmas_is_you__mariah_carey,i don't want a lot for christmas there is just...,en,i don't want a lot for christmas there is just...
1,2024-12-28,2,Rockin' Around The Christmas Tree,Brenda Lee,https://charts-static.billboard.com/img/1960/1...,1,2,63,False,rockin_around_the_christmas_tree__brenda_lee,rockin' around the christmas tree at the chris...,en,rockin' around the christmas tree at the chris...
2,2024-12-28,3,Last Christmas,Wham!,https://charts-static.billboard.com/img/1998/0...,3,4,44,False,last_christmas__wham,"ah, ah-ah ooh-woah oh-oh last christmas, i gav...",en,"ah, ah-ah ooh-woah oh-oh last christmas, i gav..."
3,2024-12-28,4,Jingle Bell Rock,Bobby Helms,https://charts-static.billboard.com/img/1958/1...,3,3,60,False,jingle_bell_rock__bobby_helms,"jingle bell, jingle bell, jingle bell rock jin...",en,"jingle bell, jingle bell, jingle bell rock jin..."
4,2024-12-28,5,A Holly Jolly Christmas,Burl Ives,https://charts-static.billboard.com/img/1998/0...,4,5,44,False,a_holly_jolly_christmas__burl_ives,ding-dong-ding ding-dong-ding have a holly jol...,en,ding-dong-ding ding-dong-ding have a holly jol...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,2024-01-06,43,I Saw Mommy Kissing Santa Claus,Jackson 5,https://charts-static.billboard.com/img/1969/1...,43,0,1,True,i_saw_mommy_kissing_santa_claus__jackson_5,wow! mommy's kissing santa claus! i saw mommy ...,en,wow! mommy's kissing santa claus! i saw mommy ...
754,2024-01-06,46,Merry Christmas,Ed Sheeran & Elton John,https://charts-static.billboard.com/img/2021/1...,42,0,7,False,merry_christmas__ed_sheeran__elton_john,build the fire and gather 'round the tree fill...,en,build the fire and gather 'round the tree fill...
755,2024-01-06,50,(There's No Place Like) Home For The Holidays ...,Perry Como With Mitchell Ayers And His Orchestra,https://charts-static.billboard.com/img/2005/1...,50,0,1,True,theres_no_place_like_home_for_the_holidays_195...,"oh, there's no place like home for the holiday...",en,"oh, there's no place like home for the holiday..."
756,2024-01-06,87,Winter Wonderland,Chloe,https://charts-static.billboard.com/img/2023/1...,87,96,2,False,winter_wonderland__chloe,"walk walk, walk, walking walk, walk, walking w...",en,"walk walk, walk, walking walk, walk, walking w..."


In [13]:
# ── 3. Attach the counts to the unique table ──
df = df.merge(freq, on="song_id", how="left")
df["dup_count"].fillna(1, inplace=True)   # safety for any IDs missing in df
df["dup_count"] = df["dup_count"].astype(int)

df["dup_count"].describe()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["dup_count"].fillna(1, inplace=True)   # safety for any IDs missing in df


count    758.000000
mean       6.840369
std        8.377936
min        1.000000
25%        1.000000
50%        3.000000
75%       10.000000
max       52.000000
Name: dup_count, dtype: float64

In [48]:
import contractions          # %pip install contractions

def expand_contractions(text: str) -> str:
    return contractions.fix(text, slang=True)   # covers 'cause → because, ain't → am not …

In [69]:
CUSTOM_STOP = {
    # discourse fillers / helper verbs
    "like","yeah","uh","oh","got","wanna","get","know","tell","come",
    "go","want","look","cause","not","no", "gon", "let"
}
ALLOWED_POS = {"NOUN","ADJ","VERB","ADV"}         # keep only content words

PLURAL_MAP = {"niggas": "nigga", "bitches": "bitch", "hoes": "hoe"}

PROFANE = {"nigga", "bitch", "hoe", "shit", "fuck"}   # expand as needed
PROF_PLACEHOLDER = "<profane>"

PUNCT_TABLE = str.maketrans("", "", string.punctuation)

In [57]:
def clean_text(text: str) -> str:
    """Lower-case + expand contractions, THEN strip punctuation."""
    if not isinstance(text, str):
        return ""          
    text = expand_contractions(text.lower().replace("’", "'"))
    text = text.translate(PUNCT_TABLE)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [63]:
def tokens_for_song(doc):
    """spaCy Doc → list of clean lemmas ready for graph building."""
    tokens = []
    for tok in doc:
        if (
            tok.pos_ in ALLOWED_POS
            and tok.is_alpha
            and not tok.is_stop
        ):
            lemma = tok.lemma_.lower()
            lemma = PLURAL_MAP.get(lemma, lemma)           # collapse slang plurals
            if lemma in CUSTOM_STOP or len(lemma) == 1:
                continue
            tokens.append(lemma)
    return tokens

In [70]:
tqdm.pandas(desc="spaCy")

df["lyrics_clean"] = (
    df["lyrics_en"]
      .map(clean_text)
      .progress_apply(lambda t: tokens_for_song(nlp(t)))
)

spaCy:   0%|          | 0/758 [00:00<?, ?it/s]

In [71]:
token_len_df = df["lyrics_clean"].str.len()
token_len_df.describe()

count    758.000000
mean     123.156992
std       64.229344
min        0.000000
25%       82.000000
50%      110.500000
75%      148.000000
max      580.000000
Name: lyrics_clean, dtype: float64

In [32]:
def build_weighted_graph(token_lists, weights, window=5):
    """
    token_lists : iterable[list[str]]
    weights     : iterable[int]      (same length; usually dup_count)
    window      : size of sliding window for co-occurrence
    -------------------------------------------------------------
    returns     : NetworkX Graph with edge attribute 'weight'
    """
    G = nx.Graph()
    for tokens, w in zip(token_lists, weights):
        if not tokens or w == 0:
            continue
        for i, tok in enumerate(tokens):
            for partner in islice(tokens, i+1, i+window):
                if tok == partner:
                    continue
                # accumulate weight
                if G.has_edge(tok, partner):
                    G[tok][partner]["weight"] += w
                else:
                    G.add_edge(tok, partner, weight=w)
    return G

In [72]:
print("Building popularity-weighted graph …")
G_exposure = build_weighted_graph(
    df["lyrics_clean"],
    df["dup_count"],
    window=5
)

print(f"Done: {G_exposure.number_of_nodes():,} nodes | "
      f"{G_exposure.number_of_edges():,} edges")

Building popularity-weighted graph …
Done: 10,588 nodes | 195,729 edges


In [73]:
# What words dominate once popularity is factored in?
top_deg = sorted(G_exposure.degree, key=lambda x: x[1], reverse=True)[:20]
print("\nTop 20 words by degree:")
for w, d in top_deg:
    print(f"{w:<12} {d:>6}")


Top 20 words by degree:
bitch          2144
nigga          2060
love           1837
shit           1796
fuck           1796
time           1561
baby           1501
think          1397
feel           1350
way            1331
good           1326
need           1312
say            1204
life           1191
right          1175
leave          1162
girl           1103
man            1044
night          1008
thing          1004


In [75]:
# Export

OUT_GEXF  = DATA_DIR / f"word_graph_exposure_clean.gexf"
G_gephi = nx.relabel_nodes(G_exposure, str)
nx.write_gexf(G_gephi, OUT_GEXF)

print(f"✅  Exported → {OUT_GEXF.resolve()}")


✅  Exported → C:\Users\royic\OneDrive\Desktop\לימודים\שנה ג\סמסטר ב\טקסט כנתונים\billboard-100-lyrics-analysis\data\processed\word_graph_exposure_clean.gexf
