In [2]:
import pandas as pd
import time
import pandas as pd
from spotipy import Spotify
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm


In [114]:
# Load the starting dataset from data/final_dataset.csv into `df`.
df = pd.read_csv("data/final_dataset.csv")

In [101]:
# Remove these columns from `df`.
# errors="ignore" keeps the cell re-runnable: because `df` is reassigned to the
# result, a second execution would otherwise raise KeyError for columns that
# were already dropped.
df = df.drop(
    columns=[
        'playlistname', 'v_audio_raw', 'a_audio_raw',
        'v_audio_norm', 'a_audio_norm',
        'h_lyrics_raw', 'h_lyrics_01', 'h_lyrics_norm', 'h_track',
        'catharsis_score', 'url',
    ],
    errors="ignore",
)

In [102]:
# Display the column labels that remain in `df`.
df.columns

Index(['source', 'domain', 'chart', 'region', 'date', 'position', 'title',
       'artist', 'scraped_at', 'track_name', 'artists', 'album_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre', 'lyrics', 'lyrics_language', 'happiness_from_lyrics'],
      dtype='object')

In [18]:
# Load the lyrics table from data/ds2.csv into `lyrics`.
lyrics = pd.read_csv("data/ds2.csv")

In [44]:
# Remove these columns from `lyrics`.
# errors="ignore" makes the cell safe to re-run: `lyrics` is reassigned, so a
# second execution would otherwise fail with KeyError on the missing columns.
lyrics = lyrics.drop(columns=['id', 'views', 'features'], errors="ignore")

In [45]:
# Display the column labels that remain in `lyrics`.
lyrics.columns

Index(['title', 'tag', 'artist', 'year', 'lyrics'], dtype='object')

In [59]:
# Load the Spotify track table from spotify_tracks.csv into `tracks`.
# NOTE(review): unlike the other inputs, this path has no "data/" prefix —
# confirm the file really lives next to the notebook.
tracks = pd.read_csv("spotify_tracks.csv")

In [63]:
# Display the column labels of `tracks`.
tracks.columns

Index(['track_id', 'artists', 'track_name', 'popularity', 'duration_ms',
       'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre'],
      dtype='object')

----

In [65]:
def normalize(s):
    """Build a comparison key for every string in ``s``.

    Each value is lower-cased, trimmed of leading/trailing whitespace, and then
    stripped of every character that is neither a word character nor
    whitespace.

    Parameters
    ----------
    s : pandas.Series
        Series whose values are accessed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        The transformed values, carrying the same index as ``s``.
    """
    lowered = s.str.lower()
    trimmed = lowered.str.strip()
    # Trimming happens before punctuation removal, so whitespace that sat next
    # to a removed character is left in place (e.g. "Hey !" -> "hey ").
    return trimmed.str.replace(r"[^\w\s]", "", regex=True)

# Add the `title_norm` / `artist_norm` matching keys to every frame, always
# through `normalize` so the key format cannot drift between frames.
for d in [df, lyrics, tracks]:
    # When a frame has both columns, `track_name` wins over `title`; each key
    # is now computed only once per frame.
    if "track_name" in d.columns:
        d["title_norm"] = normalize(d["track_name"])
    elif "title" in d.columns:
        d["title_norm"] = normalize(d["title"])

    if "artist" in d.columns:
        d["artist_norm"] = normalize(d["artist"])
    elif "artists" in d.columns:
        # Frames that only have `artists` (here: `tracks`) are keyed on the
        # first entry of the ";"-separated value.
        d["artist_norm"] = normalize(d["artists"].str.split(";").str[0])


In [56]:
# Track ids already present in `df`, used to exclude them from the new sample.
# `track_id` is only added to `df` by the Spotify lookup cell at the end of the
# notebook, so on a fresh top-to-bottom run the column does not exist yet.
# Fall back to an empty set instead of raising KeyError.
if "track_id" in df.columns:
    existing_track_ids = set(df["track_id"].dropna())
else:
    print("df has no 'track_id' column yet; no track will be excluded as already present.")
    existing_track_ids = set()

In [58]:
# Show the column labels of both frames as plain lists.
print(f"lyrics: {lyrics.columns.tolist()}")
print(f"tracks: {tracks.columns.tolist()}")


lyrics: ['title', 'tag', 'artist', 'year', 'lyrics', 'title_norm', 'artist_norm']
tracks: ['track_id', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'track_genre', 'title_norm']


In [69]:
# Attach Spotify track rows to lyrics rows through the normalized keys.
merge_keys = ["title_norm", "artist_norm"]

# pandas matches null join keys against each other, so rows with a missing
# title or artist on both sides would be paired as if they were the same song.
# Drop them before the inner join.
lyrics_with_track_id = lyrics.dropna(subset=merge_keys).merge(
    tracks.dropna(subset=merge_keys),
    on=merge_keys,
    how="inner",
)

In [72]:
# Ratio of merged rows to lyrics rows, printed as a percentage.
match_rate = lyrics_with_track_id.shape[0] / lyrics.shape[0]
print(f"Taux de matching: {match_rate:.2%}")


Taux de matching: 0.56%


In [77]:
# Keep only the merged rows whose track_id is not in `existing_track_ids`.
is_known = lyrics_with_track_id["track_id"].isin(existing_track_ids)
new_tracks = lyrics_with_track_id[~is_known]

In [79]:
# Display (rows, columns) of the candidate pool.
new_tracks.shape

(32940, 26)

In [80]:
# Candidate pool: rows that have both lyrics and a popularity value.
# The merge that produced `new_tracks` can repeat a track_id, so keep one row
# per track_id; otherwise the same track could be sampled more than once.
# `.copy()` comes last so the column assignment below writes to an independent
# frame.
candidates = (
    new_tracks
    .dropna(subset=["lyrics", "popularity"])
    .drop_duplicates(subset="track_id")
    .copy()
)

# Popularity buckets; include_lowest=True puts a popularity of exactly 0 in
# the first bucket.
bins = [0, 20, 40, 60, 80, 100]
labels = ["0-20", "20-40", "40-60", "60-80", "80-100"]

candidates["pop_bin"] = pd.cut(
    candidates["popularity"],
    bins=bins,
    labels=labels,
    include_lowest=True
)

# Share of the sample drawn from each bucket.
bin_weights = {
    "0-20": 0.10,
    "20-40": 0.20,
    "40-60": 0.30,
    "60-80": 0.25,
    "80-100": 0.15
}

TARGET_TOTAL = 1000

# round() instead of int(): a float product such as 100 * 0.29 evaluates to
# 28.999999999999996, which int() would truncate to 28.
bin_targets = {
    b: round(TARGET_TOTAL * w)
    for b, w in bin_weights.items()
}


In [81]:
# Draw up to `n` rows from each popularity bucket with a fixed seed.
sampled_bins = []

for bin_label, n in bin_targets.items():
    subset = candidates[candidates["pop_bin"] == bin_label]

    # Nothing to draw from this bucket.
    if subset.empty:
        continue

    # Never ask for more rows than the bucket holds.
    sampled = subset.sample(
        n=min(len(subset), n),
        random_state=42
    )
    sampled_bins.append(sampled)

if sampled_bins:
    sampled_tracks = pd.concat(sampled_bins, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; keep the column layout of
    # `candidates` with zero rows so downstream cells still work.
    sampled_tracks = candidates.iloc[0:0].reset_index(drop=True)


In [103]:
# Append the sampled rows to `df`. The sample is first aligned to df's columns:
# columns df lacks are dropped, columns the sample lacks become NaN.
# Caution: `df` is reassigned, so re-running this cell appends the sample again.
df = pd.concat(
    [df, sampled_tracks.reindex(df.columns, axis="columns")],
    ignore_index=True,
)

In [104]:
# Count missing values per column after appending the sampled rows.
df.isna().sum()

source                   1000
domain                   1000
chart                    1000
region                   1402
date                     1066
position                 1000
title                       0
artist                      0
scraped_at               1000
track_name                  0
artists                     0
album_name               1000
popularity                  0
duration_ms                 0
explicit                    0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
time_signature              0
track_genre                 0
lyrics                      1
lyrics_language          1001
happiness_from_lyrics    1003
dtype: int64

In [121]:
# Remove duplicate (artist, track_name) pairs, keeping the first row in
# `source` order with missing `source` values last. Rows that have a `source`
# therefore win over rows that do not.
df = (
    df
    # kind="stable" preserves the existing row order among equal `source`
    # values, so the row kept by keep="first" is deterministic; the default
    # sort algorithm gives no such guarantee.
    .sort_values(by="source", na_position="last", kind="stable")
    .drop_duplicates(subset=["artist", "track_name"], keep="first")
)


In [88]:
# Write `df` to data/final.csv without the index column.
df.to_csv("data/final.csv",index=False)

-----

In [None]:
# Spotify client using the client-credentials flow.
# No credentials are written in the notebook: with no arguments,
# SpotifyClientCredentials reads SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
# from the environment. Set them before starting the kernel.
sp = Spotify(auth_manager=SpotifyClientCredentials())


In [47]:
def search_track_id(sp, title, artist):
    """Return the Spotify id of the most popular track matching title and artist.

    Parameters
    ----------
    sp : object
        Client exposing ``search(q=..., type=..., limit=...)`` and returning a
        mapping shaped like ``{"tracks": {"items": [...]}}``.
    title : str
        Track title to search for.
    artist : str
        Artist name to search for.

    Returns
    -------
    str or None
        The ``"id"`` of the highest-``"popularity"`` item among at most three
        results (the first one on ties). ``None`` when the search returns no
        item, or when ``title`` or ``artist`` is missing or blank; in that
        case no request is made.
    """
    def _as_term(value):
        # Missing cells arrive as None/NaN; interpolating them directly would
        # search for the literal text "nan" / "None".
        if not isinstance(value, str):
            value = "" if pd.isna(value) else str(value)
        # A double quote inside the value would close the quoted phrase early.
        return value.replace('"', "").strip()

    title_term = _as_term(title)
    artist_term = _as_term(artist)
    if not title_term or not artist_term:
        return None

    query = f'track:"{title_term}" artist:"{artist_term}"'
    results = sp.search(q=query, type="track", limit=3)

    # Skip null entries and leave the response list untouched.
    items = [item for item in results["tracks"]["items"] if item]
    if not items:
        return None

    # max() returns the first maximal element, matching a stable descending sort.
    return max(items, key=lambda item: item["popularity"])["id"]


In [48]:
# Look up a Spotify track id for every row of `df` (one API request per row).
# Iterating the two columns directly avoids building a Series per row as
# apply(axis=1) does, and tqdm shows progress for this long-running network
# loop. Values are assigned positionally, so row alignment is unchanged.
df["track_id"] = [
    search_track_id(sp, title, artist)
    for title, artist in tqdm(
        zip(df["title"], df["artist"]),
        total=len(df),
        desc="Spotify track lookup",
    )
]