## Feature Engineering: Aggregate Spotify Audio Features by Artist

Apply the `AggregatedAudioGenerator` to turn track-level Spotify data into artist-level features suitable for downstream recommenders.


**this process is memory intensive**

In [1]:
import pandas as pd

from recommender_pipeline.data_loader import StandardLoader
from recommender_pipeline.feature_generators.numerical_aggregator import NumericalAggregator 
from recommender_pipeline.feature_generators.unique_counter import UniqueCounter 
from recommender_pipeline.feature_generators.top_value_extractor import TopValueExtractor 
from recommender_pipeline.feature_generators.top_k_values_combiner import TopKValuesCombiner 
from recommender_pipeline.feature_generators.flag_noscore import MissingValueCounter 
from recommender_pipeline.feature_generators.row_counter import RowCounter 
from recommender_pipeline.feature_generators.cleanup import KeepGroupFeats
from recommender_pipeline.pipeline import Pipeline

pd.set_option('display.max_columns', 50)
pd.set_option('display.precision', 3)

In [2]:
# Load the Spotify music dataset
loader = StandardLoader(folder_path="../data/spotify_musics.parquet")
tracks_df = loader.load()
tracks_df.shape

(3528311, 18)

In [3]:
# Quick look at the raw track-level data
tracks_df.head()

Unnamed: 0,artist_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,Comedy,73.0,230666,0.0,0.676,0.461,1,-6.746,0,0.143,0.032,1.01e-06,0.358,0.715,87.917,4,acoustic
1,Ben Woodward,Ghost - Acoustic,55.0,149610,0.0,0.42,0.166,1,-17.235,1,0.076,0.924,5.56e-06,0.101,0.267,77.489,4,acoustic
2,Ingrid Michaelson;ZAYN,To Begin Again,57.0,210826,0.0,0.438,0.359,0,-9.734,1,0.056,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,Kina Grannis,Can't Help Falling In Love,71.0,201933,0.0,0.266,0.06,0,-18.515,1,0.036,0.905,7.07e-05,0.132,0.143,181.74,3,acoustic
4,Chord Overstreet,Hold On,82.0,198853,0.0,0.618,0.443,2,-9.681,1,0.053,0.469,0.0,0.083,0.167,119.949,4,acoustic


In [4]:
tracks_df['artist_name'].nunique()

329401

In [5]:
#we will only keep artist with at least 1 track has a popularity score, which means this artist have been listed to in spotify
#otherwise we keep too many new artists that will be treated as noise in the recommender system
artist_pop = tracks_df.groupby("artist_name")['popularity'].min()

In [None]:
#Artists with at least one track with popularity > 0
(artist_pop[artist_pop > 0]).shape

(69055,)

In [None]:
#tracks kept
tracks_df[tracks_df['artist_name'].isin(artist_pop[artist_pop > 0].index)].shape

(1537652, 18)

In [11]:
tracks_df = tracks_df[tracks_df['artist_name'].isin(artist_pop[artist_pop > 0].index)].copy()

In [12]:
groupby_col = "artist_name"
original_features = tracks_df.columns.tolist()[1:]  # Exclude 'artist_name' from original features

In [13]:
#create feature pipeline
feature_pipeline = Pipeline(steps=[
    ('num_agg', NumericalAggregator(groupby_col=groupby_col)),
    ('unique_counter', UniqueCounter(groupby_col=groupby_col, categorical_feature_cols=['track_name','track_genre'])),
    ('row_counter', RowCounter(groupby_col=groupby_col)),
    ('top_extractor', TopValueExtractor(groupby_col=groupby_col, categorical_feature_cols=['track_genre'])),
    ('top_k_combiner', TopKValuesCombiner(groupby_col=groupby_col, k=5, categorical_feature_cols=['track_genre'])),
    ('missing_counter', MissingValueCounter(groupby_col=groupby_col, feature_cols=['popularity'])),
    ('cleanup', KeepGroupFeats(original_features = original_features, group_col=groupby_col))
])
# Use in a pipeline
transformed_df = feature_pipeline.fit_transform(tracks_df)

In [14]:
transformed_df.shape

(69055, 82)

In [15]:
for x in transformed_df.columns:
    print(x)

artist_name
ft_popularity_mean
ft_popularity_median
ft_popularity_std
ft_popularity_min
ft_popularity_max
ft_duration_ms_mean
ft_duration_ms_median
ft_duration_ms_std
ft_duration_ms_min
ft_duration_ms_max
ft_explicit_mean
ft_explicit_median
ft_explicit_std
ft_explicit_min
ft_explicit_max
ft_danceability_mean
ft_danceability_median
ft_danceability_std
ft_danceability_min
ft_danceability_max
ft_energy_mean
ft_energy_median
ft_energy_std
ft_energy_min
ft_energy_max
ft_key_mean
ft_key_median
ft_key_std
ft_key_min
ft_key_max
ft_loudness_mean
ft_loudness_median
ft_loudness_std
ft_loudness_min
ft_loudness_max
ft_mode_mean
ft_mode_median
ft_mode_std
ft_mode_min
ft_mode_max
ft_speechiness_mean
ft_speechiness_median
ft_speechiness_std
ft_speechiness_min
ft_speechiness_max
ft_acousticness_mean
ft_acousticness_median
ft_acousticness_std
ft_acousticness_min
ft_acousticness_max
ft_instrumentalness_mean
ft_instrumentalness_median
ft_instrumentalness_std
ft_instrumentalness_min
ft_instrumentalness_max

In [17]:
# Persist aggregated features for downstream steps
output_path = "../data/artist_audio_features.parquet"
transformed_df.to_parquet(output_path, index=False)
print(f"Saved aggregated artist features to {output_path} with shape {transformed_df.shape}")

Saved aggregated artist features to ../data/artist_audio_features.parquet with shape (69055, 82)
