## Feature Engineering: Aggregate Spotify Audio Features by Artist

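Apply the `AggregatedAudioGenerator` to turn track-level Spotify data (`../data/artists_spotify_matched_raw.parquet`) into artist-level features, with one set of summary statistics per `artist_name`, suitable for downstream recommenders. The result is saved to `../data/artist_audio_features.parquet`.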

In [1]:
import pandas as pd

from recommender_pipeline.data_loaders.data_loader import StandardLoader
from recommender_pipeline.feature_generators.aggregated_audio_features import AggregatedAudioGenerator

# Notebook display settings: show up to 50 columns of wide frames and
# render floats with 3 digits of precision.
pd.options.display.max_columns = 50
pd.options.display.precision = 3

In [2]:
# Load the raw, Spotify-matched track-level dataset.
# NOTE(review): the keyword is `folder_path`, but the value is a single
# .parquet path — confirm this is what StandardLoader expects.
raw_tracks_path = "../data/artists_spotify_matched_raw.parquet"
loader = StandardLoader(folder_path=raw_tracks_path)
tracks_df = loader.load()
tracks_df.shape  # (rows, columns)

(1108570, 20)

In [3]:
# Preview the first five rows of the raw track-level table
tracks_df.head(n=5)

Unnamed: 0,artist_id,name,artist_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,2,Diary of Dreams,Diary Of Dreams,Giftraum,14.0,216440,,0.727,0.528,0,-7.358,0,0.035,0.529,3.33e-05,0.101,0.139,134.069,4,industrial
1,2,Diary of Dreams,Diary Of Dreams,Undividable - Dcii E-Mix Edit,8.0,270707,,0.688,0.95,0,-6.571,0,0.028,0.032,0.217,0.107,0.831,126.054,4,industrial
2,2,Diary of Dreams,Diary Of Dreams,The Luxury of Insanity,19.0,357413,,0.627,0.896,8,-6.739,0,0.035,0.18,0.746,0.146,0.366,120.018,4,industrial
3,2,Diary of Dreams,Diary Of Dreams,A Day in December,16.0,252360,,0.596,0.926,7,-4.927,0,0.05,0.315,0.000521,0.13,0.32,74.986,4,industrial
4,2,Diary of Dreams,Diary Of Dreams,Dream of a Ghost,16.0,337493,,0.562,0.823,5,-8.552,1,0.031,0.369,0.504,0.079,0.513,127.97,4,industrial


In [4]:
# Column dtypes and non-null counts for the raw tracks. In the recorded output,
# `popularity` and `explicit` are populated for only a minority of the rows.
tracks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108570 entries, 0 to 1108569
Data columns (total 20 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   artist_id         1108570 non-null  int64  
 1   name              1108570 non-null  object 
 2   artist_name       1108570 non-null  object 
 3   track_name        1108568 non-null  object 
 4   popularity        422056 non-null   float64
 5   duration_ms       1108570 non-null  int64  
 6   explicit          31011 non-null    float64
 7   danceability      1108570 non-null  float64
 8   energy            1108570 non-null  float64
 9   key               1108570 non-null  int64  
 10  loudness          1108570 non-null  float64
 11  mode              1108570 non-null  int64  
 12  speechiness       1108570 non-null  float64
 13  acousticness      1108570 non-null  float64
 14  instrumentalness  1108570 non-null  float64
 15  liveness          1108570 non-null  float64
 16  

In [5]:
# Build artist-level aggregates from the track-level rows.
# `artist_id` is removed before aggregation; the recorded output is keyed by
# `artist_name`.
# NOTE(review): presumably the id is dropped so it is not summarised like a
# numeric audio feature — confirm against AggregatedAudioGenerator.
columns_to_exclude = ['artist_id']
agg_gen = AggregatedAudioGenerator()
artist_features = agg_gen.fit_transform(tracks_df.drop(columns=columns_to_exclude))
artist_features.head(n=5)

Unnamed: 0,artist_name,popularity_mean,popularity_median,popularity_std,popularity_min,popularity_max,duration_ms_mean,duration_ms_median,duration_ms_std,duration_ms_min,duration_ms_max,explicit_mean,explicit_median,explicit_std,explicit_min,explicit_max,danceability_mean,danceability_median,danceability_std,danceability_min,danceability_max,energy_mean,energy_median,energy_std,energy_min,...,instrumentalness_min,instrumentalness_max,liveness_mean,liveness_median,liveness_std,liveness_min,liveness_max,valence_mean,valence_median,valence_std,valence_min,valence_max,tempo_mean,tempo_median,tempo_std,tempo_min,tempo_max,time_signature_mean,time_signature_median,time_signature_std,time_signature_min,time_signature_max,track_count,track_genre_top,track_genre_n_unique
0,Superior,,,,,,218250.0,218250.0,,218250,218250,,,,,,0.75,0.75,,0.75,0.75,0.917,0.917,,0.917,...,0.0,0.0,0.126,0.126,,0.126,0.126,0.604,0.604,,0.604,0.604,95.039,95.039,,95.039,95.039,4.0,4.0,,4,4,1,,0
1,!!!,21.795,22.0,9.698,5.0,43.0,302480.984,281557.5,100924.228,48116,719320,,,,,,0.738,0.754,0.098,0.449,0.934,0.791,0.811,0.111,0.347,...,0.0,0.925,0.174,0.113,0.152,0.037,0.942,0.659,0.69,0.217,0.101,0.97,119.392,120.013,6.707,94.331,135.022,4.0,4.0,0.0,4,4,640,electronic,5
2,!Dela Dap,,,,,,277749.056,265180.0,62188.913,176758,462776,,,,,,0.777,0.776,0.098,0.578,0.97,0.751,0.732,0.113,0.502,...,0.0,0.811,0.165,0.113,0.118,0.02,0.445,0.685,0.671,0.213,0.148,0.972,132.084,125.031,27.319,93.016,202.025,4.0,4.0,0.0,4,4,36,,0
3,!Distain,,,,,,279304.0,279304.0,69751.841,229982,328626,,,,,,0.613,0.613,0.039,0.585,0.64,0.885,0.885,0.01,0.878,...,6.07e-05,0.007,0.117,0.117,0.071,0.067,0.167,0.65,0.65,0.115,0.569,0.731,117.854,117.854,22.432,101.992,133.716,4.0,4.0,0.0,4,4,2,,0
4,"""DEMONS""",0.273,0.0,0.876,0.0,3.0,164122.848,159987.0,50863.982,100187,358240,,,,,,0.219,0.217,0.083,0.081,0.394,0.956,0.962,0.038,0.793,...,0.0,0.935,0.193,0.151,0.127,0.034,0.486,0.42,0.406,0.193,0.04,0.855,131.764,141.408,40.207,71.104,187.523,4.0,4.0,0.0,4,4,33,garage,1


In [6]:
# Inspect the aggregated artist-level features: dtypes and non-null counts.
# In the recorded output the `popularity_*` and `explicit_*` aggregates are
# missing for many artists, and every `*_std` column has fewer non-null values
# than its matching `*_mean`.
# NOTE(review): downstream consumers probably need an imputation strategy for
# these gaps — confirm.
artist_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14634 entries, 0 to 14633
Data columns (total 79 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   artist_name              14634 non-null  object 
 1   popularity_mean          8949 non-null   float64
 2   popularity_median        8949 non-null   float64
 3   popularity_std           8169 non-null   float64
 4   popularity_min           8949 non-null   float64
 5   popularity_max           8949 non-null   float64
 6   duration_ms_mean         14634 non-null  float64
 7   duration_ms_median       14634 non-null  float64
 8   duration_ms_std          13159 non-null  float64
 9   duration_ms_min          14634 non-null  int64  
 10  duration_ms_max          14634 non-null  int64  
 11  explicit_mean            3404 non-null   float64
 12  explicit_median          3404 non-null   float64
 13  explicit_std             2562 non-null   float64
 14  explicit_min          

In [7]:
# Rank artists by the number of tracks they contribute and show the ten largest
ranked_by_track_count = artist_features.sort_values(by='track_count', ascending=False)
ranked_by_track_count.head(n=10)

Unnamed: 0,artist_name,popularity_mean,popularity_median,popularity_std,popularity_min,popularity_max,duration_ms_mean,duration_ms_median,duration_ms_std,duration_ms_min,duration_ms_max,explicit_mean,explicit_median,explicit_std,explicit_min,explicit_max,danceability_mean,danceability_median,danceability_std,danceability_min,danceability_max,energy_mean,energy_median,energy_std,energy_min,...,instrumentalness_min,instrumentalness_max,liveness_mean,liveness_median,liveness_std,liveness_min,liveness_max,valence_mean,valence_median,valence_std,valence_min,valence_max,tempo_mean,tempo_median,tempo_std,tempo_min,tempo_max,time_signature_mean,time_signature_median,time_signature_std,time_signature_min,time_signature_max,track_count,track_genre_top,track_genre_n_unique
14103,Wolfgang Amadeus Mozart,15.827,15.0,13.316,0.0,60.0,352867.415,331800.0,185200.0,15120,2065066,0.0,0.0,0.0,0.0,0.0,0.344,0.35,0.121,0.0,0.922,0.115,0.098,0.088,0.000968,...,0.0,0.994,0.159,0.115,0.121,0.027,0.96,0.291,0.261,0.199,0.0,0.97,110.729,111.305,28.66,0.0,203.426,3.735,4.0,0.609,0,5,12684,classical,1
6403,Johann Sebastian Bach,17.728,16.0,9.343,0.0,64.0,226448.871,202506.0,139900.0,11733,1358751,,,,,,0.338,0.336,0.14,0.0,0.875,0.163,0.134,0.126,0.000583,...,0.0,0.999,0.15,0.119,0.096,0.035,0.961,0.462,0.428,0.295,0.0,0.988,104.479,100.218,29.072,0.0,219.986,3.667,4.0,0.733,0,5,7541,german,2
4814,Frédéric Chopin,23.393,21.0,11.226,0.0,66.0,281184.538,255004.0,190100.0,23614,2298940,,,,,,0.321,0.312,0.099,0.0,0.84,0.087,0.052,0.092,0.000667,...,1.5e-06,0.984,0.119,0.101,0.087,0.042,0.85,0.161,0.102,0.156,0.0,0.973,92.99,81.242,28.998,0.0,209.974,3.621,4.0,0.784,0,5,7440,classical,1
7724,Ludwig van Beethoven,18.12,16.0,10.562,0.0,69.0,467932.13,430980.0,269400.0,15040,3032093,,,,,,0.319,0.321,0.108,0.0,0.781,0.112,0.092,0.094,0.000546,...,0.0,0.988,0.141,0.106,0.113,0.035,0.937,0.206,0.162,0.159,0.0,0.979,107.186,102.339,30.117,0.0,210.659,3.745,4.0,0.624,0,5,5130,classical,2
5207,Grateful Dead,13.297,12.0,6.419,2.0,63.0,458315.165,400200.0,266000.0,22866,2791933,0.0,0.0,0.0,0.0,0.0,0.541,0.535,0.109,0.122,0.879,0.497,0.502,0.166,0.00269,...,0.0,0.974,0.425,0.346,0.29,0.029,0.999,0.608,0.637,0.195,0.032,0.976,122.066,119.394,26.938,53.678,236.799,3.875,4.0,0.485,1,5,4577,psych-rock,2
5073,Glee Cast,31.278,31.0,9.892,4.0,59.0,205889.47,208993.0,43410.0,69466,356133,0.0,0.0,0.0,0.0,0.0,0.55,0.571,0.153,0.105,0.98,0.656,0.715,0.225,0.0396,...,0.0,0.295,0.2,0.132,0.168,0.014,0.972,0.509,0.5,0.238,0.036,0.979,123.815,123.071,29.303,52.63,219.771,3.867,4.0,0.484,1,5,3108,club,1
10042,Pyotr Ilyich Tchaikovsky,25.115,23.0,10.548,0.0,59.0,373749.004,277906.0,306300.0,11093,2155011,,,,,,0.291,0.272,0.135,0.0,0.812,0.147,0.115,0.126,0.00137,...,0.0,0.992,0.163,0.114,0.129,0.017,0.947,0.23,0.136,0.227,0.0,0.987,105.361,98.851,30.846,0.0,205.804,3.732,4.0,0.608,0,5,2949,classical,1
4144,Elvis Presley,20.315,14.0,16.266,0.0,80.0,173919.059,161453.5,68820.0,4826,881481,0.0,0.0,0.0,0.0,0.0,0.477,0.475,0.135,0.0,0.859,0.493,0.49,0.25,0.0059,...,0.0,0.962,0.362,0.238,0.288,0.0,0.985,0.596,0.613,0.248,0.0,0.984,112.639,106.378,29.23,0.0,217.497,3.813,4.0,0.537,0,5,2790,rock-n-roll,3
13742,Various Artists,,,,,,477389.143,205573.0,1014000.0,2293,5751000,,,,,,0.532,0.573,0.23,0.0,0.979,0.528,0.535,0.264,0.0,...,0.0,0.995,0.2,0.13,0.179,0.0,0.993,0.491,0.507,0.305,0.0,0.991,109.712,113.457,38.54,0.0,220.714,3.66,4.0,1.015,0,5,2431,,0
13854,Vitamin String Quartet,46.667,48.0,4.163,42.0,50.0,230990.05,221684.0,59750.0,14400,670026,,,,,,0.484,0.5,0.186,0.0,0.976,0.293,0.282,0.114,0.00394,...,0.0,0.981,0.164,0.124,0.111,0.031,0.961,0.44,0.422,0.229,0.0,0.976,120.899,119.935,31.927,0.0,215.669,3.879,4.0,0.491,0,5,2343,pop,1


In [8]:
# Persist aggregated features for downstream steps.
# The table is written as parquet without the DataFrame index (index=False).
# The file therefore carries `artist_name` but no `artist_id`, because that
# column was dropped before aggregation.
output_path = "../data/artist_audio_features.parquet"
artist_features.to_parquet(output_path, index=False)
# Report the destination and the (rows, columns) shape of what was written.
print(f"Saved aggregated artist features to {output_path} with shape {artist_features.shape}")

Saved aggregated artist features to ../data/artist_audio_features.parquet with shape (14634, 79)
