# Imports

Including the popularity columns sharply reduces AUC, from 0.7 to 0.3.

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

from recommender_pipeline.data_loader import StandardLoader

import pandas as pd

# Load Data

In [2]:
# Spotify artist features and the user-artist interaction table.
spotify_loader = StandardLoader("../data/artist_audio_features.parquet")
spotify_df = spotify_loader.load()

interactions_loader = StandardLoader("../data/user_artists.dat")
user_df = interactions_loader.load()

# Mapper to the global artist IDs we created; it is the output of the
# artist-name matching step.
mapping_loader = StandardLoader("../data/artist_global_map.pkl")
artist_global_map = mapping_loader.load()

In [3]:
# Size check of both sources before any filtering.
n_interacted_artists = user_df['artistID'].nunique()
print(f"Artists from Spotify: {spotify_df.shape}")
print(f"Interactions: {user_df.shape}")
print(f"Artists with interactions (may or may not be in Spotify): {n_interacted_artists}")

Artists from Spotify: (329401, 82)
Interactions: (92834, 3)
Artists with interactions (may or may not be in Spotify): 17632


In [4]:
# Mapping summary with readable labels. Note that it covers the entire
# Spotify dataset, not only the artists that appear in the interactions.
summary_labels = {
    "primary_artists": "LastFM Artists",
    "secondary_artists": "Spotify Artists",
    "total_unified_ids": "All Artists",
    "unmatched_primary": "Unmatched LastFM Artists",
    "unmatched_secondary": "Unmatched Spotify Artists",
}
mapping_summary = pd.Series(artist_global_map.get_mapping_summary())
mapping_summary.rename(index=summary_labels)

All Artists                       332381
LastFM Artists                     17632
Spotify Artists                   329401
matched_artists                    14652
Unmatched LastFM Artists            2980
Unmatched Spotify Artists         314749
id_range                     (1, 332381)
dtype: object

In [5]:
# Column names, non-null counts and dtypes of the Spotify feature table.
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329401 entries, 0 to 329400
Data columns (total 82 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   artist_name                    329401 non-null  object 
 1   ft_popularity_mean             83740 non-null   float64
 2   ft_popularity_median           83740 non-null   float64
 3   ft_popularity_std              57877 non-null   float64
 4   ft_popularity_min              83740 non-null   float64
 5   ft_popularity_max              83740 non-null   float64
 6   ft_duration_ms_mean            329401 non-null  float64
 7   ft_duration_ms_median          329401 non-null  float64
 8   ft_duration_ms_std             184130 non-null  float64
 9   ft_duration_ms_min             329401 non-null  int64  
 10  ft_duration_ms_max             329401 non-null  int64  
 11  ft_explicit_mean               31437 non-null   float64
 12  ft_explicit_median            

## Put IDs

In [6]:
# Translate each source's own artist key into the shared global ID:
# Spotify rows are keyed by artist name ('secondary' side of the mapper),
# interaction rows by artistID ('primary' side).
spotify_ids = artist_global_map.transform_dataframe(
    spotify_df[['artist_name']], 'artist_name', 'secondary', 'global_id'
)
spotify_df['global_id'] = spotify_ids['global_id']

interaction_ids = artist_global_map.transform_dataframe(
    user_df[['artistID']], 'artistID', 'primary', 'global_id'
)
user_df['global_id'] = interaction_ids['global_id']

# Flag the Spotify artists that appear in at least one interaction.
interacted_ids = user_df['global_id']
spotify_df['in_interactions'] = spotify_df['global_id'].isin(interacted_ids)


In [9]:
# How many Spotify artists also appear in the interactions.
spotify_df.loc[spotify_df['in_interactions']].shape

(14652, 84)

In [12]:
# For now, keep an artist if it appears in the interactions or has a
# positive median popularity score.
has_interactions = spotify_df['in_interactions']
has_popularity = spotify_df['ft_popularity_median'] > 0
keepme = spotify_df[has_interactions | has_popularity]
keepme.shape

(82757, 84)

In [13]:
# Continue with an independent copy of the filtered frame, so the in-place
# edits in later cells do not act on a slice of the original.
spotify_df = keepme.copy()

In [14]:
# Remove every column whose name mentions "explicit": too many nulls.
explicit_cols = [name for name in spotify_df.columns if 'explicit' in name]
spotify_df = spotify_df.drop(columns=explicit_cols)

In [136]:
# Disabled experiment, kept for reference only (nothing in this cell runs):
# #drop mean min max std, keep only median
# drop_cols = [col for col in spotify_df.columns if any(suffix in col for suffix in ['_mean', '_min', '_max', '_std'])]
# spotify_df.drop(columns=drop_cols, inplace=True)

In [15]:
# Exclude the popularity features: including them reduces AUC from 0.7 to 0.3.
drop_cols = [name for name in spotify_df.columns if "popularity" in name]
drop_cols
spotify_df = spotify_df.drop(columns=drop_cols)

['ft_popularity_mean',
 'ft_popularity_median',
 'ft_popularity_std',
 'ft_popularity_min',
 'ft_popularity_max',
 'ft_popularity_missing_count']

In [None]:
# Genre distribution, counting artists without a genre under "unknown".
# nunique() skips missing values, so it reports one category fewer than the
# value counts, which include the "unknown" bucket.
genre_counts = spotify_df['ft_track_genre_top'].fillna("unknown").value_counts()
spotify_df['ft_track_genre_top'].nunique() #unique genres
genre_counts.head(20)
genre_counts.describe()

114

ft_track_genre_top
unknown        5748
jazz           2306
french         1952
folk           1850
funk           1763
german         1703
gospel         1698
k-pop          1634
chill          1633
hip-hop        1597
spanish        1487
emo            1462
swedish        1449
dub            1414
deep-house     1377
blues          1282
hardcore       1247
disco          1243
black-metal    1224
classical      1196
Name: count, dtype: int64

count     115.000000
mean      719.626087
std       696.533503
min        20.000000
25%       238.500000
50%       610.000000
75%       986.500000
max      5748.000000
Name: count, dtype: float64

In [20]:
# Artists per top genre, with absent genres shown as "missing".
spotify_df['ft_track_genre_top'].fillna("missing").value_counts()

ft_track_genre_top
missing        5748
jazz           2306
french         1952
folk           1850
funk           1763
               ... 
iranian          45
grunge           43
romance          43
alternative      42
songwriter       20
Name: count, Length: 115, dtype: int64

In [21]:
# Keep only the artists that have a top genre.
has_genre = spotify_df['ft_track_genre_top'].notna()
spotify_df = spotify_df.loc[has_genre].copy()
spotify_df.shape

(77009, 73)

In [23]:
# One-hot encode the top genre into integer `genre_*` indicator columns and
# put them in place of the original categorical column.
genre_dummies = pd.get_dummies(spotify_df['ft_track_genre_top'], prefix='genre').astype(int)
features_without_genre = spotify_df.drop(columns=['ft_track_genre_top'])
spotify_df = pd.concat([features_without_genre, genre_dummies], axis=1)
spotify_df.head()
spotify_df.shape

Unnamed: 0,artist_name,ft_duration_ms_mean,ft_duration_ms_median,ft_duration_ms_std,ft_duration_ms_min,ft_duration_ms_max,ft_danceability_mean,ft_danceability_median,ft_danceability_std,ft_danceability_min,...,genre_spanish,genre_study,genre_swedish,genre_synth-pop,genre_tango,genre_techno,genre_trance,genre_trip-hop,genre_turkish,genre_world-music
0,Gen Hoshino,230111.78022,230666.0,41289.983216,121513,324000,0.64733,0.654,0.122617,0.352,...,0,0,0,0,0,0,0,0,0,0
1,Ben Woodward,197994.026316,198436.5,39352.33714,121783,376573,0.599289,0.6115,0.133858,0.276,...,0,0,0,0,0,0,0,0,0,0
2,Ingrid Michaelson;ZAYN,210826.0,210826.0,,210826,210826,0.438,0.438,,0.438,...,0,0,0,0,0,0,0,0,0,0
3,Kina Grannis,197084.527586,202467.5,38929.179067,72760,311613,0.544652,0.5555,0.128464,0.24,...,0,0,0,0,0,0,0,0,0,0
4,Chord Overstreet,214753.6,204211.0,30077.400932,156000,281654,0.584844,0.59,0.100533,0.3,...,0,0,0,0,0,0,0,0,0,0


(77009, 186)

In [24]:
# List every column name, one per line.
print(*spotify_df.columns, sep="\n")

artist_name
ft_duration_ms_mean
ft_duration_ms_median
ft_duration_ms_std
ft_duration_ms_min
ft_duration_ms_max
ft_danceability_mean
ft_danceability_median
ft_danceability_std
ft_danceability_min
ft_danceability_max
ft_energy_mean
ft_energy_median
ft_energy_std
ft_energy_min
ft_energy_max
ft_key_mean
ft_key_median
ft_key_std
ft_key_min
ft_key_max
ft_loudness_mean
ft_loudness_median
ft_loudness_std
ft_loudness_min
ft_loudness_max
ft_mode_mean
ft_mode_median
ft_mode_std
ft_mode_min
ft_mode_max
ft_speechiness_mean
ft_speechiness_median
ft_speechiness_std
ft_speechiness_min
ft_speechiness_max
ft_acousticness_mean
ft_acousticness_median
ft_acousticness_std
ft_acousticness_min
ft_acousticness_max
ft_instrumentalness_mean
ft_instrumentalness_median
ft_instrumentalness_std
ft_instrumentalness_min
ft_instrumentalness_max
ft_liveness_mean
ft_liveness_median
ft_liveness_std
ft_liveness_min
ft_liveness_max
ft_valence_mean
ft_valence_median
ft_valence_std
ft_valence_min
ft_valence_max
ft_tempo_mean


In [25]:
# Spot-check the feature row of one well-known artist.
is_example_artist = spotify_df['artist_name'] == "Taylor Swift"
spotify_df.loc[is_example_artist].iloc[0]

artist_name               Taylor Swift
ft_duration_ms_mean      243165.794118
ft_duration_ms_median         236339.5
ft_duration_ms_std        51439.134933
ft_duration_ms_min               83253
                             ...      
genre_techno                         0
genre_trance                         0
genre_trip-hop                       0
genre_turkish                        0
genre_world-music                    0
Name: 23929, Length: 186, dtype: object

In [26]:
# Drop the new categorical column ft_track_genre_top_5_combined for now.
# TODO: feed it to the model through an embedding instead of discarding it.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been removed.
spotify_df.drop(columns=['ft_track_genre_top_5_combined'], errors='ignore', inplace=True)

In [32]:
# Artists in the interactions must have Spotify features: compare the
# interactions that satisfy this with the full interaction table.
has_features = user_df['global_id'].isin(spotify_df['global_id'])
user_df[has_features].shape
user_df.shape

(77469, 4)

(92834, 4)

In [35]:
# Not many users are lost when requiring Spotify features: distinct users
# after the filter versus distinct users overall.
has_features = user_df['global_id'].isin(spotify_df['global_id'])
user_df.loc[has_features, 'userID'].nunique()
user_df['userID'].nunique()

1883

1892

In [36]:
# Number of distinct artists (global IDs) in the interaction table.
user_df['global_id'].nunique()

17632

In [37]:
# Distinct users and artists among interactions with weight above 100.
heavy_interactions = user_df[user_df['weight'] > 100]
heavy_interactions.agg({'userID': 'nunique', 'global_id': 'nunique'})

userID        1799
global_id    13438
dtype: int64

In [38]:
# Distinct users and artists across all interactions.
user_df.agg({'userID':'nunique','global_id':'nunique'})

userID        1892
global_id    17632
dtype: int64

In [39]:
# Keep only the interactions whose artist has Spotify features.
has_features = user_df['global_id'].isin(spotify_df['global_id'])
user_df = user_df.loc[has_features].copy()

In [44]:
# Model features are all columns except the identifier and bookkeeping ones.
non_feature_cols = ['artist_name', 'global_id', 'in_interactions']
feature_list = [name for name in spotify_df.columns if name not in non_feature_cols]
print(*feature_list, sep="\n")

ft_duration_ms_mean
ft_duration_ms_median
ft_duration_ms_std
ft_duration_ms_min
ft_duration_ms_max
ft_danceability_mean
ft_danceability_median
ft_danceability_std
ft_danceability_min
ft_danceability_max
ft_energy_mean
ft_energy_median
ft_energy_std
ft_energy_min
ft_energy_max
ft_key_mean
ft_key_median
ft_key_std
ft_key_min
ft_key_max
ft_loudness_mean
ft_loudness_median
ft_loudness_std
ft_loudness_min
ft_loudness_max
ft_mode_mean
ft_mode_median
ft_mode_std
ft_mode_min
ft_mode_max
ft_speechiness_mean
ft_speechiness_median
ft_speechiness_std
ft_speechiness_min
ft_speechiness_max
ft_acousticness_mean
ft_acousticness_median
ft_acousticness_std
ft_acousticness_min
ft_acousticness_max
ft_instrumentalness_mean
ft_instrumentalness_median
ft_instrumentalness_std
ft_instrumentalness_min
ft_instrumentalness_max
ft_liveness_mean
ft_liveness_median
ft_liveness_std
ft_liveness_min
ft_liveness_max
ft_valence_mean
ft_valence_median
ft_valence_std
ft_valence_min
ft_valence_max
ft_tempo_mean
ft_tempo_med

In [154]:
# Summary (row count, column count, dtypes) of the model feature columns.
# NOTE(review): the saved output below has execution count 154 and 14652 rows,
# which does not match the 77009-row frame shown in other cells; re-run to refresh.
spotify_df[feature_list].info()

<class 'pandas.core.frame.DataFrame'>
Index: 14652 entries, 1 to 329380
Columns: 176 entries, duration_ms_mean to genre_world-music
dtypes: float64(57), int64(119)
memory usage: 19.8 MB


# Build Matrix

In [45]:
# Re-import the module before pulling DatasetBuilder from it, so the class
# reflects the module's current source rather than a previously loaded version.
import importlib
import recommender_pipeline.matrix_builder
importlib.reload(recommender_pipeline.matrix_builder)
from recommender_pipeline.matrix_builder import DatasetBuilder



<module 'recommender_pipeline.matrix_builder' from '/Users/cherrylchico/Desktop/BSE/Computing for Data Science/musician-recommender/src/recommender_pipeline/matrix_builder.py'>

In [46]:
# Inputs for the builder: (user, artist, weight) triples and the per-artist
# feature columns keyed by global_id.
interactions_frame = user_df[['userID', 'global_id', 'weight']]
item_features_frame = spotify_df[['global_id'] + feature_list]

builder = DatasetBuilder(item_identity_features=False)

# fit() creates the builder's internal index.
builder.fit(interactions_df=interactions_frame, item_features_df=item_features_frame)

# Build the matrices, with feature normalization switched on.
interactions, weights, user_features, item_features = builder.build_matrices(
    normalize_features=True
)

In [47]:
# Shapes of the interaction, weight and item-feature matrices just built.
interactions.shape
weights.shape
item_features.shape

(1883, 77009)

(1883, 77009)

(77009, 183)

In [48]:
# Persist the three sparse matrices as .npz files.
import scipy.sparse

npz_targets = {
    "../data/interactions.npz": interactions,
    "../data/weights.npz": weights,
    "../data/item_features.npz": item_features,
}
for npz_path, matrix in npz_targets.items():
    scipy.sparse.save_npz(npz_path, matrix)

# Model

In [49]:
# Read the sparse matrices back from their .npz files.
import scipy.sparse

interactions, weights, item_features = (
    scipy.sparse.load_npz(npz_path)
    for npz_path in (
        "../data/interactions.npz",
        "../data/weights.npz",
        "../data/item_features.npz",
    )
)

In [50]:
# Shapes of the matrices as loaded from disk.
interactions.shape
weights.shape
item_features.shape

(1883, 77009)

(1883, 77009)

(77009, 183)

In [51]:
# Re-import the module before pulling random_train_test_split from it, so the
# function reflects the module's current source.
import importlib
import recommender_pipeline.models.cross_validation 
importlib.reload(recommender_pipeline.models.cross_validation )
from recommender_pipeline.models.cross_validation  import random_train_test_split

<module 'recommender_pipeline.models.cross_validation' from '/Users/cherrylchico/Desktop/BSE/Computing for Data Science/musician-recommender/src/recommender_pipeline/models/cross_validation.py'>

In [52]:
# Seeded random split; 20% is requested for the test side
# (test_percentage=0.2).
random_state = 42
(
    train_interactions,
    test_interactions,
    train_weights,
    test_weights,
    train_item_features,
    test_item_features,
) = random_train_test_split(
    interactions,
    weights=weights,
    item_features=item_features,
    test_percentage=0.2,
    random_state=random_state,
)

In [53]:
# Re-import the module before pulling LightFMRecommender from it, so the class
# reflects the module's current source.
import importlib
import recommender_pipeline.models.lightfm_recommender
importlib.reload(recommender_pipeline.models.lightfm_recommender)
from recommender_pipeline.models.lightfm_recommender import LightFMRecommender

<module 'recommender_pipeline.models.lightfm_recommender' from '/Users/cherrylchico/Desktop/BSE/Computing for Data Science/musician-recommender/src/recommender_pipeline/models/lightfm_recommender.py'>

In [None]:
# Fit our custom LightFMRecommender wrapper.
# Observed ranking of loss functions by metrics: warp > bpr > logistic.
# The model seed reuses `random_state` from the train/test split cell instead
# of repeating the literal 42, so there is a single reproducibility knob.
lightfm_params = {'no_components': 30, 'loss': 'warp', 'random_state': random_state}

model = LightFMRecommender(lightfm_params=lightfm_params, epochs=20)
# NOTE(review): the split also returns train_item_features, but the model is
# fitted on the full item_features, which is also what the evaluation cells
# pass. Confirm this is intended.
model.fit(train_interactions, item_features=item_features, sample_weight=train_weights)

0,1,2
,lightfm_params,"{'epsilon': 1e-06, 'item_alpha': 0.0, 'k': 5, 'learning_rate': 0.05, ...}"
,epochs,20
,num_threads,1
,model_path,'lightfm_basemodel.pkl'
,artist_mapping,


In [55]:
# Evaluate using LightFM's built-in metrics, not yet our custom evaluator
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

In [56]:
# Precision@k averaged over the scores returned by LightFM's precision_at_k.
# model.model is the underlying LightFM model held by our wrapper.
k = 20
precision_scores = precision_at_k(
    model.model,
    test_interactions,
    train_interactions=train_interactions,
    item_features=item_features,
    k=k,
)
prec = precision_scores.mean()
prec

np.float32(0.0057012844)

In [57]:
# Recall@k, evaluated with the same item_features that were used for training.
# model.model is the underlying LightFM model held by our wrapper.
recall_scores = recall_at_k(
    model.model,
    test_interactions,
    train_interactions=train_interactions,
    item_features=item_features,
    k=k,
)
rec = recall_scores.mean()
rec

np.float64(0.013200191503504264)

In [58]:
# Mean AUC, evaluated with the same item_features that were used for training.
# model.model is the underlying LightFM model held by our wrapper.
auc_scores = auc_score(
    model.model,
    test_interactions,
    train_interactions=train_interactions,
    item_features=item_features,
)
auc = auc_scores.mean()
auc

np.float32(0.90130866)