In [1]:
from recommender_pipeline.artist_match.generate_matching_files import generate_unified_artist_mappings
import pandas as pd

In [2]:
# Load the Last.fm artist catalogue (tab-separated) and keep only the artist
# id and name, renamed to the column names the matching step expects.
# `id` is read as a regular column, so no index_col / reset_index round trip
# is needed to get it back as a column.
artist_df = (
    pd.read_csv('../data/artists.dat', sep='\t')
    .rename(columns={'id': 'lastfm_id', 'name': 'artist_name'})
    .loc[:, ['lastfm_id', 'artist_name']]
)
display(artist_df.head())
print(artist_df.shape)
print(f"Number of unique artist names: {artist_df['artist_name'].nunique()}")

Unnamed: 0,lastfm_id,artist_name
0,1,MALICE MIZER
1,2,Diary of Dreams
2,3,Carpathian Forest
3,4,Moi dix Mois
4,5,Bella Morte


(17632, 2)
Number of unique artist names: 17632


In [3]:
# Load the Spotify data; each row is one track, with the artist name(s)
# in the `artist_name` column.
spotify_df = pd.read_parquet('../data/spotify_musics.parquet')
display(spotify_df.head())
print(spotify_df.shape)
n_spotify_artist_names = spotify_df['artist_name'].nunique()
print(f"Number of unique artist names: {n_spotify_artist_names}")

Unnamed: 0,artist_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,Comedy,73.0,230666,0.0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,Ben Woodward,Ghost - Acoustic,55.0,149610,0.0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,Ingrid Michaelson;ZAYN,To Begin Again,57.0,210826,0.0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,Kina Grannis,Can't Help Falling In Love,71.0,201933,0.0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,Chord Overstreet,Hold On,82.0,198853,0.0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


(3528311, 18)
Number of unique artist names: 329401


In [4]:
# Build the unified artist mappings for the Last.fm and Spotify data.
# Column names and matcher settings are grouped here so they are easy to tune.
matching_options = {
    "lastfm_artist_name_col": "artist_name",
    "lastfm_artist_id_col": "lastfm_id",
    "spotify_artist_name_col": "artist_name",
    "score_cutoff": 85,
    "scorer": "ratio",
}
lastfm_output, spotify_output = generate_unified_artist_mappings(
    lastfm_artists_df=artist_df,
    spotify_tracks_df=spotify_df,  # track-level data (one row per track)
    **matching_options,
)


Step 1: Extracting unique Spotify artists from track data...
  Total Spotify tracks: 3528311
  Unique Spotify artists: 329401
  LastFM artists: 17632

Step 2: Matching Spotify artists to LastFM artists...
Building lookup tables for 17632 artists...
  Created 6261 blocking keys
✓ Lookup tables ready

Matching 329401 artists...
  Step 1: Exact matching...
    ✓ Found 12669 exact matches
  Step 2: Fuzzy matching 316732 unmatched artists (parallel)...
    ✓ Found 10681 fuzzy matches

  Matching results:
    Matched artists: 13680
    Unmatched Spotify artists: 315721

Step 3: Creating matched artists dataframe...
  Available columns in matched_df: ['artist_name', 'right_lastfm_id', 'right_artist_name', 'right_matched_artist', 'right_match_score']
  Matched pairs: 13680

Step 4: Creating unified ID mappings...

  Mapping Summary:
    Total unified IDs: 333353
    LastFM artists: 17632
    Spotify artists: 329401
    Matched artists: 13680
    Unmatched LastFM: 3952
    Unmatched Spotify: 31

In [5]:
# Column check: the mapping step returns the Spotify track columns plus `unified_artist_id`.
spotify_output.columns

Index(['artist_name', 'track_name', 'popularity', 'duration_ms', 'explicit',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'track_genre', 'unified_artist_id'],
      dtype='object')

In [6]:
# Preview the first Spotify track rows together with their assigned `unified_artist_id`.
spotify_output.head()

Unnamed: 0,artist_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,unified_artist_id
0,Gen Hoshino,Comedy,73.0,230666,0.0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,18746
1,Ben Woodward,Ghost - Acoustic,55.0,149610,0.0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,18747
2,Ingrid Michaelson;ZAYN,To Begin Again,57.0,210826,0.0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,18748
3,Kina Grannis,Can't Help Falling In Love,71.0,201933,0.0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,18749
4,Chord Overstreet,Hold On,82.0,198853,0.0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,18750


In [9]:
# Left join on the unified ID: every Last.fm artist is kept, and a matched
# artist contributes one row per Spotify track.
combined_df = pd.merge(
    lastfm_output,
    spotify_output,
    how='left',
    on='unified_artist_id',
)

In [10]:
# Preview the merged frame; Last.fm artists without a Spotify match show NaN in the Spotify columns.
combined_df.head()

Unnamed: 0,lastfm_artist_name,lastfm_artist_id,unified_artist_id,artist_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,MALICE MIZER,1,1,,,,,,,,...,,,,,,,,,,
1,Diary of Dreams,2,2,Diary Of Dreams,Giftraum,14.0,216440.0,,0.727,0.528,...,-7.358,0.0,0.0345,0.529,3.3e-05,0.101,0.139,134.069,4.0,industrial
2,Diary of Dreams,2,2,Diary Of Dreams,Undividable - Dcii E-Mix Edit,8.0,270707.0,,0.688,0.95,...,-6.571,0.0,0.0282,0.0322,0.217,0.107,0.831,126.054,4.0,industrial
3,Diary of Dreams,2,2,Diary Of Dreams,The Luxury of Insanity,19.0,357413.0,,0.627,0.896,...,-6.739,0.0,0.0353,0.18,0.746,0.146,0.366,120.018,4.0,industrial
4,Diary of Dreams,2,2,Diary Of Dreams,A Day in December,16.0,252360.0,,0.596,0.926,...,-4.927,0.0,0.0495,0.315,0.000521,0.13,0.32,74.986,4.0,industrial


In [13]:
# After the left merge, rows with a null Spotify `artist_name` are Last.fm
# artists that found no Spotify match.
n_rows_without_spotify = combined_df['artist_name'].isna().sum()
n_lastfm_artist_names = combined_df['lastfm_artist_name'].nunique()
n_rows_without_spotify, n_lastfm_artist_names

(np.int64(3952), 17632)

3,952 of the 17,632 Last.fm artists had no match in the Spotify dataset.