In [1]:
import pickle

import pandas as pd

from recommender_pipeline.artist_match.artist_match import ArtistMatcher
from recommender_pipeline.artist_match.id_mapper import ArtistIDMapper

In [2]:
# Read the tab-separated artists.dat file, using its `id` column as the index.
artist_df = pd.read_csv('../data/artists.dat', sep='\t', index_col='id')

# Sanity checks on the load: preview rows, dimensions, distinct-name count.
display(artist_df.head())
print(artist_df.shape)
print(f"Number of unique artist names: {artist_df['name'].nunique()}")

Unnamed: 0_level_0,name,url,pictureURL
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
4,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
5,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...


(17632, 3)
Number of unique artist names: 17632


In [3]:
# Read the Spotify track table from parquet.
spotify_df = pd.read_parquet('../data/spotify_musics.parquet')

# Sanity checks on the load: preview rows, dimensions, distinct artist-name count.
display(spotify_df.head())
print(spotify_df.shape)
print(f"Number of unique artist names: {spotify_df['artist_name'].nunique()}")

Unnamed: 0,artist_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,Comedy,73.0,230666,0.0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,Ben Woodward,Ghost - Acoustic,55.0,149610,0.0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,Ingrid Michaelson;ZAYN,To Begin Again,57.0,210826,0.0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,Kina Grannis,Can't Help Falling In Love,71.0,201933,0.0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,Chord Overstreet,Hold On,82.0,198853,0.0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


(3528311, 18)
Number of unique artist names: 329401


In [4]:
# Build the matcher over the Spotify `artist_name` column.
# NOTE(review): score_cutoff=80 is presumably the minimum match score a
# candidate must reach to be kept — confirm against ArtistMatcher.
matcher = ArtistMatcher(
    spotify_df[['artist_name']],
    "artist_name",
    score_cutoff=80,
)

In [5]:
# Match artists.dat names against the Spotify names held by `matcher`.
# The frame passed in has two columns: `artistID` (the former `id` index)
# and `name`.
matched_artists = matcher.match(
    artist_df[['name']].reset_index().rename(columns={'id': 'artistID'}),
    "name",
    right_prefix="spotify_",
    keep_unmatched=False,
    one_to_many=False,
)

In [6]:
# Summarise the match result: column names, non-null counts and dtypes.
matched_artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14652 entries, 0 to 14651
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   artistID                14652 non-null  int64 
 1   name                    14652 non-null  object
 2   spotify_artist_name     14652 non-null  object
 3   spotify_matched_artist  14652 non-null  object
 4   spotify_match_score     14652 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 572.5+ KB


In [7]:
# Share of artists.dat rows present in the match result.
# The value is a fraction in [0, 1], not a percentage.
len(matched_artists) / len(artist_df)

0.830989110707804

In [8]:
# Persist the match result (without the index) so it can be re-read from disk
# instead of re-running the matcher.
matched_artists.to_parquet('../data/matched_artists.parquet', index=False)

In [4]:
# Reload the saved match result from disk.
matched_artists = pd.read_parquet('../data/matched_artists.parquet')

# Distinct Spotify artist names, with the column renamed to `spotify_artist_name`.
spotify_artists = (
    spotify_df[['artist_name']]
    .drop_duplicates()
    .rename(columns={'artist_name': 'spotify_artist_name'})
)

# artists.dat names with the `id` index exposed as an `artistID` column.
primary_artists = (
    artist_df[['name']]
    .reset_index()
    .rename(columns={'id': 'artistID'})
)

In [6]:
# Fit the ID mapper. The primary side (artists.dat) is keyed by `artistID`,
# the secondary side (Spotify) by `spotify_artist_name`; the matched frame
# links the two through those same columns.
id_mapper = ArtistIDMapper()
artist_global_map = id_mapper.fit(
    # primary side
    primary_artists_df=primary_artists,
    primary_id_col='artistID',
    # secondary side
    secondary_artists_df=spotify_artists,
    secondary_id_col='spotify_artist_name',
    # links between the two sides
    matched_artists_df=matched_artists[['artistID', 'name', 'spotify_artist_name']],
    matched_primary_col='artistID',
    matched_secondary_col='spotify_artist_name',
)

In [11]:
# Unpack the two objects returned by export_mappings(). Later cells read a
# 'unified_id' column from each (presumably both are DataFrames — confirm
# against ArtistIDMapper).
primary_df, secondary_df = artist_global_map.export_mappings()

In [19]:
# Look up the unified ID for a Spotify-side artist name. The secondary side
# was configured in fit() with 'spotify_artist_name' as its ID column.
artist_global_map.get_unified_id('Diary Of Dreams', "secondary")

1

In [20]:
# NOTE(review): no output was recorded for this call. The primary side was
# configured in fit() with 'artistID' as its ID column, so a name string is
# probably not a valid primary key and the lookup likely returns None —
# presumably the numeric artistID should be passed here; confirm.
artist_global_map.get_unified_id('Diary Of Dreams', "primary")

In [24]:
# Rows of primary_df whose unified_id also appears in secondary_df.
primary_df.loc[primary_df['unified_id'].isin(secondary_df['unified_id'])]

Unnamed: 0,primary_id,unified_id
0,2,1
1,3,2
2,5,3
3,6,4
4,7,5
...,...,...
14647,18740,14648
14648,18741,14649
14649,18742,14650
14650,18743,14651


In [25]:
# Persist the fitted artist ID mapping for later use.
# `pickle` is imported with the other dependencies in the notebook's first cell
# rather than here, so every dependency is visible in one place.
# NOTE: unpickling can execute arbitrary code — only load this file from a
# trusted location.
with open('../data/artist_global_map.pkl', 'wb') as f:
    pickle.dump(artist_global_map, f)

print("Artist global map saved successfully!")

Artist global map saved successfully!
