In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)



In [2]:
user_data = pd.read_table('./usersha1-artmbid-artname-plays_1.tsv',
                          header = None, nrows = 2e7,
                          names = ['users', 'musicbrainz-artist-id', 'artist-name', 'plays'],
                          usecols = ['users', 'artist-name', 'plays'])
user_profiles = pd.read_table('./usersha1-profile.tsv',
                          header = None,
                          names = ['users', 'gender', 'age', 'country', 'signup'],
                          usecols = ['users', 'country'])

In [3]:
user_data.head()

Unnamed: 0,users,artist-name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [4]:
user_profiles.head()

Unnamed: 0,users,country
0,00000c289a1829a808ac09c00daf10bc3c4e223b,Germany
1,00001411dc427966b17297bf4d69e7e193135d89,Canada
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,Germany
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,Mexico
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,United States


In [5]:
user_data[user_data['artist-name'].isnull()]

Unnamed: 0,users,artist-name,plays
244853,039e5d61d65bbf5e6d95b07b1b3b67f7fd287a62,,18


In [6]:
if user_data['artist-name'].isnull().sum() > 0:
    user_data = user_data.dropna(axis = 0, subset = ['artist-name'])

In [7]:
user_data[user_data['artist-name'].isnull()]

Unnamed: 0,users,artist-name,plays


In [8]:
user_data.groupby(by = ['artist-name']).sum()

Unnamed: 0_level_0,plays
artist-name,Unnamed: 1_level_1
!!!,6708
!deladap,193
!distain,66
#####,5681
#1,155
...,...
휘성,883
Ｄ≒ＳＩＲＥ,25
Ｓｃｈｗａｒｚ　Ｓｔｅｉｎ,217
ｃｈａｒｉｏｔｓ,17


In [9]:
artist_plays = (user_data.groupby(by = ['artist-name'])['plays'].sum().reset_index().rename(columns = {'plays': 'total_artist_plays'})[['artist-name', 'total_artist_plays']])
artist_plays.head()

Unnamed: 0,artist-name,total_artist_plays
0,!!!,6708
1,!deladap,193
2,!distain,66
3,#####,5681
4,#1,155


In [10]:
user_data_with_artist_plays = user_data.merge(artist_plays, left_on = 'artist-name', right_on = 'artist-name', how = 'left')
user_data_with_artist_plays.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,2287
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,58941
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,2498
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,6346
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,3156


In [11]:
user_data_with_artist_plays[user_data_with_artist_plays["artist-name"]=="betty blowtorch"]

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,2287
95417,0159d9ae1c8d76ab7fe3aa868a38e1da56cc06ab,betty blowtorch,42,2287
173825,0285490bb23814cf0dc18a5668261e243572d4eb,betty blowtorch,41,2287
259867,03d2e7186a6c29f281f6ff8b9bb2f7503822e7de,betty blowtorch,67,2287


In [12]:
artist_plays['total_artist_plays'].describe()

count    41054.000
mean      1409.311
std       7710.481
min          1.000
25%         48.000
50%        169.000
75%        594.000
max     490322.000
Name: total_artist_plays, dtype: float64

In [13]:
artist_plays['total_artist_plays'].max()

490322

In [14]:
artist_plays[artist_plays['total_artist_plays']==artist_plays['total_artist_plays'].max()]

Unnamed: 0,artist-name,total_artist_plays
34751,the beatles,490322


In [15]:
artist_plays['total_artist_plays'].quantile(np.arange(.9, 1, .01)), 

(0.900    2136.000
 0.910    2430.230
 0.920    2780.760
 0.930    3258.000
 0.940    3880.820
 0.950    4808.400
 0.960    6218.880
 0.970    8506.820
 0.980   12871.880
 0.990   23978.030
 Name: total_artist_plays, dtype: float64,)

In [16]:
artist_plays['total_artist_plays'].value_counts()

2        495
1        421
3        364
4        356
9        282
        ... 
24004      1
2137       1
3132       1
2179       1
4895       1
Name: total_artist_plays, Length: 5305, dtype: int64

# Taking only top artist (where no of plays are more)


In [17]:
popularity_threshold = 40000
user_data_popular_artists = user_data_with_artist_plays.query('total_artist_plays >= @popularity_threshold')
user_data_popular_artists.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,58941
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,195302
12,00000c289a1829a808ac09c00daf10bc3c4e223b,goldfrapp,361,41364
17,00000c289a1829a808ac09c00daf10bc3c4e223b,dropkick murphys,302,68292
24,00000c289a1829a808ac09c00daf10bc3c4e223b,jack johnson,227,103385


In [18]:
user_data_with_artist_plays.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,2287
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,58941
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,2498
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,6346
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,3156


In [19]:
user_data_popular_artists.head(100)

Unnamed: 0,users,artist-name,plays,total_artist_plays
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,58941
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,195302
12,00000c289a1829a808ac09c00daf10bc3c4e223b,goldfrapp,361,41364
17,00000c289a1829a808ac09c00daf10bc3c4e223b,dropkick murphys,302,68292
24,00000c289a1829a808ac09c00daf10bc3c4e223b,jack johnson,227,103385
...,...,...,...,...
455,0001399387da41d557219578fb08b12afa25ab67,sum 41,391,47913
459,0001399387da41d557219578fb08b12afa25ab67,the offspring,256,89957
461,0001399387da41d557219578fb08b12afa25ab67,linkin park,212,256137
464,0001399387da41d557219578fb08b12afa25ab67,the prodigy,139,91017


In [20]:
user_data_with_artist_plays.query('plays>40000')

Unnamed: 0,users,artist-name,plays,total_artist_plays
27358,006261139d787c1e43b4c69d304f2772367c1005,garbage,62054,103824
43276,00a20b9791abd8b29903a8a43e343ae93a98d9fd,lil wayne,107758,159731
166489,0268c4ff8eba994c93fc0e49644bac7b49caa068,mindless self indulgence,43251,77356
175680,028b91859a012251da23c3dbfd2215154a789f9f,afi,59169,111718
191656,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,christina aguilera,135392,166373
207045,030b70e6467476cd8b57933b04a07f0f45599d8c,hilary duff,56126,78315
224393,034e097d2768be93762fd26f445bc096d613770a,kiss,43381,75244


In [21]:
combined = user_data_popular_artists.merge(user_profiles, left_on = 'users', right_on = 'users', how = 'left')
usa_data = combined.query('country == \'United States\'')
usa_data.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays,country
30,00007a47085b9aab8af55f52ec8846ac479ac4fe,devendra banhart,456,43552,United States
31,00007a47085b9aab8af55f52ec8846ac479ac4fe,boards of canada,407,103576,United States
32,00007a47085b9aab8af55f52ec8846ac479ac4fe,cocorosie,386,46793,United States
33,00007a47085b9aab8af55f52ec8846ac479ac4fe,aphex twin,213,68866,United States
34,00007a47085b9aab8af55f52ec8846ac479ac4fe,animal collective,203,52983,United States


In [22]:
combined.query('plays>100000')

Unnamed: 0,users,artist-name,plays,total_artist_plays,country
9088,00a20b9791abd8b29903a8a43e343ae93a98d9fd,lil wayne,107758,159731,United States
42264,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,christina aguilera,135392,166373,Brazil


In [23]:
initial_rows = usa_data.shape[0]
print('Initial dataframe shape {0}'.format(usa_data.shape))
usa_data = usa_data.drop_duplicates(['users', 'artist-name'])
current_rows = usa_data.shape[0]
print('New dataframe shape {0}'.format(usa_data.shape))
print('Removed {0} rows'.format(initial_rows - current_rows))

Initial dataframe shape (11531, 5)
New dataframe shape (11531, 5)
Removed 0 rows


# Reshape data into sparse matrix

In [24]:
wide_artist_data = usa_data.pivot(index = 'artist-name', columns = 'users', values = 'plays').fillna(0)
wide_artist_data_sparse = csr_matrix(wide_artist_data.values)

In [25]:
wide_artist_data.head()

users,00007a47085b9aab8af55f52ec8846ac479ac4fe,0001a57568309b287363e72dc682e9a170ba6dc2,00024b5b85c40f990c28644d53257819980bf6bb,0002dd2154072434d26e5409faa591bfb260a01e,00032c7933e0eb05f2258f1147ef81a90f2d4d6c,00041cbfdd019b5431f926133266cc4ba38219bb,000429493d9716b66b02180d208d09b5b89fbe64,000701c3c006b091990162635b36b008c504c6a7,000752c87a61bc4247f5219b4769c347c0062c8a,0008538a0f505f72fdd66af3c4c71aef8d3bdea4,...,03e7e30f10eede73d2948b1731c48c0cc73b62c9,03ebf1d5da56ffe6fc42c8d55199fd739b5ab821,03ed0412bc1221318af2384206b43fc4cbdc6ac5,03edc9fcb507a530ccef8569b42880167ed073dd,03edf68c90c26e85983c6778ca5bdf2c7259b838,03ee466989003abcc4624cdc5f20d113b44a85c2,03eef9de95b32fe1619e4c620e11274d0da7a768,03ef99a4502025ea1afde886776a0e1545f0bb78,03effe6a771d9002f4000d2d838e77ba8015e6cc,03f03dc3e059cf061e158341b256a13adfbff9bb
artist-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30 seconds to mars,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[unknown],0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a perfect circle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,56.0,0.0,0.0,0.0,0.0,0.0,27.0
ac/dc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aerosmith,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
wide_artist_data.shape

(206, 975)

In [27]:
wide_artist_data_sparse[1].data

array([  43.,  191.,   57.,  264.,  182.,    3.,  273.,   20.,   82.,
        454., 1013.,   98.,  624.,  390.,  106.,  295.,   83.,  682.,
         64.,  637.,   39.,   60.])

In [28]:
wide_artist_data_sparse

<206x975 sparse matrix of type '<class 'numpy.float64'>'
	with 11531 stored elements in Compressed Sparse Row format>

In [29]:
wide_artist_data_sparse.data

array([ 75.,  47., 253., ...,  35.,  19., 721.])

# Apply KNN model

In [30]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')


In [31]:
model_knn

In [32]:
model_knn.fit(wide_artist_data_sparse)

In [33]:
model_knn

In [75]:
query_index = np.random.choice(wide_artist_data.shape[0])


In [76]:
query_index

12

In [77]:
wide_artist_data.loc['the beatles']

users
00007a47085b9aab8af55f52ec8846ac479ac4fe     0.000
0001a57568309b287363e72dc682e9a170ba6dc2   167.000
00024b5b85c40f990c28644d53257819980bf6bb     0.000
0002dd2154072434d26e5409faa591bfb260a01e     0.000
00032c7933e0eb05f2258f1147ef81a90f2d4d6c    68.000
                                             ...  
03ee466989003abcc4624cdc5f20d113b44a85c2     0.000
03eef9de95b32fe1619e4c620e11274d0da7a768     0.000
03ef99a4502025ea1afde886776a0e1545f0bb78    54.000
03effe6a771d9002f4000d2d838e77ba8015e6cc    73.000
03f03dc3e059cf061e158341b256a13adfbff9bb     0.000
Name: the beatles, Length: 975, dtype: float64

In [37]:
wide_artist_data.shape

(206, 975)

In [40]:
wide_artist_data

users,00007a47085b9aab8af55f52ec8846ac479ac4fe,0001a57568309b287363e72dc682e9a170ba6dc2,00024b5b85c40f990c28644d53257819980bf6bb,0002dd2154072434d26e5409faa591bfb260a01e,00032c7933e0eb05f2258f1147ef81a90f2d4d6c,00041cbfdd019b5431f926133266cc4ba38219bb,000429493d9716b66b02180d208d09b5b89fbe64,000701c3c006b091990162635b36b008c504c6a7,000752c87a61bc4247f5219b4769c347c0062c8a,0008538a0f505f72fdd66af3c4c71aef8d3bdea4,...,03e7e30f10eede73d2948b1731c48c0cc73b62c9,03ebf1d5da56ffe6fc42c8d55199fd739b5ab821,03ed0412bc1221318af2384206b43fc4cbdc6ac5,03edc9fcb507a530ccef8569b42880167ed073dd,03edf68c90c26e85983c6778ca5bdf2c7259b838,03ee466989003abcc4624cdc5f20d113b44a85c2,03eef9de95b32fe1619e4c620e11274d0da7a768,03ef99a4502025ea1afde886776a0e1545f0bb78,03effe6a771d9002f4000d2d838e77ba8015e6cc,03f03dc3e059cf061e158341b256a13adfbff9bb
artist-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30 seconds to mars,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
[unknown],0.000,0.000,0.000,0.000,0.000,0.000,0.000,43.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
a perfect circle,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,56.000,0.000,0.000,0.000,0.000,0.000,27.000
ac/dc,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
aerosmith,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tom waits,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,235.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,776.000,0.000
tool,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,28.000
tori amos,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,110.000,0.000,0.000,0.000,0.000,0.000,0.000
u2,0.000,0.000,0.000,0.000,0.000,0.000,251.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [48]:
# wide_artist_data.iloc[205, :].values.reshape(1, -1)

In [80]:
query_index = 205

distances, indices = model_knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 3)
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(wide_artist_data.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data.index[indices.flatten()[i]], distances.flatten()[i]))
        
        

Recommendations for 梶浦由記:

1: air, with distance of 0.1259610941757816:
2: jamiroquai, with distance of 0.3844924248924104:


In [81]:
len(distances)

1

In [82]:
indices

array([[205,   6,  85]], dtype=int64)

In [83]:
query_index

205

In [53]:
def print_recommendations(query_index):
    distances, indices = model_knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('Recommendations with binary play data for {0}:\n'.format(wide_artist_data.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data.index[indices.flatten()[i]], distances.flatten()[i]))

In [54]:
inlist = wide_artist_data.index
#artist_name = 'the beatles'
#artist_name = 'see-saw'
#artist_name = 'pink floyd'
artist_name = 'michael jackson'

query_index = [x for x in range(len(inlist)) if inlist[x]==artist_name]
query_index

[117]

In [55]:
print_recommendations(query_index)

Recommendations with binary play data for Index(['michael jackson'], dtype='object', name='artist-name'):

1: feist, with distance of 0.22824191526655957:
2: lily allen, with distance of 0.27360705204723945:
3: amy winehouse, with distance of 0.33354015877181076:
4: norah jones, with distance of 0.397111553464255:
5: nelly furtado, with distance of 0.5821825810302546:


In [56]:
inlist = wide_artist_data.index

In [57]:
inlist

Index(['30 seconds to mars', '[unknown]', 'a perfect circle', 'ac/dc',
       'aerosmith', 'afi', 'air', 'alice in chains', 'alkaline trio',
       'amon amarth',
       ...
       'the smiths', 'the strokes', 'the white stripes', 'the who',
       'thievery corporation', 'tom waits', 'tool', 'tori amos', 'u2', '梶浦由記'],
      dtype='object', name='artist-name', length=206)

# make all plays count binary

In [58]:
wide_artist_data_zero_one = wide_artist_data.apply(np.sign)
wide_artist_data_zero_one_sparse = csr_matrix(wide_artist_data_zero_one.values)

In [59]:
wide_artist_data_zero_one.head()

users,00007a47085b9aab8af55f52ec8846ac479ac4fe,0001a57568309b287363e72dc682e9a170ba6dc2,00024b5b85c40f990c28644d53257819980bf6bb,0002dd2154072434d26e5409faa591bfb260a01e,00032c7933e0eb05f2258f1147ef81a90f2d4d6c,00041cbfdd019b5431f926133266cc4ba38219bb,000429493d9716b66b02180d208d09b5b89fbe64,000701c3c006b091990162635b36b008c504c6a7,000752c87a61bc4247f5219b4769c347c0062c8a,0008538a0f505f72fdd66af3c4c71aef8d3bdea4,...,03e7e30f10eede73d2948b1731c48c0cc73b62c9,03ebf1d5da56ffe6fc42c8d55199fd739b5ab821,03ed0412bc1221318af2384206b43fc4cbdc6ac5,03edc9fcb507a530ccef8569b42880167ed073dd,03edf68c90c26e85983c6778ca5bdf2c7259b838,03ee466989003abcc4624cdc5f20d113b44a85c2,03eef9de95b32fe1619e4c620e11274d0da7a768,03ef99a4502025ea1afde886776a0e1545f0bb78,03effe6a771d9002f4000d2d838e77ba8015e6cc,03f03dc3e059cf061e158341b256a13adfbff9bb
artist-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30 seconds to mars,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
[unknown],0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a perfect circle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
ac/dc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aerosmith,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
wide_artist_data_zero_one_sparse.data

array([1., 1., 1., ..., 1., 1., 1.])

In [61]:
wide_artist_data_zero_one_sparse[7].data

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1.])

# New KNN Model based on 0-1 values

In [62]:
from sklearn.neighbors import NearestNeighbors
model_nn_binary = NearestNeighbors(metric='cosine', algorithm='brute')
model_nn_binary.fit(wide_artist_data_zero_one_sparse)

In [64]:
wide_artist_data_zero_one.shape

(206, 975)

In [66]:
query_index = 200
distances, indices = model_nn_binary.kneighbors(wide_artist_data_zero_one.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations with binary play data for {0}:\n'.format(wide_artist_data_zero_one.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data_zero_one.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations with binary play data for thievery corporation:

1: massive attack, with distance of 0.5720395074890872:
2: röyksopp, with distance of 0.7076473268976571:
3: air, with distance of 0.7226499018873855:
4: portishead, with distance of 0.7666955869088815:
5: infected mushroom, with distance of 0.7851655377881701:


In [67]:
def print_recommendations(query_index):
    distances, indices = model_nn_binary.kneighbors(wide_artist_data_zero_one.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('Recommendations with binary play data for {0}:\n'.format(wide_artist_data_zero_one.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data_zero_one.index[indices.flatten()[i]], distances.flatten()[i]))

In [71]:
inlist = wide_artist_data.index
artist_name = 'the beatles'
# artist_name = 'see-saw'
#artist_name = 'pink floyd'
#artist_name = 'kenny g'
#artist_name = 'michael jackson'


query_index = [x for x in range(len(inlist)) if inlist[x]==artist_name]
query_index

[178]

In [72]:
print_recommendations(query_index)

Recommendations with binary play data for Index(['the beatles'], dtype='object', name='artist-name'):

1: bob dylan, with distance of 0.5285423309881541:
2: radiohead, with distance of 0.588202469795711:
3: david bowie, with distance of 0.5883668640049056:
4: the rolling stones, with distance of 0.5930428628242805:
5: the white stripes, with distance of 0.6224856082734943:


In [73]:
def print_recommendations_old(query_index):
    distances, indices = model_knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('Recommendations with binary play data for {0}:\n'.format(wide_artist_data.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data.index[indices.flatten()[i]], distances.flatten()[i]))

In [74]:
print_recommendations_old(query_index) 

Recommendations with binary play data for Index(['the beatles'], dtype='object', name='artist-name'):

1: the rolling stones, with distance of 0.4495904435822202:
2: the who, with distance of 0.5397975938991708:
3: radiohead, with distance of 0.8316611320250029:
4: bob dylan, with distance of 0.835397327551727:
5: led zeppelin, with distance of 0.8384220472928784:
