In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
user_data = pd.read_table('/cxldata/gle/usersha1-artmbid-artname-plays.tsv',
                          header = None, nrows = 2e7,
                          names = ['users', 'musicbrainz-artist-id', 'artist-name', 'plays'],
                          usecols = ['users', 'artist-name', 'plays'])
user_profiles = pd.read_table('/cxldata/gle/usersha1-profile.tsv',
                          header = None,
                          names = ['users', 'gender', 'age', 'country', 'signup'],
                          usecols = ['users', 'country'])

In [3]:
user_data.head()

Unnamed: 0,users,artist-name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [4]:
user_profiles.head()

Unnamed: 0,users,country
0,00000c289a1829a808ac09c00daf10bc3c4e223b,Germany
1,00001411dc427966b17297bf4d69e7e193135d89,Canada
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,Germany
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,Mexico
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,United States


In [5]:
user_data[user_data['artist-name'].isnull()]

Unnamed: 0,users,artist-name,plays
244853,039e5d61d65bbf5e6d95b07b1b3b67f7fd287a62,,18
431015,065a001be5a8a55971042077933e263d0d5cde46,,186
455721,06b17c50402d06a497cb13a0375992fd1e90b392,,3
504026,0757ac29973aab69bb31cd164c6df975bf4df9a1,,38
607282,08e102b376abe856a3d4be5ea14ad6b37395fe82,,208
638945,09587336c1c5ab850fa8216faea53cc4a03a2e0d,,2
686081,0a05c798f9979b0bb815f74284a17e8f1c6c2cb4,,36
1117649,103e539147a6952fd1769498138fab82dff47a35,,35
1155078,10d6b8b61f976e20334f279e3c6b79594b7e14d8,,9
1191415,11611e90a0ec78aee3f65dcc4b5bc586aa4c7124,,17


In [6]:
if user_data['artist-name'].isnull().sum() > 0:
    user_data = user_data.dropna(axis = 0, subset = ['artist-name'])

In [7]:
user_data[user_data['artist-name'].isnull()]

Unnamed: 0,users,artist-name,plays


In [9]:
artist_plays = (user_data.
     groupby(by = ['artist-name'])['plays'].
     sum().
     reset_index().
    rename(columns = {'plays': 'total_artist_plays'})
     [['artist-name', 'total_artist_plays']])
artist_plays.head()

Unnamed: 0,artist-name,total_artist_plays
0,04)],6
1,2,1606
2,58725ab=>,23
3,80lİ yillarin tÜrkÇe sÖzlÜ aŞk Şarkilari,70
4,amy winehouse,23


In [10]:
user_data_with_artist_plays = user_data.merge(artist_plays, left_on = 'artist-name', right_on = 'artist-name', how = 'left')
user_data_with_artist_plays.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,25651
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498


In [11]:
artist_plays['total_artist_plays'].describe()

count     292363.000
mean       12907.022
std       185981.631
min            1.000
25%           53.000
50%          208.000
75%         1048.000
max     30466827.000
Name: total_artist_plays, dtype: float64

In [11]:
artist_plays['total_artist_plays'].max()

30466827

In [12]:
artist_plays[artist_plays['total_artist_plays']==artist_plays['total_artist_plays'].max()]

Unnamed: 0,artist-name,total_artist_plays
252494,the beatles,30466827


In [12]:
artist_plays[artist_plays['total_artist_plays']==artist_plays['total_artist_plays'].max()]

Unnamed: 0,artist-name,total_artist_plays
252494,the beatles,30466827


In [13]:
artist_plays['total_artist_plays'].quantile(np.arange(.9, 1, .01)), 

(0.900     6137.800
 0.910     7409.420
 0.920     9102.040
 0.930    11474.660
 0.940    14898.000
 0.950    19964.500
 0.960    28420.120
 0.970    43541.420
 0.980    79403.560
 0.990   198483.660
 Name: total_artist_plays, dtype: float64,)

In [14]:
artist_plays['total_artist_plays'].value_counts()

1          2816
2          2724
3          2365
4          2211
5          2123
6          2036
7          1907
8          1874
9          1833
12         1825
10         1786
11         1663
16         1640
14         1638
13         1621
15         1591
20         1476
19         1474
18         1467
17         1448
22         1355
23         1330
24         1320
21         1305
25         1289
26         1265
28         1249
30         1243
29         1223
27         1205
           ... 
87972         1
73645         1
143307        1
296990        1
22660         1
18566         1
16519         1
8331          1
213149        1
458577        1
327441        1
646837        1
134607        1
443734        1
123081        1
40349         1
75178         1
1641903       1
144840        1
2244041       1
22148         1
655025        1
16009         1
7821          1
63120         1
52887         1
40605         1
36511         1
67246         1
21621         1
Name: total_artist_plays

# Taking only top artist (where no of plays are more)


In [22]:
popularity_threshold = 40000
user_data_popular_artists = user_data_with_artist_plays.query('total_artist_plays >= @popularity_threshold')
user_data_popular_artists.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,13547741


In [15]:
user_data_with_artist_plays.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,25651
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498


In [23]:
user_data_popular_artists.head(100)

Unnamed: 0,users,artist-name,plays,total_artist_plays
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,13547741
6,00000c289a1829a808ac09c00daf10bc3c4e223b,magica,545,40139
7,00000c289a1829a808ac09c00daf10bc3c4e223b,the black dahlia murder,507,1004743
8,00000c289a1829a808ac09c00daf10bc3c4e223b,the murmurs,424,58790
9,00000c289a1829a808ac09c00daf10bc3c4e223b,lunachicks,403,61171
10,00000c289a1829a808ac09c00daf10bc3c4e223b,walls of jericho,393,620899


In [20]:
user_data_with_artist_plays.query('plays>40000')

Unnamed: 0,users,artist-name,plays,total_artist_plays
27358,006261139d787c1e43b4c69d304f2772367c1005,garbage,62054,2461628
43276,00a20b9791abd8b29903a8a43e343ae93a98d9fd,lil wayne,107758,2432188
166489,0268c4ff8eba994c93fc0e49644bac7b49caa068,mindless self indulgence,43251,3172270
175680,028b91859a012251da23c3dbfd2215154a789f9f,afi,59169,3918876
191656,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,christina aguilera,135392,2680164
207045,030b70e6467476cd8b57933b04a07f0f45599d8c,hilary duff,56126,1111800
224393,034e097d2768be93762fd26f445bc096d613770a,kiss,43381,2113789
427293,0649b561f5bcc119525300890c51762f86db069a,super junior,48782,598528
494042,073689cd85d6f876b0b1123598c53194b2d21198,boa,50530,1311711
556796,082279c9db5330c25a4e0ceae275a9fc79c753c4,céline dion,86132,1172663


In [24]:
combined = user_data_popular_artists.merge(user_profiles, left_on = 'users', right_on = 'users', how = 'left')
usa_data = combined.query('country == \'United States\'')
usa_data.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays,country
156,00007a47085b9aab8af55f52ec8846ac479ac4fe,devendra banhart,456,2366807,United States
157,00007a47085b9aab8af55f52ec8846ac479ac4fe,boards of canada,407,6115545,United States
158,00007a47085b9aab8af55f52ec8846ac479ac4fe,cocorosie,386,2194862,United States
159,00007a47085b9aab8af55f52ec8846ac479ac4fe,aphex twin,213,4248296,United States
160,00007a47085b9aab8af55f52ec8846ac479ac4fe,animal collective,203,3495537,United States


In [25]:
combined.query('plays>100000')

Unnamed: 0,users,artist-name,plays,total_artist_plays,country
34568,00a20b9791abd8b29903a8a43e343ae93a98d9fd,lil wayne,107758,2432188,United States
155324,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,christina aguilera,135392,2680164,Brazil
542544,09d12dfa05a0852053a9017121034a837fa4019e,alice cooper,134993,1542185,United Kingdom
617078,0b2956b319a3ac466b0cf1a8c49fa73498d0898c,in flames,112989,11288367,Russian Federation
1159053,14ea4c6f3c2e86b4937f1158bd13d3173d780bd7,dean martin,288375,655025,United States
1298741,177653480857c3bb69b9a71b4f7166b7cd62129c,rush,100846,2518951,United States
1353498,1872585e74857e4888dfa63bd1186d210aae7681,tokio hotel,141661,952834,United States
1821162,20d54d757ff07da456dfaa26e9077f5fa12fe71a,marilyn manson,111455,6417868,Poland
1914297,228eb001a7ad5408dce7d40859e5935081518ff1,the rasmus,100080,1156417,Russian Federation
2170271,274f8ab91b73503c3a18cb5c230affa56e0a677d,u2,116025,8111215,France


In [27]:
initial_rows = usa_data.shape[0]
print('Initial dataframe shape {0}'.format(usa_data.shape))
usa_data = usa_data.drop_duplicates(['users', 'artist-name'])
current_rows = usa_data.shape[0]
print('New dataframe shape {0}'.format(usa_data.shape))
print('Removed {0} rows'.format(initial_rows - current_rows))

Initial dataframe shape (2788019, 5)
New dataframe shape (2788013, 5)
Removed 6 rows


# Reshape data into sparse matrix

In [28]:
wide_artist_data = usa_data.pivot(index = 'artist-name', columns = 'users', values = 'plays').fillna(0)
wide_artist_data_sparse = csr_matrix(wide_artist_data.values)

In [33]:
wide_artist_data.head()

users,00007a47085b9aab8af55f52ec8846ac479ac4fe,0001a57568309b287363e72dc682e9a170ba6dc2,00024b5b85c40f990c28644d53257819980bf6bb,0002dd2154072434d26e5409faa591bfb260a01e,00032c7933e0eb05f2258f1147ef81a90f2d4d6c,00041cbfdd019b5431f926133266cc4ba38219bb,000429493d9716b66b02180d208d09b5b89fbe64,000701c3c006b091990162635b36b008c504c6a7,000752c87a61bc4247f5219b4769c347c0062c8a,0008538a0f505f72fdd66af3c4c71aef8d3bdea4,...,fff58a5c95280b7af63f9c552f9159b58ae5efa3,fff694cf332ed701dccbf17f1d9595ba8ad69f22,fff69e7cb53568c732909648527a778c31befec8,fff820efe22db6c868515436de82af39e013b910,fff89b6b5332f0f38996f11c88f908a3924926fe,fff9dc65e7f2763a7e8bce8d99cc1491c2ae4c6f,fffa9294e858a7c863b5ad363c748c2330d9bd45,fffa9d62caff0f038c7a35db70f109b1bba04a1d,fffaf6f9a1a3ad8bd0dff7b48b2eb9eef030fdee,fffe8c7f952d9b960a56ed4dcb40a415d924b224
artist-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!!!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#####,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(+44),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(hed) planet earth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*nsync,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
wide_artist_data.shape

(9127, 66913)

In [43]:
wide_artist_data_sparse[1].data

array([ 305.,   31.,   66.,  251.,   21.,   47.,   27.,   32.])

In [35]:
wide_artist_data_sparse

<9127x66913 sparse matrix of type '<type 'numpy.float64'>'
	with 2788013 stored elements in Compressed Sparse Row format>

In [36]:
wide_artist_data_sparse.data

array([ 113.,  109.,  127., ...,  126.,  428.,  391.])

# Apply KNN model

In [46]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')


In [47]:
model_knn

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [48]:
model_knn.fit(wide_artist_data_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [25]:
model_knn

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [52]:
query_index = np.random.choice(wide_artist_data.shape[0])


In [53]:
query_index

8551

In [54]:
wide_artist_data.loc['the beatles']

users
00007a47085b9aab8af55f52ec8846ac479ac4fe      0.000
0001a57568309b287363e72dc682e9a170ba6dc2    167.000
00024b5b85c40f990c28644d53257819980bf6bb      0.000
0002dd2154072434d26e5409faa591bfb260a01e      0.000
00032c7933e0eb05f2258f1147ef81a90f2d4d6c     68.000
00041cbfdd019b5431f926133266cc4ba38219bb      0.000
000429493d9716b66b02180d208d09b5b89fbe64    807.000
000701c3c006b091990162635b36b008c504c6a7      0.000
000752c87a61bc4247f5219b4769c347c0062c8a    248.000
0008538a0f505f72fdd66af3c4c71aef8d3bdea4      0.000
0008b075deee53a3a090668c7ec581e15c3d8430    225.000
000912716c36131c4d8591da475c93337e7196a7      0.000
0009fbcb5120332beefdb12af5e60957688f6765      0.000
000c17602167c89588c25f40310d7c29e2275c22      0.000
000d109e4a25299eeef77a14d6b6a81479d1ac0e     82.000
000d878a6b2c852089e1a57698dc7f7df76cde6a    615.000
000d8c54934cc3a9eab276ccb412dbf52b980a44      0.000
000f5ca9514226b8b1589f57f02bbdc839bf8727      0.000
001169ca4be3a1ed81e2f510039f3dcd663313e8      0.000
0013a2

In [57]:
wide_artist_data.iloc[4000, :].reshape(1, -1)

  if __name__ == '__main__':


array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [67]:
query_index = 7618

distances, indices = model_knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(wide_artist_data.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data.index[indices.flatten()[i]], distances.flatten()[i]))
        
        

Recommendations for the beatles:

1: john lennon, with distance of 0.550101069694:
2: george harrison, with distance of 0.603719089391:
3: the rolling stones, with distance of 0.643655903479:
4: ringo starr, with distance of 0.673692060751:
5: led zeppelin, with distance of 0.674702665716:


In [60]:
distances

array([[ 0.        ,  0.61484033,  0.6557904 ,  0.67671898,  0.68284198,
         0.7227864 ]])

In [61]:
indices

array([[2893, 2724, 3704, 2905, 4972,  643]])

Recommendations for flesh field:

1: exilia, with distance of 0.614840326151:
2: indica, with distance of 0.65579039818:
3: flowing tears, with distance of 0.676718984191:
4: mankind is obsolete, with distance of 0.682841983433:
5: ashbury heights, with distance of 0.722786401013:


In [27]:
query_index

6675

In [28]:
distances

array([[ 0.        ,  0.6817363 ,  0.73388034,  0.73926189,  0.7485667 ,
         0.76342262]])

In [29]:
indices

array([[6675, 4856, 8748, 2798, 2840, 4600]])

In [77]:
def print_recommendations(query_index):
    distances, indices = model_knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('Recommendations with binary play data for {0}:\n'.format(wide_artist_data.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data.index[indices.flatten()[i]], distances.flatten()[i]))

In [97]:
inlist = wide_artist_data.index
#artist_name = 'the beatles'
#artist_name = 'see-saw'
#artist_name = 'pink floyd'
artist_name = 'michael jackson'

query_index = [x for x in range(len(inlist)) if inlist[x]==artist_name]
query_index

[5238]

In [98]:
print_recommendations(query_index)

Recommendations with binary play data for Index([u'michael jackson'], dtype='object', name=u'artist-name'):

1: janet jackson, with distance of 0.779977763442:
2: madonna, with distance of 0.804170677115:
3: mariah carey, with distance of 0.805428679137:
4: prince, with distance of 0.806056120552:
5: stevie wonder, with distance of 0.810470034036:


In [63]:
inlist = wide_artist_data.index

In [64]:
inlist

Index([u'!!!', u'#####', u'(+44)', u'(hed) planet earth', u'*nsync', u'*shels',
       u'+/-', u'+44', u'-m-', u'-oz-',
       ...
       u'동방신기', u'비', u'빅뱅', u'서태지', u'성시경', u'소녀시대', u'신화', u'이수영', u'이정현',
       u'이효리'],
      dtype='object', name=u'artist-name', length=9127)

# make all plays count binary

In [82]:
wide_artist_data_zero_one = wide_artist_data.apply(np.sign)
wide_artist_data_zero_one_sparse = csr_matrix(wide_artist_data_zero_one.values)

In [83]:
wide_artist_data_zero_one.head()

users,00007a47085b9aab8af55f52ec8846ac479ac4fe,0001a57568309b287363e72dc682e9a170ba6dc2,00024b5b85c40f990c28644d53257819980bf6bb,0002dd2154072434d26e5409faa591bfb260a01e,00032c7933e0eb05f2258f1147ef81a90f2d4d6c,00041cbfdd019b5431f926133266cc4ba38219bb,000429493d9716b66b02180d208d09b5b89fbe64,000701c3c006b091990162635b36b008c504c6a7,000752c87a61bc4247f5219b4769c347c0062c8a,0008538a0f505f72fdd66af3c4c71aef8d3bdea4,...,fff58a5c95280b7af63f9c552f9159b58ae5efa3,fff694cf332ed701dccbf17f1d9595ba8ad69f22,fff69e7cb53568c732909648527a778c31befec8,fff820efe22db6c868515436de82af39e013b910,fff89b6b5332f0f38996f11c88f908a3924926fe,fff9dc65e7f2763a7e8bce8d99cc1491c2ae4c6f,fffa9294e858a7c863b5ad363c748c2330d9bd45,fffa9d62caff0f038c7a35db70f109b1bba04a1d,fffaf6f9a1a3ad8bd0dff7b48b2eb9eef030fdee,fffe8c7f952d9b960a56ed4dcb40a415d924b224
artist-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!!!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#####,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(+44),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(hed) planet earth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*nsync,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
wide_artist_data_zero_one_sparse.data

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [85]:
wide_artist_data_zero_one_sparse[7].data

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1

# New KNN Model based on 0-1 values

In [86]:
from sklearn.neighbors import NearestNeighbors
model_nn_binary = NearestNeighbors(metric='cosine', algorithm='brute')
model_nn_binary.fit(wide_artist_data_zero_one_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [87]:
query_index = 4000
distances, indices = model_nn_binary.kneighbors(wide_artist_data_zero_one.iloc[query_index, :].reshape(1, -1), n_neighbors = 6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations with binary play data for {0}:\n'.format(wide_artist_data_zero_one.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data_zero_one.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations with binary play data for joe budden:

1: jadakiss, with distance of 0.673887880506:
2: fabolous, with distance of 0.707922468236:
3: royce da 5'9", with distance of 0.714347393557:
4: styles p, with distance of 0.719508633398:
5: saigon, with distance of 0.720737719368:


  from ipykernel import kernelapp as app


In [88]:
def print_recommendations(query_index):
    distances, indices = model_nn_binary.kneighbors(wide_artist_data_zero_one.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('Recommendations with binary play data for {0}:\n'.format(wide_artist_data_zero_one.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data_zero_one.index[indices.flatten()[i]], distances.flatten()[i]))

In [105]:
inlist = wide_artist_data.index
#artist_name = 'the beatles'
artist_name = 'see-saw'
#artist_name = 'pink floyd'
#artist_name = 'kenny g'
#artist_name = 'michael jackson'


query_index = [x for x in range(len(inlist)) if inlist[x]==artist_name]
query_index

[6800]

In [106]:
print_recommendations(query_index)

Recommendations with binary play data for Index([u'see-saw'], dtype='object', name=u'artist-name'):

1: fictionjunction yuuka, with distance of 0.796602865528:
2: ali project, with distance of 0.799280155919:
3: mitsumune shinkichi, with distance of 0.858865033593:
4: kotoko, with distance of 0.869452031172:
5: chikayo fukuda, with distance of 0.877501010612:


In [107]:
def print_recommendations_old(query_index):
    distances, indices = model_knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('Recommendations with binary play data for {0}:\n'.format(wide_artist_data.index[query_index]))
        else:
            print('{0}: {1}, with distance of {2}:'.format(i, wide_artist_data.index[indices.flatten()[i]], distances.flatten()[i]))

In [108]:
print_recommendations_old(query_index)

Recommendations with binary play data for Index([u'see-saw'], dtype='object', name=u'artist-name'):

1: the delgados, with distance of 0.404210117045:
2: jumbo, with distance of 0.508068565366:
3: death from above 1979, with distance of 0.570454676995:
4: dj sharpnel, with distance of 0.591499045656:
5: redalice, with distance of 0.611290498952:
