In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Load the per-user artist play counts. All four columns are named so the
# headerless file parses, but the MusicBrainz id is excluded through `usecols`.
# `nrows` is documented as an integer row count, so pass an int rather than
# the float literal 2e7.
play_count = pd.read_table('plays.tsv',
                           header = None,
                           nrows = int(2e7),
                           names = ['users', 'musicbrainz-artist-id', 'artist-name', 'plays'],
                           usecols = ['users', 'artist-name', 'plays'])

# Load the user profiles, keeping only the user id and country columns.
users = pd.read_table('usersha1-profile.tsv',
                      header = None,
                      names = ['users', 'gender', 'age', 'country', 'signup'],
                      usecols = ['users', 'country'])

play_count.head()

Unnamed: 0,users,artist-name,plays
0,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,christina aguilera,135392
1,09d12dfa05a0852053a9017121034a837fa4019e,alice cooper,134993
2,0b2956b319a3ac466b0cf1a8c49fa73498d0898c,in flames,112989
3,00a20b9791abd8b29903a8a43e343ae93a98d9fd,lil wayne,107758
4,082279c9db5330c25a4e0ceae275a9fc79c753c4,céline dion,86132


In [2]:
# Preview the first five user profile rows (user id and country).
users.head(n = 5)

Unnamed: 0,users,country
0,00000c289a1829a808ac09c00daf10bc3c4e223b,Germany
1,00001411dc427966b17297bf4d69e7e193135d89,Canada
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,Germany
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,Mexico
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,United States


In [3]:
# Discard rows that carry no artist name before aggregating.
if play_count['artist-name'].isnull().any():
    play_count = play_count.dropna(subset = ['artist-name'])

# Sum plays per artist across all users; the summed series is renamed so that
# resetting the index yields the columns 'artist-name' and 'total_artist_plays'.
total_plays = (
    play_count
    .groupby('artist-name')['plays']
    .sum()
    .rename('total_artist_plays')
    .reset_index()
)

total_plays.head()


Unnamed: 0,artist-name,total_artist_plays
0,cours de la somme,9
1,oliver shanti & friends,3
2,!!!,19814
3,!5:b>@ 3070,33
4,!action pact!,143


In [4]:
# Attach every artist's overall play total to each (user, artist) row.
total_count = pd.merge(play_count, total_plays, on = 'artist-name', how = 'left')
total_count.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,christina aguilera,135392,328864
1,09d12dfa05a0852053a9017121034a837fa4019e,alice cooper,134993,212932
2,0b2956b319a3ac466b0cf1a8c49fa73498d0898c,in flames,112989,814097
3,00a20b9791abd8b29903a8a43e343ae93a98d9fd,lil wayne,107758,285681
4,082279c9db5330c25a4e0ceae275a9fc79c753c4,céline dion,86132,145978


In [5]:
# Inspect the upper tail (0.90 through 0.99) of the per-artist play totals.
# linspace fixes both endpoints and the number of levels; arange with a float
# step can gain or lose an element to rounding.
total_plays['total_artist_plays'].quantile(np.linspace(.9, .99, 10))

0.90     3096.00
0.91     3619.16
0.92     4240.64
0.93     5076.56
0.94     6289.44
0.95     7929.00
0.96    10405.92
0.97    14858.00
0.98    23363.92
0.99    50182.20
Name: total_artist_plays, dtype: float64

In [6]:
# Minimum total plays for an artist to be kept; 7929 is the value shown for
# the 0.95 quantile of total_artist_plays in the quantile output above.
threshold = 7929
popular_plays = total_count[total_count['total_artist_plays'] >= threshold]
popular_plays.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,02ccf45baa7fe62f0935b8a6a64ff8869a7b0387,christina aguilera,135392,328864
1,09d12dfa05a0852053a9017121034a837fa4019e,alice cooper,134993,212932
2,0b2956b319a3ac466b0cf1a8c49fa73498d0898c,in flames,112989,814097
3,00a20b9791abd8b29903a8a43e343ae93a98d9fd,lil wayne,107758,285681
4,082279c9db5330c25a4e0ceae275a9fc79c753c4,céline dion,86132,145978


In [7]:
# Attach each user's country, then keep only rows for users in the United States.
popular_plays_final = pd.merge(popular_plays, users, on = 'users', how = 'left')
popular_plays_final = popular_plays_final[popular_plays_final['country'] == 'United States']
popular_plays_final.head()


Unnamed: 0,users,artist-name,plays,total_artist_plays,country
3,00a20b9791abd8b29903a8a43e343ae93a98d9fd,lil wayne,107758,285681,United States
4,082279c9db5330c25a4e0ceae275a9fc79c753c4,céline dion,86132,145978,United States
13,028b91859a012251da23c3dbfd2215154a789f9f,afi,59169,264433,United States
15,073689cd85d6f876b0b1123598c53194b2d21198,boa,50530,128850,United States
22,0033ee7378661b88b245b1f67cc622ff63a51061,the beatles,39655,1896944,United States


In [8]:
# Keep the first row for each (user, artist) pair so every pair is unique.
popular_plays_final = popular_plays_final.drop_duplicates(subset = ['users', 'artist-name'], keep = 'first')


In [12]:
# Artist-by-user table of play counts; (artist, user) pairs with no row get 0.
final = (
    popular_plays_final
    .pivot(index = 'artist-name', columns = 'users', values = 'plays')
    .fillna(value = 0)
)

# Compressed sparse row copy of the same values.
final_sparse = csr_matrix(final.values)

final.head()

users,00007a47085b9aab8af55f52ec8846ac479ac4fe,0001a57568309b287363e72dc682e9a170ba6dc2,00024b5b85c40f990c28644d53257819980bf6bb,0002dd2154072434d26e5409faa591bfb260a01e,00032c7933e0eb05f2258f1147ef81a90f2d4d6c,00041cbfdd019b5431f926133266cc4ba38219bb,000429493d9716b66b02180d208d09b5b89fbe64,000701c3c006b091990162635b36b008c504c6a7,000752c87a61bc4247f5219b4769c347c0062c8a,0008538a0f505f72fdd66af3c4c71aef8d3bdea4,...,0f37e6c695cda2e4718d6cca84dcf33c70960274,0f399811d1b74e4feffe3f78582c49b04057ec55,0f3a7d6aa07b4e365810ea2553c72358cc02e5b7,0f3a85fc621f2523c399588078dee45ecb8b1cef,0f3af7d9d9a9548bb8ec9434b45dbdd5e2f52d6b,0f3b9f573cd8253782d7a88d7444410cc5f5d357,0f3d0600ff60db90796a92409f72638120f06bb7,0f3e7ef0b68439525233717bf0d37bef1e850ddc,0f3eea8cbaca298960c9b59f9c84fbebe701a270,0f407a223be88fb3390f8d82070f021738f06053
artist-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!!!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(hed) planet earth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*nsync,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...and you will know us by the trail of dead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...but alive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search using cosine distance, fitted on the
# sparse artist-by-user matrix.
model_knn = NearestNeighbors(algorithm = 'brute', metric = 'cosine')
model_knn.fit(final_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [14]:
from fuzzywuzzy import fuzz

def print_artist_recommendations(query_artist, artist_plays_matrix, knn_model, k, min_ratio = 75):
    """Print up to ``k`` nearest-neighbour artists for a fuzzily matched query.

    The query is compared case-insensitively against every label in
    ``artist_plays_matrix.index`` using ``fuzz.ratio``. The best-scoring label
    at or above ``min_ratio`` is selected (the earliest one wins a tie), its
    row is passed to ``knn_model.kneighbors`` asking for ``k + 1`` neighbours,
    and the neighbours other than the matched artist are printed with their
    distances.

    Parameters
    ----------
    query_artist : str
        Artist name to look up; it need not be spelled exactly.
    artist_plays_matrix : pandas.DataFrame
        Frame whose index holds the artist names. Positions returned by
        ``kneighbors`` are looked up in this index, so it is assumed to be the
        same data, in the same row order, that ``knn_model`` was fitted on.
    knn_model : object
        Fitted model exposing ``kneighbors(X, n_neighbors=...)`` that returns
        ``(distances, indices)``.
    k : int
        Maximum number of recommendations to print.
    min_ratio : int, optional
        Lowest ``fuzz.ratio`` score accepted as a match. Defaults to 75.

    Returns
    -------
    None
        Results are written to standard output. If no label reaches
        ``min_ratio`` a notice is printed instead.
    """
    artist_names = artist_plays_matrix.index
    query_lower = query_artist.lower()

    # Single pass over the index: enumerate supplies the row position directly,
    # avoiding a list search per candidate. The strict '>' keeps the earliest
    # best match when several labels share the top score.
    query_index = None
    best_ratio = None
    for position, name in enumerate(artist_names):
        ratio = fuzz.ratio(name.lower(), query_lower)
        if ratio >= min_ratio and (best_ratio is None or ratio > best_ratio):
            query_index = position
            best_ratio = ratio

    # Explicit "no match" check instead of catching every exception.
    if query_index is None:
        print('Your artist didn\'t match any artists in the data. Try again')
        return None

    query_vector = artist_plays_matrix.iloc[query_index, :].values.reshape(1, -1)
    # Ask for one extra neighbour because the query artist is normally among
    # its own nearest neighbours.
    distances, indices = knn_model.kneighbors(query_vector, n_neighbors = k + 1)

    print('Recommendations for {0}:\n'.format(artist_names[query_index]))

    printed = 0
    for distance, neighbour_index in zip(distances.flatten(), indices.flatten()):
        # Skip the query artist wherever it is ranked; on a distance tie it is
        # not guaranteed to be the first neighbour.
        if neighbour_index == query_index:
            continue
        if printed == k:
            break
        print('{0}, with distance of {1}:'.format(artist_names[neighbour_index], distance))
        printed += 1

    return None



In [15]:
# The query is not spelled the way the data spells it; the function resolves
# it to the closest artist name before looking up neighbours.
print_artist_recommendations(query_artist = 'red hot chilli peppers',
                             artist_plays_matrix = final,
                             knn_model = model_knn,
                             k = 10)


Recommendations for red hot chili peppers:

jimi hendrix, with distance of 0.602056544241399:
led zeppelin, with distance of 0.634983650520133:
sublime, with distance of 0.6374926641938534:
john frusciante, with distance of 0.6799308627146026:
the john butler trio, with distance of 0.7400931002169682:
screaming trees, with distance of 0.7483257484372512:
supercar, with distance of 0.7508505046559782:
incubus, with distance of 0.7642681185552855:
biffy clyro, with distance of 0.7645879047937336:
radiohead, with distance of 0.772751735023782:


In [None]:
# Ten nearest-neighbour recommendations for Eminem.
print_artist_recommendations(query_artist = 'eminem',
                             artist_plays_matrix = final,
                             knn_model = model_knn,
                             k = 10)


In [None]:
# Ten nearest-neighbour recommendations for Lil Wayne.
print_artist_recommendations(query_artist = 'lil wayne',
                             artist_plays_matrix = final,
                             knn_model = model_knn,
                             k = 10)


In [None]:
# Ten nearest-neighbour recommendations for Metallica.
print_artist_recommendations(query_artist = 'metallica',
                             artist_plays_matrix = final,
                             knn_model = model_knn,
                             k = 10)
