In [1]:
import pandas as pd
import numpy as np

In [2]:
import implicit

In [57]:
import pickle

In [3]:
from scipy.sparse import coo_matrix

In [4]:
from utils.CF_recommender_utils import recommend_by_average, calculate_hit_rate, recommend_by_user
from utils.metrics import *

In [5]:
! export MKL_NUM_THREADS=1

### 1. Load prepared data

In [6]:
train_df = pd.read_pickle('data/train_df.pickle')
test_df = pd.read_pickle('data/test_grouped.pickle')
left_out_df = pd.read_pickle('data/left_out_df.pickle')
train_lo_df = pd.read_pickle('data/train_lo_df.pickle')

In [7]:
#count number of interactions for each user and artist
train_df2 = train_df[['user_id', 'person_id']].groupby(['user_id', 'person_id']).size().reset_index(name='plays')

In [8]:
train_df2

Unnamed: 0,user_id,person_id,plays
0,1,104136,72
1,1,11467,63
2,1,11617,13
3,1,153765,32
4,1,168705,40
...,...,...,...
5914883,45174,83375,1
5914884,45174,86122,1
5914885,45174,87296,1
5914886,45174,87766,1


### 2. Prepare artists and left-out set dictionaries

In [9]:
people_df = pd.read_pickle('data/new_persons_df.pickle')

In [10]:
people_df['person_id'] = people_df['person_id'].astype(str)

In [11]:
len(people_df['person_id'].unique())

560927

In [12]:
people_dict = pd.Series(people_df['person_name'].values, index=people_df['person_id']).to_dict()

In [13]:
people_dict['145148']

'Everything+Is+Illuminated'

**************

In [14]:
left_out_dict = pd.Series(left_out_df['person_id'].values, index=left_out_df['user_id']).to_dict()

In [15]:
left_out_dict[25817]

'20735'

### 3. Collaborative filtering for implicit feedback data

In [16]:
#user and person ids to categorical
train_df2['user_id'] = train_df2['user_id'].astype("category")
train_df2['person_id'] = train_df2['person_id'].astype("category")

In [17]:
train_df2.head()

Unnamed: 0,user_id,person_id,plays
0,1,104136,72
1,1,11467,63
2,1,11617,13
3,1,153765,32
4,1,168705,40


In [18]:
#save person codes and construct useful dictionaries
person_codes = train_df2['person_id'].cat.codes
user_codes = train_df2['user_id'].cat.codes

code_person_dict = dict(zip(person_codes, train_df2['person_id']))
person_code_dict = dict(zip(train_df2['person_id'], person_codes))

code_user_dict = dict(zip(user_codes, train_df2['user_id']))
user_code_dict = dict(zip(train_df2['user_id'], user_codes))

In [19]:
# get persons ids to later use it for person vectors dictionary
code_person_dict_items = [x[1] for x in sorted(code_person_dict.items())]

In [20]:
# create a sparse matrix of artist/user/n_plays 
artist_user_matrix = coo_matrix((train_df2['plays'].astype(float), 
                                (person_codes, 
                                 user_codes)))


In [21]:
artist_user_matrix.shape

(515265, 36140)

Initialize several models with different parameters (manual hyperparameter tuning)

In [22]:
# initialize a model
model1 = implicit.als.AlternatingLeastSquares(factors=50, iterations=30)
model2 = implicit.als.AlternatingLeastSquares(factors=50, iterations=15)
model3 = implicit.als.AlternatingLeastSquares(factors=100, iterations=15)



In [23]:
model1.fit(artist_user_matrix)

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




In [24]:
model2.fit(artist_user_matrix)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [25]:
model3.fit(artist_user_matrix)

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [26]:
model1.item_factors

array([[ 2.35081022e-03,  3.35056218e-04,  7.22967205e-04, ...,
         8.23867274e-04,  4.98342235e-03, -5.32691600e-04],
       [ 9.41599574e-05, -1.55296038e-05,  1.03242608e-04, ...,
        -5.73605985e-06, -3.46800734e-05,  2.64862992e-05],
       [ 2.52074369e-06,  1.57263377e-04,  1.09625056e-04, ...,
         3.43194151e-05,  3.02627857e-04,  1.38968520e-04],
       ...,
       [ 9.29754315e-05,  2.03087082e-04,  1.89530529e-05, ...,
        -1.35096125e-04,  1.58506387e-04, -9.87966268e-05],
       [-4.26205836e-04,  4.59975825e-04, -7.64068449e-04, ...,
        -1.80378047e-04,  5.94981131e-04, -1.01228605e-03],
       [ 1.46501958e-02,  2.84134466e-02,  1.10280411e-02, ...,
         4.32204967e-03,  2.29079947e-02, -4.24244936e-04]], dtype=float32)

### 4. Calculate hit rate

Claculate hit rate using left-one-out data in order to decide which model is better

In [31]:
def calculate_hit_rate(left_out_dict, user_ids_lst, top_20_recommended_ids):
    '''
    Claculated hit rate for top 20 using left-one-out set
    '''
    hit_rate = 0
    total_users = len(user_ids_lst)
    
    for user, ids_lst in zip(user_ids_lst, top_20_recommended_ids):
        if left_out_dict[user] in ids_lst:
            hit_rate += 1
    return hit_rate/total_users

In [27]:
user_artist_matrix = artist_user_matrix.T.tocsr()

In [28]:
%%time
recs1 = list(left_out_df.apply(lambda row: recommend_by_user(model1, row, 
                                                             user_code_dict,
                                                             code_person_dict,
                                                             user_artist_matrix), axis=1))

CPU times: user 43min 44s, sys: 1.61 s, total: 43min 45s
Wall time: 11min 23s


In [32]:
hit_rate = calculate_hit_rate(left_out_dict, list(left_out_df['user_id']), recs1)
hit_rate

0.0

In [33]:
%%time
recs2 = list(left_out_df.apply(lambda row: recommend_by_user(model2, row, 
                                                             user_code_dict,
                                                             code_person_dict,
                                                             user_artist_matrix), axis=1))

CPU times: user 39min 24s, sys: 1.37 s, total: 39min 25s
Wall time: 10min 15s


In [35]:
hit_rate2 = calculate_hit_rate(left_out_dict, list(left_out_df['user_id']), recs2)
hit_rate2

0.0

In [36]:
%%time
recs3 = list(left_out_df.apply(lambda row: recommend_by_user(model3, row, 
                                                             user_code_dict,
                                                             code_person_dict,
                                                             user_artist_matrix), axis=1))

CPU times: user 52min 9s, sys: 1.37 s, total: 52min 10s
Wall time: 13min 3s


In [37]:
hit_rate3 = calculate_hit_rate(left_out_dict, list(left_out_df['user_id']), recs3)
hit_rate3

0.0

For collaborative filtering, all hit rates equal to 0. Probably it is because of the CF technique: items that were not seen by the user automatically get lower score during the SVD. When I calculate hit rate for top-20 predictions, left-out items never get into the predicted set. Thus, hit rate is not the best way to evaluate top-N recommender based on collaborative filtering with implicit feedback. However, I decided to just continue to work with one of the trained models and check scores on the test set. 

### 5. Make recommendations for test set and calculate MAPk, precision and recall

In [39]:
artists_vectors = dict(zip(code_person_dict_items, model1.item_factors))

In [40]:
artists_vectors_df = pd.DataFrame(columns=['person_id', 'vector'])
artists_vectors_df['person_id'] = code_person_dict_items
artists_vectors_df['vector'] = list(model1.item_factors)

In [41]:
artists_vectors_df.head()

Unnamed: 0,person_id,vector
0,1,"[0.0023508102, 0.00033505622, 0.0007229672, -0..."
1,10,"[9.415996e-05, -1.5529604e-05, 0.00010324261, ..."
2,1000,"[2.5207437e-06, 0.00015726338, 0.000109625056,..."
3,10000,"[0.0001956137, 0.00021028795, 0.00015630509, -..."
4,100000,"[0.01943316, 0.0308073, -0.003268252, -0.00348..."


In [42]:
%%time
top_20_ids = recommend_by_average(test_df, artists_vectors_df)

CPU times: user 3h 53min 1s, sys: 12min 21s, total: 4h 5min 22s
Wall time: 1h 42min 3s


In [46]:
y_true = test_df['persons_lst']
y_pred = top_20_ids

In [47]:
mapk_scores = []

for t, p in zip(y_true, y_pred):
    mapk_scores.append(MAPk(t,p))

In [49]:
np.mean(mapk_scores)

0.17343118128256374

In [50]:
prec = []

for t, p in zip(y_true, y_pred):
    prec.append(precision(t,p))

In [52]:
np.mean(prec)

0.12065855008301052

In [53]:
rec = []

for t, p in zip(y_true, y_pred):
    rec.append(recall(t,p))

In [54]:
np.mean(rec)

0.20614077754239887

In [55]:
scores_dict = {'mapk': mapk_scores,
               'precision': prec,
               'recall': rec}

In [58]:
a_file = open("CF_savings/scores_dict_cf.pkl", "wb")
pickle.dump(scores_dict, a_file)
a_file.close()

### 6. List top-20 most similar artists for some artists

In [59]:
#pop
people_dict['29692']

'Ariana+Grande'

In [61]:
sim_items = model1.similar_items(person_code_dict['29692'], N=21)[1:]
for item, score in sim_items:
    print(people_dict[code_person_dict[item]], score)

Iggy+Azalea+ft.+Charli+XCX 0.8334274
Jessie+J. 0.8050009
Meghan+Trainor 0.78568375
Sam+Smith 0.7657003
Nicki+Minaj 0.7652724
Katy+Perry+Ft+Juicy+J 0.763799
The+Fault+In+Our+Stars+I+Charli+XCX 0.7566084
Clean+Bandit+&+Jess+Glyne 0.740929
Miley+Cyrus+&+Cobra+Startship 0.73859847
Jason+Derulo 0.7348252
mariah+carey+feat.+nelly 0.7183532
Pitbull+feat.+T-Pain+&+Sean+Paul 0.71348876
Calvin+Harris+feat.+Florence+Welch 0.71252376
ToveLoVEVO 0.706127
Magic!+&+Zedd 0.6978192
One+Direction 0.6915098
Selena+Gomez+&+Selena 0.6847935
Rihana 0.68340987
Ella+Henderson 0.6777716
Chris+Brown+&+Trey+Songz 0.6752087


************

In [63]:
#rock
people_dict['157384']

'Freddie+Mercury'

In [64]:
sim_items = model1.similar_items(person_code_dict['157384'], N=21)[1:]
for item, score in sim_items:
    print(people_dict[code_person_dict[item]], score)

Smokie+&+Suzi+Quatro 0.17785707
Rednex 0.16732931
Blue+System 0.1670089
%D0%90%D0%BD%D0%BD%D0%B0+%D0%A1%D0%BD%D0%B0%D1%82%D0%BA%D0%B8%D0%BD%D0%B0 0.16614941
%D0%90%D1%81%D1%81%D0%BE%D1%80%D1%82%D0%B8 0.16611867
%D0%9A%D0%B0%D1%82%D1%8F+%D0%91%D1%83%D0%B6%D0%B8%D0%BD%D1%81%D0%BA%D0%B0%D1%8F 0.16594772
%D0%A0%D0%B0%D0%B4%D0%B8%D0%BE+%D0%9E%D0%9E%D0%9D 0.16589512
%D0%97%D0%B2%D1%83%D0%BA+%D1%82%D1%80%D0%B5%D0%B2%D0%BE%D0%B3%D0%B8 0.16584124
%D0%AE%D0%BB%D0%B8%D1%8F+%D0%92%D0%B4%D0%BE%D0%B2%D0%B5%D0%BD%D0%BA%D0%BE 0.16583566
%D0%9A%D0%BE%D0%BB%D0%B4%D0%BE%D0%B2%D1%81%D0%BA%D0%B0%D1%8F+%D0%BB%D1%8E%D0%B1%D0%BE%D0%B2%D1%8C 0.1658326
%D0%97%D0%B2%D1%83%D0%BA++%D0%A1%D0%B8%D0%B3%D0%BD%D0%B0%D0%BB+%D0%B2%D0%BE%D0%B7%D0%B4%D1%83%D1%88%D0%BD%D0%BE%D0%B9+%D1%82%D1%80%D0%B5%D0%B2%D0%BE%D0%B3%D0%B8 0.1658227
%D0%A1%D0%B8%D0%B3%D0%BD%D0%B0%D0%BB+%D0%A2%D1%80%D0%B5%D0%B2%D0%BE%D0%B3%D0%B8 0.1657872
Boney+M. 0.16522874
Roxette 0.16507366
Survivor 0.16334596
Gazebo 0.16299067
Secret+Service 0.16133526
C

******

In [65]:
#rap
people_dict['211094']

'Mac+Miller+feat.+Action+Bronson'

In [66]:
sim_items = model1.similar_items(person_code_dict['211094'], N=20)[1:]
for item, score in sim_items:
    print(people_dict[code_person_dict[item]], score)

Wiz+Khalifa+&+John+Cena 0.51475304
Schoolboy+Q. 0.51094645
Cam+Meekins 0.5007237
Keef+in+NYC+%2F+Chiraq%27s+New+Kids 0.49827093
Alien+vs.+Predator+vs.+Chief+Keef 0.4982655
YG 0.498219
Trap+Migos 0.49758837
Asher+Roth 0.4968966
2+Chainz+&+Future 0.49637565
Juvenile+Feat.+Mannie+Fresh+&+Lil%27+Wayne 0.49570483
Ty+Dolla+$ign+On+How+His+Videos+Make+Girls+Pregnant 0.4956009
Mike+Jones+feat.+Slim+Thug+&+Paul+Wall 0.49558532
E-40+Featuring+Clipse 0.4955613
Ace+Hood%2FFuture%2FRick+Ross 0.49536103
Kirko+Bangz+Ft.+Z-Ro,+Paul+Wall+&+Slim+Thug 0.4951949
Nardwuar+vs.+Ab 0.49518555
Stalley+ft+Rick+Ross+x+August+Alsina 0.49513942
Eazy+Money 0.4950797
pokepooh 0.495071
