In [1]:
import pandas as pd
import numpy as np

In [2]:
import implicit

In [3]:
import pickle

In [4]:
from scipy.sparse import coo_matrix

In [5]:
from utils.CF_recommender_utils import recommend_by_average, calculate_hit_rate, recommend_by_user
from utils.metrics import *

In [6]:
! export MKL_NUM_THREADS=1

### 1. Load prepared data

In [7]:
train_df = pd.read_pickle('data/train_mod.pickle')
test_df = pd.read_pickle('data/test_grouped.pickle')
left_out_df = pd.read_pickle('data/left_out_df.pickle')

In [8]:
left_out_df

Unnamed: 0,user_id,person_id
31125591,40215,164287
2185310,28749,42398
16687006,8046,367332
19484714,8749,21753
13252169,7863,450200
...,...,...
5723175,25238,46425
445058,1858,46425
9482149,23901,78622
7051855,5381,274779


In [9]:
#count number of interactions for each user and artist
train_df2 = train_df[['user_id', 'person_id']].groupby(['user_id', 'person_id']).size().reset_index(name='plays')

In [10]:
train_df2

Unnamed: 0,user_id,person_id,plays
0,1,104136,72
1,1,11467,63
2,1,11617,13
3,1,153765,32
4,1,168705,40
...,...,...,...
5907655,45174,83375,1
5907656,45174,86122,1
5907657,45174,87296,1
5907658,45174,87766,1


In [11]:
left_out_df = left_out_df[left_out_df['user_id'].isin(set(train_df2['user_id']))]

### 2. Prepare artists and left-out set dictionaries

In [12]:
people_df = pd.read_pickle('data/new_persons_df.pickle')

In [13]:
people_df['person_id'] = people_df['person_id'].astype(str)

In [14]:
len(people_df['person_id'].unique())

560927

In [15]:
people_dict = pd.Series(people_df['person_name'].values, index=people_df['person_id']).to_dict()

In [16]:
people_dict['145148']

'Everything+Is+Illuminated'

**************

In [17]:
left_out_dict = pd.Series(left_out_df['person_id'].values, index=left_out_df['user_id']).to_dict()

In [18]:
left_out_dict[13693]

'432322'

### 3. Collaborative filtering for implicit feedback data

In [19]:
#user and person ids to categorical
train_df2['user_id'] = train_df2['user_id'].astype("category")
train_df2['person_id'] = train_df2['person_id'].astype("category")

In [20]:
train_df2.head()

Unnamed: 0,user_id,person_id,plays
0,1,104136,72
1,1,11467,63
2,1,11617,13
3,1,153765,32
4,1,168705,40


In [21]:
#save person codes and construct useful dictionaries
person_codes = train_df2['person_id'].cat.codes
user_codes = train_df2['user_id'].cat.codes

code_person_dict = dict(zip(person_codes, train_df2['person_id']))
person_code_dict = dict(zip(train_df2['person_id'], person_codes))

code_user_dict = dict(zip(user_codes, train_df2['user_id']))
user_code_dict = dict(zip(train_df2['user_id'], user_codes))

In [22]:
# get persons ids to later use it for person vectors dictionary
code_person_dict_items = [x[1] for x in sorted(code_person_dict.items())]

In [23]:
# create a sparse matrix of artist/user/n_plays 
artist_user_matrix = coo_matrix((train_df2['plays'].astype(float), 
                                (person_codes, 
                                 user_codes)))


In [24]:
artist_user_matrix.shape

(515111, 36139)

Initialize several models with different parameters (manual hyperparameter tuning)

In [25]:
# initialize a model
model1 = implicit.als.AlternatingLeastSquares(factors=50, iterations=30)
model2 = implicit.als.AlternatingLeastSquares(factors=100, iterations=30)



In [26]:
model1.fit(artist_user_matrix)

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




In [27]:
model2.fit(artist_user_matrix)

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




In [28]:
model1.item_factors

array([[ 2.3767738e-04, -1.9776321e-03, -2.3199029e-03, ...,
         3.6011722e-03,  5.7390769e-04,  1.5309113e-03],
       [ 1.3070554e-04, -1.0336972e-04, -8.3678649e-05, ...,
         8.2720899e-06, -1.5551643e-05,  8.9416499e-06],
       [ 1.5553256e-04,  7.8542238e-05,  3.3827714e-06, ...,
         5.8673973e-05, -7.9712110e-05,  6.4117557e-05],
       ...,
       [ 1.8496075e-04,  3.5762598e-04,  9.5885262e-05, ...,
         2.8970922e-04, -2.9832865e-05,  2.5424601e-05],
       [ 3.4473944e-04, -6.0299260e-04, -8.8563265e-04, ...,
         1.0696728e-03,  3.0537942e-04,  5.8903394e-04],
       [-1.6431708e-02, -1.3337076e-02,  1.1959351e-02, ...,
         3.5670795e-02, -3.0058967e-03,  2.6921444e-02]], dtype=float32)

### 4. Calculate hit rate

Claculate hit rate using left-one-out data in order to decide which model is better

In [29]:
def calculate_hit_rate(left_out_dict, user_ids_lst, top_20_recommended_ids):
    '''
    Claculated hit rate for top 20 using left-one-out set
    '''
    hit_rate = 0
    total_users = len(user_ids_lst)
    
    for user, ids_lst in zip(user_ids_lst, top_20_recommended_ids):
        #print(user)
        #print(left_out_dict[user])
        #print(ids_lst)
        if left_out_dict[user] in ids_lst:
            hit_rate += 1
    return hit_rate/total_users

In [30]:
user_artist_matrix = artist_user_matrix.T.tocsr()

In [31]:
%%time
recs1 = list(left_out_df.apply(lambda row: recommend_by_user(model1, row, 
                                                             user_code_dict,
                                                             code_person_dict,
                                                             user_artist_matrix), axis=1))

CPU times: user 6min 38s, sys: 12.7 ms, total: 6min 38s
Wall time: 1min 39s


In [32]:
hit_rate1 = calculate_hit_rate(left_out_dict, list(left_out_df['user_id']), recs1)
hit_rate1

0.22955583229555832

In [33]:
%%time
recs2 = list(left_out_df.apply(lambda row: recommend_by_user(model2, row, 
                                                             user_code_dict,
                                                             code_person_dict,
                                                             user_artist_matrix), axis=1))

CPU times: user 10min 42s, sys: 15.1 ms, total: 10min 42s
Wall time: 2min 40s


In [34]:
hit_rate2 = calculate_hit_rate(left_out_dict, list(left_out_df['user_id']), recs2)
hit_rate2

0.23868825238688252

For collaborative filtering, hit rates for model1 and model2 equal to 0. It means that the model bad at predicting items that that user didn't interact with before. 

### 5. Make recommendations for test set and calculate MAPk and recall

In [35]:
artists_vectors = dict(zip(code_person_dict_items, model2.item_factors))

In [36]:
artists_vectors_df = pd.DataFrame(columns=['person_id', 'vector'])
artists_vectors_df['person_id'] = code_person_dict_items
artists_vectors_df['vector'] = list(model2.item_factors)

In [37]:
artists_vectors_df.head()

Unnamed: 0,person_id,vector
0,1,"[0.0046833134, 0.0052596247, 0.0060501643, 0.0..."
1,10,"[0.00451187, 0.0023319805, 0.0023501331, 0.004..."
2,1000,"[0.0047891326, 0.0023434267, 0.002760814, 0.00..."
3,10000,"[0.0047093695, 0.0025743272, 0.0028270192, 0.0..."
4,100000,"[-0.0027736675, 0.008846769, 0.019597149, 0.00..."


In [38]:
def separate_pers_lst(row):
    persons_lst = list(set(row['persons_lst']))
    n = len(persons_lst)    
    l = persons_lst[:n//2]
    r = persons_lst[n//2:]
    d = {'persons_lst': l,
         'right_lst' : r}
    return d   

In [39]:
test_df_sep = pd.DataFrame(list(test_df.apply(lambda row: separate_pers_lst(row), axis=1)))
test_df_sep['user_id'] = test_df['user_id']

In [40]:
test_df_sep

Unnamed: 0,persons_lst,right_lst,user_id
0,"[70886, 259767, 382004, 294948, 33239, 436802,...","[354658, 302485, 409053, 162669, 189203, 26467...",2
1,"[272632, 294652, 13342, 304147, 304149, 309593...","[54493, 304141, 281959, 112558, 105529, 304150...",6
2,"[378590, 400535, 294583, 74637, 363398, 124222...","[351629, 330635, 106304, 50495, 231436, 427089...",9
3,[302401],[263881],10
4,"[245929, 172300, 137066, 40664, 414988, 36874,...","[385051, 164555, 302609, 307609, 212650, 39741...",19
...,...,...,...
9030,"[438992, 211585, 20674, 538952, 174298, 538954...","[269833, 34180, 311224, 538951, 383340, 36823,...",45138
9031,"[448992, 381567, 387785, 537919, 370672, 97923...","[440590, 248381, 143048, 11309, 357494, 398286...",45147
9032,"[29661, 16783, 117659, 192393, 190028, 275082,...","[549895, 54198, 218992, 549894, 335800, 129, 1...",45148
9033,"[427743, 123799, 247367, 138443, 255583, 40851...","[120181, 345801, 148559, 110366, 221303, 52520...",45172


In [41]:
test_df.shape

(9035, 2)

In [42]:
%%time
top_20_ids = list(test_df_sep.apply(lambda row: recommend_by_average(row, artists_vectors_df), axis=1))

CPU times: user 4h 6min 34s, sys: 19min 36s, total: 4h 26min 11s
Wall time: 1h 51min 46s


In [43]:
y_true = test_df_sep['right_lst']
y_pred = top_20_ids

In [44]:
mapk_scores = []

for t, p in zip(y_true, y_pred):
    mapk_scores.append(MAPk(t,p))

In [45]:
np.mean(mapk_scores)

0.025194398152964474

In [46]:
rec = []

for t, p in zip(y_true, y_pred):
    rec.append(recall(t,p))

In [47]:
np.mean(rec)

0.057066397317200854

In [48]:
scores_dict = {'mapk': mapk_scores,
               'recall': rec}

In [49]:
a_file = open("CF_savings/scores_dict_cf.pkl", "wb")
pickle.dump(scores_dict, a_file)
a_file.close()

### 6. List top-20 most similar artists for some artists

In [50]:
#pop
people_dict['29692']

'Ariana+Grande'

In [51]:
sim_items = model2.similar_items(person_code_dict['29692'], N=21)[1:]
for item, score in sim_items:
    print(people_dict[code_person_dict[item]], score)

Meghan+Trainor 0.6882256
Jessie+J. 0.6794357
Iggy+Azalea+ft.+Charli+XCX 0.6700493
The+Fault+In+Our+Stars+I+Charli+XCX 0.59241873
Clean+Bandit+&+Jess+Glyne 0.5734758
Magic!+&+Zedd 0.57257116
ToveLoVEVO 0.57134956
Nicki+Minaj 0.5705345
Sam+Smith 0.56995296
Jason+Derulo 0.55801547
Katy+Perry+Ft+Juicy+J 0.5548027
Calvin+Harris+feat.+Florence+Welch 0.54130036
Sia+&+Beck 0.53977466
David+Guetta+&+Nicky+Romero 0.53765625
Pitbull+feat.+T-Pain+&+Sean+Paul 0.534914
Maroon+5+Ft.+Rihanna 0.529191
Selena+Gomez+&+Selena 0.51391065
Becky+G 0.50457126
Kiesza 0.49299043
One+Direction 0.48563552


************

In [52]:
#rock
people_dict['157384']

'Freddie+Mercury'

In [53]:
sim_items = model2.similar_items(person_code_dict['157384'], N=21)[1:]
for item, score in sim_items:
    print(people_dict[code_person_dict[item]], score)

Black 0.13345204
Bad+Boys+Blue 0.13065952
Gazebo 0.1304788
Ricchi+e+poveri 0.12982155
F.R.+David 0.12976216
Masterboy 0.12834346
Modern+Talking 0.12814285
Patrick+Swayze 0.12814005
Twenty+4+Seven 0.12773447
Limahl 0.12604512
Samantha+Fox 0.12597412
M+People 0.12593555
Dr.+Alban+&+Jessica+Folcker 0.125536
Blue+System 0.12529485
Katrina+and+the+Waves 0.12523259
Bonnie+Tyler+&+Meatloaf 0.12477777
The+Cross 0.124489
Tight+Fit 0.124375716
Tina+Turner+feat.+Sting 0.12407678
Smokie+&+Suzi+Quatro 0.124019444


************

In [54]:
#rap
people_dict['211094']

'Mac+Miller+feat.+Action+Bronson'

In [55]:
sim_items = model2.similar_items(person_code_dict['211094'], N=21)[1:]
for item, score in sim_items:
    print(people_dict[code_person_dict[item]], score)

Schoolboy+Q. 0.44531786
Asher+Roth 0.44493726
Ab-Soul+&+Jay+Rock 0.44266513
YG 0.43413195
Pusha+T 0.43391928
2+Chainz+&+Future 0.42097405
Logic+&+Last+Resort 0.4193839
Tyler+the+Creator 0.41901627
A$AP+Rocky 0.41446146
Domo+Genesis+&+The+Alchemist 0.41219902
Kid+Cudi+vs+Crookers 0.41112053
Young+Jeezy+&+Bun-B 0.40926802
J+Cole+&+Miguel 0.405432
Curren$y+&+Alchemist 0.39938217
A$AP+Ferg+ft.+A$AP+Rocky+&+Shabba+Ranks 0.39918393
Wiz+Khalifa+&+John+Cena 0.3980419
Flatbush+ZOMBiES+&+The+Underachievers 0.39765757
MellowHype 0.39396873
Earl+Sweatshirt 0.39297706
Chance+The+Rapper+&+The+Social+Experiment 0.3921629
