In [1]:
import pandas as pd
import numpy as np

In [2]:
import implicit

In [3]:
import pickle

In [4]:
from scipy.sparse import coo_matrix

In [5]:
from utils.CF_recommender_utils import recommend_by_average, calculate_hit_rate, recommend_by_user
from utils.metrics import *

In [6]:
! export MKL_NUM_THREADS=1

### 1. Load prepared data

In [7]:
train_df = pd.read_pickle('data/train_mod.pickle')
test_df = pd.read_pickle('data/test_grouped.pickle')
left_out_df = pd.read_pickle('data/left_out_df.pickle')
likes_df = pd.read_pickle('data/likes_df.pickle')

In [8]:
#count number of interactions for each user and artist
train_df2 = train_df[['user_id', 'person_id']].groupby(['user_id', 'person_id']).size().reset_index(name='plays')

In [9]:
likes_df['person_id'] = likes_df['person_id'].astype(str)

In [10]:
train_df2

Unnamed: 0,user_id,person_id,plays
0,1,104136,72
1,1,11467,63
2,1,11617,13
3,1,153765,32
4,1,168705,40
...,...,...,...
5907655,45174,83375,1
5907656,45174,86122,1
5907657,45174,87296,1
5907658,45174,87766,1


In [11]:
likes_df

Unnamed: 0,user_id,person_id,likes_count
0,1,87999,2
1,2,184769,1
2,2,408273,2
3,2,459046,2
4,2,464952,1
...,...,...,...
874471,45174,338132,1
874472,45174,349288,1
874473,45174,357381,2
874474,45174,378192,1


In [12]:
train_df2 = train_df2.merge(likes_df, on = ['user_id', 'person_id'], how='outer').dropna(subset = ['plays'])

In [13]:
train_df2['likes_count'] = train_df2['likes_count'].fillna(0)

In [14]:
train_df2

Unnamed: 0,user_id,person_id,plays,likes_count
0,1,104136,72.0,0.0
1,1,11467,63.0,0.0
2,1,11617,13.0,0.0
3,1,153765,32.0,0.0
4,1,168705,40.0,0.0
...,...,...,...,...
5907655,45174,83375,1.0,0.0
5907656,45174,86122,1.0,0.0
5907657,45174,87296,1.0,0.0
5907658,45174,87766,1.0,0.0


In [15]:
train_df2['likes_count'].value_counts()

0.0      5101449
1.0       574610
2.0       109058
3.0        44095
4.0        23176
          ...   
90.0           1
89.0           1
88.0           1
81.0           1
271.0          1
Name: likes_count, Length: 135, dtype: int64

In [16]:
train_df2['plays'].value_counts()

1.0       3141049
2.0        951537
3.0        427069
4.0        259400
5.0        166722
           ...   
914.0           1
907.0           1
906.0           1
902.0           1
8604.0          1
Name: plays, Length: 1099, dtype: int64

In [17]:
max(train_df2['likes_count'])

343.0

In [18]:
max(train_df2['plays'])

11319.0

In [19]:
np.mean(train_df2['likes_count'])

0.2646103533378698

In [20]:
np.mean(train_df2['plays'])

4.949872030550167

In [21]:
np.mean(train_df2['plays'])/np.mean(train_df2['likes_count'])

18.70626741588559

I'm trying to enhance CF model perdormance by enhancing it with user likes. For that, I am creating a new column 'plays_and_likes', where I sum up columns plays and likes with different weights: every play has weight=1 and every like has weight=20 since it is a very strong indication that user liked this artist.

In [22]:
train_df2['plays_and_likes'] = train_df2['plays'] + 20*train_df2['likes_count']

In [23]:
train_df2

Unnamed: 0,user_id,person_id,plays,likes_count,plays_and_likes
0,1,104136,72.0,0.0,72.0
1,1,11467,63.0,0.0,63.0
2,1,11617,13.0,0.0,13.0
3,1,153765,32.0,0.0,32.0
4,1,168705,40.0,0.0,40.0
...,...,...,...,...,...
5907655,45174,83375,1.0,0.0,1.0
5907656,45174,86122,1.0,0.0,1.0
5907657,45174,87296,1.0,0.0,1.0
5907658,45174,87766,1.0,0.0,1.0


In [24]:
max(train_df2['plays_and_likes'])

11739.0

In [25]:
train_df2[train_df2['likes_count']!=0]

Unnamed: 0,user_id,person_id,plays,likes_count,plays_and_likes
26,1,87999,41.0,2.0,81.0
29,3,110487,3.0,2.0,43.0
31,3,114541,3.0,1.0,23.0
33,3,120527,68.0,11.0,288.0
34,3,120533,61.0,5.0,161.0
...,...,...,...,...,...
5907592,45174,378192,1.0,1.0,21.0
5907602,45174,403129,1.0,1.0,21.0
5907640,45174,5399,22.0,1.0,42.0
5907643,45174,57717,1.0,1.0,21.0


In [26]:
left_out_df = left_out_df[left_out_df['user_id'].isin(set(train_df2['user_id']))]

### 2. Prepare artists and left-out set dictionaries

In [27]:
people_df = pd.read_pickle('data/new_persons_df.pickle')

In [28]:
people_df['person_id'] = people_df['person_id'].astype(str)

In [29]:
len(people_df['person_id'].unique())

560927

In [30]:
people_dict = pd.Series(people_df['person_name'].values, index=people_df['person_id']).to_dict()

In [31]:
people_dict['145148']

'Everything+Is+Illuminated'

In [32]:
left_out_dict = pd.Series(left_out_df['person_id'].values, index=left_out_df['user_id']).to_dict()

In [33]:
left_out_dict[13693]

'432322'

### 3. Collaborative filtering enhanced with user likes

In [34]:
#user and person ids to categorical
train_df2['user_id'] = train_df2['user_id'].astype("category")
train_df2['person_id'] = train_df2['person_id'].astype("category")

In [35]:
#save person codes and construct useful dictionaries
person_codes = train_df2['person_id'].cat.codes
user_codes = train_df2['user_id'].cat.codes

code_person_dict = dict(zip(person_codes, train_df2['person_id']))
person_code_dict = dict(zip(train_df2['person_id'], person_codes))

code_user_dict = dict(zip(user_codes, train_df2['user_id']))
user_code_dict = dict(zip(train_df2['user_id'], user_codes))

In [36]:
# get persons ids to later use it for person vectors dictionary
code_person_dict_items = [x[1] for x in sorted(code_person_dict.items())]

In [37]:
# create a sparse matrix of artist/user/n_plays 
artist_user_matrix = coo_matrix((train_df2['plays_and_likes'].astype(float), 
                                (person_codes, 
                                 user_codes)))

In [38]:
artist_user_matrix.shape

(515111, 36139)

In [39]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=100, iterations=30)



In [40]:
model.fit(artist_user_matrix)

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




### 4. Claculate hit rate

In [41]:
def calculate_hit_rate(left_out_dict, user_ids_lst, top_20_recommended_ids):
    '''
    Claculated hit rate for top 20 using left-one-out set
    '''
    hit_rate = 0
    total_users = len(user_ids_lst)
    
    for user, ids_lst in zip(user_ids_lst, top_20_recommended_ids):
        if left_out_dict[user] in ids_lst:
            hit_rate += 1
    return hit_rate/total_users

In [42]:
user_artist_matrix = artist_user_matrix.T.tocsr()

In [43]:
%%time
recs = list(left_out_df.apply(lambda row: recommend_by_user(model, row, 
                                                            user_code_dict,
                                                            code_person_dict,
                                                            user_artist_matrix), axis=1))

CPU times: user 10min 39s, sys: 30.2 ms, total: 10min 39s
Wall time: 2min 39s


In [44]:
hit_rate = calculate_hit_rate(left_out_dict, list(left_out_df['user_id']), recs)
hit_rate

0.22761865227618652

### 5. Make recommendations for test set and calculate MAPk and recall

In [45]:
artists_vectors = dict(zip(code_person_dict_items, model.item_factors))

In [46]:
artists_vectors_df = pd.DataFrame(columns=['person_id', 'vector'])
artists_vectors_df['person_id'] = code_person_dict_items
artists_vectors_df['vector'] = list(model.item_factors)

In [47]:
def separate_pers_lst(row):
    persons_lst = list(set(row['persons_lst']))
    n = len(persons_lst)    
    l = persons_lst[:n//2]
    r = persons_lst[n//2:]
    d = {'persons_lst': l,
         'right_lst' : r}
    return d   

In [48]:
test_df_sep = pd.DataFrame(list(test_df.apply(lambda row: separate_pers_lst(row), axis=1)))
test_df_sep['user_id'] = test_df['user_id']

In [49]:
test_df_sep

Unnamed: 0,persons_lst,right_lst,user_id
0,"[264674, 48535, 309614, 467871, 42756, 152747,...","[455930, 259767, 375103, 377399, 121264, 32037...",2
1,"[231926, 304139, 141343, 427911, 437579, 11255...","[266686, 294652, 13342, 238555, 304150, 155427...",6
2,"[334039, 351629, 363398, 376150, 103538, 31359...","[56683, 28752, 243451, 138523, 356730, 138986,...",9
3,[263881],[302401],10
4,"[283220, 343177, 305244, 296059, 391352, 30091...","[81646, 40664, 265414, 212650, 208177, 217047,...",19
...,...,...,...
9030,"[174298, 538948, 298627, 66215, 538953, 223470...","[202870, 76030, 277998, 398752, 62018, 538955,...",45138
9031,"[537943, 537933, 137881, 11309, 537935, 91847,...","[463529, 416199, 537930, 113736, 537945, 39828...",45147
9032,"[129, 192393, 549895, 275082, 16783, 549894, 2...","[54198, 19895, 190028, 218992, 117659, 335800,...",45148
9033,"[427743, 11653, 221303, 247367, 382722, 218516...","[138443, 408513, 450645, 255583, 52520, 120181...",45172


In [50]:
%%time
top_20_ids = list(test_df_sep.apply(lambda row: recommend_by_average(row, artists_vectors_df), axis=1))

CPU times: user 4h 8min 47s, sys: 21min 22s, total: 4h 30min 10s
Wall time: 1h 52min 57s


In [51]:
y_true = test_df_sep['right_lst']
y_pred = top_20_ids

In [52]:
mapk_scores = []

for t, p in zip(y_true, y_pred):
    mapk_scores.append(MAPk(t,p))

In [53]:
np.mean(mapk_scores)

0.024464442441922623

In [54]:
rec = []

for t, p in zip(y_true, y_pred):
    rec.append(recall(t,p))

In [55]:
np.mean(rec)

0.05716726677431336

In [56]:
scores_dict = {'mapk': mapk_scores,
               'recall': rec}

In [57]:
a_file = open("CF_savings/scores_dict_cf_enhanced.pkl", "wb")
pickle.dump(scores_dict, a_file)
a_file.close()

### 6. List top-20 most similar artists for some artists

In [58]:
#pop
people_dict['29692']

'Ariana+Grande'

In [59]:
sim_items = model.similar_items(person_code_dict['29692'], N=21)[1:]
for item, score in sim_items:
    print(people_dict[code_person_dict[item]], score)

Iggy+Azalea+ft.+Charli+XCX 0.3880091
Meghan+Trainor 0.36393252
Jessie+J. 0.3590561
Katy+Perry+Ft+Juicy+J 0.3387313
The+Fault+In+Our+Stars+I+Charli+XCX 0.33847886
Clean+Bandit+&+Jess+Glyne 0.33846432
Sam+Smith 0.33178425
Nicki+Minaj 0.32367924
Jason+Derulo 0.32345107
Pitbull+feat.+T-Pain+&+Sean+Paul 0.31486762
ToveLoVEVO 0.3143407
Calvin+Harris+feat.+Florence+Welch 0.2979112
One+Direction 0.29412168
Magic!+&+Zedd 0.29390973
Kiesza 0.28787386
David+Guetta+&+Nicky+Romero 0.2874706
Maroon+5+Ft.+Rihanna 0.28434008
Taylor+Swift+&+Def+Leppard 0.2787992
Miley+Cyrus+&+Cobra+Startship 0.27597445
Chris+Brown+&+Trey+Songz 0.2722841


********************

In [60]:
#rock
people_dict['157384']

'Freddie+Mercury'

In [61]:
sim_items = model.similar_items(person_code_dict['157384'], N=21)[1:]
for item, score in sim_items:
    print(people_dict[code_person_dict[item]], score)

Jason+Donovan 0.15838493
Cochi+&+Renato 0.15524846
F.R.+David 0.15087274
Johnny+Hates+Jazz 0.14872801
The+Cross 0.14867647
Sam+Brown 0.1483022
Desireless+&+Operation+Of+The+Sun 0.14753541
Dwa+Plus+Jeden 0.14393507
Bruce+Willis+&++Danny+Aiello 0.14152995
%D0%9F%D1%91%D1%82%D1%80+%D0%98%D0%BB%D1%8C%D0%B8%D1%87+%D0%A7%D0%B0%D0%B9%D0%BA%D0%BE%D0%B2%D1%81%D0%BA%D0%B8%D0%B9 0.14083782
Samantha+Fox 0.14022592
Blue+System 0.13982084
The+Tourists 0.13967028
Brian+May 0.13928704
Glenn+Medeiros 0.13850962
Tracey+Ullman 0.13816538
Smokie+&+Suzi+Quatro 0.13485414
%D0%A4%D0%B5%D0%B9%D0%BD%D0%BC%D0%B0%D0%BD+%D0%A0%D0%B8%D1%87%D0%B0%D1%80%D0%B4 0.13448521
Bucks+Fizz 0.1343224
Garou&Daniel+Lavoie&Patrick+Fiori 0.13288593


*************

In [62]:
#rap
people_dict['211094']

'Mac+Miller+feat.+Action+Bronson'

In [63]:
sim_items = model.similar_items(person_code_dict['211094'], N=21)[1:]
for item, score in sim_items:
    print(people_dict[code_person_dict[item]], score)

Logic+&+Last+Resort 0.29666665
Wiz+Khalifa+&+John+Cena 0.29282784
YG 0.28724378
Schoolboy+Q. 0.2853296
A$AP+Rocky 0.28274632
Wale+&+DJ+Omega 0.2813783
Asher+Roth 0.28063124
Big+Sean 0.27811563
Ab-Soul+&+Jay+Rock 0.27701825
MGK 0.27424225
Tyga 0.27405158
Lil+Wayne+Feat.+Drake+&+Rick+Ross 0.271438
2+Chainz+&+Future 0.2713587
Young+Jeezy+&+Bun-B 0.26963753
The+Game 0.26730567
J+Cole+&+Miguel 0.2666804
Future+&+Young+Scooter 0.26617125
Chance+The+Rapper+&+The+Social+Experiment 0.266096
French+Montana 0.2649563
Casey+Veggies+&+Rockie+Fresh 0.2641952
