In [1]:
import numpy as np
import pandas as pd
import json
from sklearn.utils import shuffle

In [2]:
import pickle

In [3]:
from gensim.models import Word2Vec

In [4]:
from utils.item2vec_recommender_utils import hit_rate_evaluate, recommend, get_similar_artists
from utils.metrics import MAPk, precision, recall

In [5]:
import warnings
warnings.filterwarnings("ignore")

### 1. Load prepared data

In [6]:
train_grouped = pd.read_pickle('data/train_mod_grouped.pickle')
test_grouped = pd.read_pickle('data/test_grouped.pickle')
left_out_df = pd.read_pickle('data/left_out_df.pickle')

In [7]:
val_df = train_grouped[train_grouped['user_id'].isin(left_out_df['user_id'])]

In [8]:
train_grouped.head()

Unnamed: 0,user_id,persons_lst
0,1,"[307555, 104136, 42218, 171231, 171231, 171231..."
1,3,"[121274, 120533, 326647, 388057, 366613, 32664..."
2,4,"[37658, 15424, 6652, 280685, 58422, 460501, 19..."
3,5,"[19627, 19627, 19627, 19627, 19627, 456925, 40..."
4,7,"[295341, 238498, 271682, 172564, 203357, 39058..."


In [9]:
left_out_df.head()

Unnamed: 0,user_id,person_id
31125591,40215,164287
2185310,28749,42398
16687006,8046,367332
19484714,8749,21753
13252169,7863,450200


In [10]:
train_lst = list(train_grouped['persons_lst'])

### 2. Prepare artists names and left-out set dictionaries

In [11]:
people_df = pd.read_pickle('data/new_persons_df.pickle')

In [12]:
people_df

Unnamed: 0,person_id,person_name
0,145148,Everything+Is+Illuminated
1,297899,Robin+O%27Brien
2,250429,Nicholas+Gunn++(2012)
3,32765,Aspasia+Stratigou
4,18689,Allison+Veltz
...,...,...
560922,544215,Sanaa+Kariakoo
560923,298403,Rock-a-teens
560924,450896,Jennifer+Lopez+Ft.+DJ+Mustard
560925,53831,Bobby+Sanabria+Conducting+The+Manhattan+School...


In [13]:
people_df['person_id'] = people_df['person_id'].astype(str)

In [14]:
len(people_df['person_id'].unique())

560927

In [15]:
people_dict = pd.Series(people_df['person_name'].values, index=people_df['person_id']).to_dict()

In [16]:
people_dict['29692']

'Ariana+Grande'

*****************

In [17]:
left_out_dict = pd.Series(left_out_df['person_id'].values, index=left_out_df['user_id']).to_dict()

### 3. Train word2vec embeddings with different parameters

In [18]:
params1 = dict(size = 50,
               min_count = 1,
               window = 5,
               sg = 1,
               hs = 0)

params2 = dict(size = 50,
               min_count = 1,
               window = 5,
               sg = 1,
               hs = 1)

params3 = dict(size = 100,
               min_count = 1,
               window = 10,
               sg = 1,
               hs = 0)
params4 = dict(size = 50,
               min_count = 1,
               window = 10,
               sg = 1,
               hs = 0)
params5 = dict(size = 100,
               min_count = 1,
               window = 5,
               sg = 1,
               hs = 1)

params_lst = [params1, params2, params3, params4, params5]

In [19]:
%%time
for i, p in zip(range(len(params_lst)), params_lst):
    model = Word2Vec(**p)
    model.build_vocab(train_lst, progress_per=200)
    model.train(train_lst, total_examples = model.corpus_count, 
                epochs=10, report_delay=1)
    model.save('item2vec_savings/model' + str(i+1) + '.sav')
      

CPU times: user 7h 18min 1s, sys: 13.1 s, total: 7h 18min 14s
Wall time: 2h 27min 26s


In [20]:
model1 = Word2Vec.load('item2vec_savings/model1.sav')
model2 = Word2Vec.load('item2vec_savings/model2.sav')
model3 = Word2Vec.load('item2vec_savings/model3.sav')
model4 = Word2Vec.load('item2vec_savings/model4.sav')
model5 = Word2Vec.load('item2vec_savings/model5.sav')

In [21]:
print(model1)
print(model2)
print(model3)
print(model4)
print(model5)

Word2Vec(vocab=515111, size=50, alpha=0.025)
Word2Vec(vocab=515111, size=50, alpha=0.025)
Word2Vec(vocab=515111, size=100, alpha=0.025)
Word2Vec(vocab=515111, size=50, alpha=0.025)
Word2Vec(vocab=515111, size=100, alpha=0.025)


In [22]:
models = [model1, model2, model3, model4, model5]

### 4. Hit rate evaluation

Claculate hit rate using left-one-out data in order to decide which model is better

In [23]:
%%time
hit_rates = []

for model in models:
    hit_rate = hit_rate_evaluate(model, val_df, left_out_dict, people_dict)
    hit_rates.append(hit_rate)
    

Recommendations calculated
Starting hit rate calculation
Recommendations calculated
Starting hit rate calculation
Recommendations calculated
Starting hit rate calculation
Recommendations calculated
Starting hit rate calculation
Recommendations calculated
Starting hit rate calculation
CPU times: user 1h 25min 34s, sys: 1.33 s, total: 1h 25min 35s
Wall time: 26min 26s


In [24]:
print(hit_rates)

[0.0012453300124533001, 0.007195240071952401, 0.00041511000415110004, 0.0009685900096859, 0.008717310087173101]


Model 5 gave the best hit rate

### 5. Make recommendations for test set and calculate MAPkand recall

In [25]:
test_grouped

Unnamed: 0,user_id,persons_lst
0,2,"[317952, 317952, 317952, 307932, 147326, 33277..."
1,6,"[16163, 427911, 427911, 347719, 81351, 81351, ..."
2,9,"[56683, 208667, 144662, 299571, 28752, 356730,..."
3,10,"[263881, 302401]"
4,19,"[15830, 254536, 305083, 2589, 75802, 75802, 75..."
...,...,...
9030,45138,"[323344, 323344, 323344, 323344, 411805, 41180..."
9031,45147,"[228324, 320708, 348909, 348909, 437722, 43772..."
9032,45148,"[54198, 218992, 190028, 192393, 335800, 288031..."
9033,45172,"[196920, 11653, 123799, 138443, 345801, 408513..."


In [26]:
def separate_pers_lst(row):
    persons_lst = list(set(row['persons_lst']))
    n = len(persons_lst)    
    l = persons_lst[:n//2]
    r = persons_lst[n//2:]
    d = {'persons_lst': l,
         'right_lst' : r}
    return d   

In [27]:
test_df_sep = pd.DataFrame(list(test_grouped.apply(lambda row: separate_pers_lst(row), axis=1)))
test_df_sep['user_id'] = test_grouped['user_id']

In [28]:
%%time
top_20_names, top_20_ids = recommend(model5, test_df_sep, people_dict)

CPU times: user 11min 36s, sys: 43.9 ms, total: 11min 36s
Wall time: 2min 54s


In [29]:
y_true = test_df_sep['right_lst']
y_pred = top_20_ids

In [30]:
mapk_scores = []

for t, p in zip(y_true, y_pred):
    mapk_scores.append(MAPk(t,p))

In [31]:
np.mean(mapk_scores)

0.010763071704202754

In [32]:
rec = []

for t, p in zip(y_true, y_pred):
    rec.append(recall(t,p))

In [33]:
np.mean(rec)

0.023103183039375138

In [34]:
scores_dict = {'mapk': mapk_scores,
               'recall': rec}

In [35]:
a_file = open("item2vec_savings/scores_dict.pkl", "wb")
pickle.dump(scores_dict, a_file)
a_file.close()

### 6. List top-20 most similar artists for some artists

In [36]:
#pop singer
people_dict['29692']

'Ariana+Grande'

In [37]:
names, ids = get_similar_artists(model5, model5['29692'], people_dict)
names

[('Jessie+J,+Ariana+Grande+&+Nicki+Minaj', 0.8654477596282959),
 ('Becky+G', 0.8418684005737305),
 ('Today%27s+Hits', 0.8324069976806641),
 ('Rixton', 0.8288383483886719),
 ('Jessie+J.', 0.8148522973060608),
 ('Fifth+Harmony', 0.8115994334220886),
 ('Jessie+J,+Ariana+Grande,+Nicki+Minaj', 0.8080220222473145),
 ('Ke$ha+%22Die+Young%22+cover+by+Becky+G', 0.8075494766235352),
 ('Iggy+Azalea+ft.+Charli+XCX', 0.8039587736129761),
 ('G.R.L.', 0.8032480478286743),
 ('AMTAG_60', 0.8020125031471252),
 ('Meghan+Trainor', 0.7964286208152771),
 ('Lilly+Wood+feat.+Robin+Schulz', 0.7908823490142822),
 ('Britney+Spears+%2F+Jessie+J,+Ariana+Grande,+&+Nicki+Minaj',
  0.7888354659080505),
 ('Clean+Bandit+&+Jess+Glyne', 0.7877906560897827),
 ('AJR', 0.7855857014656067),
 ('David+Guetta+Ft.+Trey+Songz,+Chris+Brown+&+Sam+Martin', 0.7845808267593384),
 ('DJ+Snake+Machine', 0.780626654624939),
 ('The+Cherrybombs', 0.7796391844749451),
 ('Nick+Jonas+&+The+Administration', 0.7795069813728333)]

**************

In [38]:
#rock 
people_dict['157384']

'Freddie+Mercury'

In [39]:
names, ids = get_similar_artists(model5, model5['157384'], people_dict)
names

[('Queen+&+The+Muppets', 0.8283079266548157),
 ('Tommy+Fogerty+And+The+Blue+Vel', 0.7396383881568909),
 ('Lars+Van+Kampt', 0.7183437943458557),
 ('Michael+Jackson+%2F+Slash+(Guns+N%27+Roses)', 0.7151082754135132),
 ('Slavko+Avsenik+Ml.', 0.709837019443512),
 ('Mc+Laren,+Malcom', 0.7097668647766113),
 ('The+Cross', 0.7066447734832764),
 ('The+Eighty+Ballad+Group', 0.702889084815979),
 ('Winwood,+Steve+and+Spencer+Davis+Group', 0.7018457055091858),
 ('Peter+Green+featuring+Paul+Rogers', 0.6989554166793823),
 ('Nazareth', 0.6963227987289429),
 ('Matt,Jayson', 0.6960425972938538),
 ('Mylene+Farmer', 0.6958197355270386),
 ('010+Deep+Purple', 0.6891721487045288),
 ('%C3%A0%C3%A5%C3%B8+%C3%AB%C3%B9%C3%A3%C3%A9%C3%AD', 0.688783586025238),
 ('Thomas,+Timmy', 0.6878454685211182),
 ('Gary+Moore+feat.+Phil+Lynott', 0.6878280639648438),
 ('Van+Hallen', 0.6850995421409607),
 ('Ivo+Pessoa', 0.6844097971916199),
 ('%C3%A3%C3%A5%C3%A9%C3%A3+%C3%A3%C3%A5%C3%B8', 0.6842895746231079)]

*****************

In [40]:
#rap
people_dict['211094']

'Mac+Miller+feat.+Action+Bronson'

In [41]:
names, ids = get_similar_artists(model5, model5['211094'], people_dict)
names

[('Oskar+Koch', 0.7505506873130798),
 ('Mac+Miller%2FEarl+Sweatshirt%2FVinny+Radio', 0.7473846673965454),
 ('Godemis', 0.744239091873169),
 ('Mac+Dre-N-Andre+Nickatina', 0.7439322471618652),
 ('Mac+Miller%2FTyler,+The+Creator', 0.7397385835647583),
 ('Badnewz', 0.7277829647064209),
 ('Ab-Soul+ft+Kendrick+Lamar', 0.7273165583610535),
 ('The+weeknd,+Drake,+Trey+Songz,+Jhene+Aiko,+Schoolboy+Q,+Dafrican,+Chris+brown,+August+Alsina,+PartyNextDoor,+Rick+Ross,+Usher',
  0.7236860990524292),
 ('Kanye+West+Feat.+DJ+Premier', 0.7222352623939514),
 ('Mac+Miller;+Vinny+Radio', 0.7181921005249023),
 ('Pusha+T', 0.7145887613296509),
 ('Vic+Mensa.', 0.7141709327697754),
 ('The+Game+feat.+Tyler+The+Creator+&+Lil+Wayne', 0.7115122079849243),
 ('Laura+Zita', 0.7111826539039612),
 ('N.E.R.D+ft.+Kanye+West,+Lupe+Fiasco,+Pusha+T', 0.7087520360946655),
 ('Ty+Dolla+$ign+(feat.+Wiz+Khalifa)', 0.7079854011535645),
 ('Domo+Genesis+&+The+Alchemist', 0.7063467502593994),
 ('J+Cole%2F@Fauntleroy', 0.70581632852554