In [2]:
import numpy as np
import pandas as pd
import json
from sklearn.utils import shuffle

In [32]:
import pickle

In [3]:
from gensim.models import Word2Vec

In [38]:
from utils.item2vec_recommender_utils import hit_rate_evaluate, recommend, get_similar_artists
from utils.metrics import MAPk, precision, recall

In [42]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation warnings
# (e.g. from gensim/pandas) — confirm that is intended.
warnings.filterwarnings("ignore")

### 1. Load prepared data

In [5]:
# Load the pre-built pickled frames from the local data/ directory:
# the grouped train/test sets, the left-out interactions, and the grouped
# training set used for the left-out (hit rate) evaluation.
# NOTE(review): pickle files can execute code on load; only read trusted files.
train_grouped = pd.read_pickle('data/train_grouped.pickle')
test_grouped = pd.read_pickle('data/test_grouped.pickle')
left_out_df = pd.read_pickle('data/left_out_df.pickle')
train_lo = pd.read_pickle('data/train_lo_grouped.pickle')

In [6]:
# Preview the first rows: one row per user_id with its list of artist ids (persons_lst).
train_grouped.head()

Unnamed: 0,user_id,persons_lst
0,1,"[383681, 42218, 307555, 211023, 104136, 104136..."
1,3,"[4052, 286091, 120527, 176110, 51868, 35695, 2..."
2,4,"[191173, 283933, 336778, 345237, 147566, 48766..."
3,5,"[19627, 19627, 19627, 19627, 19627, 306470, 24..."
4,7,"[238498, 295341, 271682, 172564, 390585, 14952..."


In [7]:
# Preview the left-out interactions (columns: user_id, person_id, ts).
left_out_df.head()

Unnamed: 0,user_id,person_id,ts
2208857,25817,20735,1392140023
13120825,39916,382213,1411980800
19207096,19575,309884,1396261680
764280,4501,272420,1409545293
27871472,1463,385249,1418905945


In [8]:
# Plain Python list of the per-row artist-id lists; this is the corpus
# passed to Word2Vec below.
train_lst = train_grouped['persons_lst'].tolist()

### 2. Prepare artists names and left-out set dictionaries

In [9]:
# Load the artist lookup table (person_id -> person_name) from a local pickle.
people_df = pd.read_pickle('data/new_persons_df.pickle')

In [10]:
# Display the lookup table (pandas truncates the rendering to head/tail rows).
# NOTE(review): the other preview cells use .head(); consider the same here.
people_df

Unnamed: 0,person_id,person_name
0,145148,Everything+Is+Illuminated
1,297899,Robin+O%27Brien
2,250429,Nicholas+Gunn++(2012)
3,32765,Aspasia+Stratigou
4,18689,Allison+Veltz
...,...,...
560922,544215,Sanaa+Kariakoo
560923,298403,Rock-a-teens
560924,450896,Jennifer+Lopez+Ft.+DJ+Mustard
560925,53831,Bobby+Sanabria+Conducting+The+Manhattan+School...


In [11]:
# Cast ids to strings so they can be looked up with string keys such as
# people_dict['29692'] further down.
people_df['person_id'] = people_df['person_id'].astype(str)

In [12]:
# Number of distinct person_id values (missing values, if any, are counted too).
people_df['person_id'].nunique(dropna=False)

560927

In [13]:
# Mapping person_id -> person_name; for a repeated id the last row wins.
people_dict = dict(zip(people_df['person_id'], people_df['person_name']))

In [14]:
# Sanity check: look up one id (string key) in the name mapping.
people_dict['29692']

'Ariana+Grande'

*****************

In [15]:
# Mapping user_id -> left-out person_id; for a repeated user_id the last row wins.
left_out_dict = dict(zip(left_out_df['user_id'], left_out_df['person_id']))

### 3. Train word2vec embeddings with different parameters

Since training word2vec models and evaluating their hit rate take considerable time, I tuned the hyperparameters manually: I trained several models with different parameters and chose the best one. With more computational power and time, I would automate this process (for example, with a grid search or a custom search algorithm).

In [19]:
%%time
# Model 1: 50-dimensional vectors, context window 10, 5 epochs.
# sg=1 / hs=0 are gensim's switches for skip-gram with negative sampling
# (see the gensim Word2Vec parameter docs).
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 calls it `vector_size`.
model1 = Word2Vec(
    size=50,
    window=10,
    sg=1,
    hs=0,
    min_count=1,
)
model1.build_vocab(train_lst, progress_per=200)
model1.train(
    train_lst,
    total_examples=model1.corpus_count,
    epochs=5,
    report_delay=1,
)
model1.save('item2vec_savings/item2vec_s50_w10_5ep.sav')

CPU times: user 45min 43s, sys: 1.79 s, total: 45min 45s
Wall time: 15min 26s


In [20]:
%%time
# Model 2: 50-dimensional vectors, context window 30, 5 epochs
# (same settings as model1 except for the wider window).
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 calls it `vector_size`.
model2 = Word2Vec(
    size=50,
    window=30,
    sg=1,
    hs=0,
    min_count=1,
)
model2.build_vocab(train_lst, progress_per=200)
model2.train(
    train_lst,
    total_examples=model2.corpus_count,
    epochs=5,
    report_delay=1,
)
model2.save('item2vec_savings/item2vec_s50_w30_5ep.sav')

CPU times: user 1h 49min 56s, sys: 1.84 s, total: 1h 49min 58s
Wall time: 36min 50s


In [21]:
%%time
# Model 3: 100-dimensional vectors, context window 10, 5 epochs
# (same settings as model1 except for the larger vector size).
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 calls it `vector_size`.
model3 = Word2Vec(
    size=100,
    window=10,
    sg=1,
    hs=0,
    min_count=1,
)
model3.build_vocab(train_lst, progress_per=200)
model3.train(
    train_lst,
    total_examples=model3.corpus_count,
    epochs=5,
    report_delay=1,
)
model3.save('item2vec_savings/item2vec_s100_w10_5ep.sav')

CPU times: user 50min 11s, sys: 1.41 s, total: 50min 13s
Wall time: 16min 53s


In [50]:
%%time
# Model 4: 50-dimensional vectors, context window 5, 5 epochs
# (same settings as model1 except for the narrower window).
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 calls it `vector_size`.
model4 = Word2Vec(
    size=50,
    window=5,
    sg=1,
    hs=0,
    min_count=1,
)
model4.build_vocab(train_lst, progress_per=200)
model4.train(
    train_lst,
    total_examples=model4.corpus_count,
    epochs=5,
    report_delay=1,
)
model4.save('item2vec_savings/item2vec_s50_w5_5ep.sav')

CPU times: user 26min 37s, sys: 1.7 s, total: 26min 38s
Wall time: 9min 3s


In [16]:
#model.init_sims(replace=True)

In [54]:
# Reload the four saved models from disk so the evaluation cells below can
# run without repeating the training cells.
model1, model2, model3, model4 = (
    Word2Vec.load('item2vec_savings/item2vec_s50_w10_5ep.sav'),
    Word2Vec.load('item2vec_savings/item2vec_s50_w30_5ep.sav'),
    Word2Vec.load('item2vec_savings/item2vec_s100_w10_5ep.sav'),
    Word2Vec.load('item2vec_savings/item2vec_s50_w5_5ep.sav'),
)

In [56]:
# One summary line per loaded model.
print(model1, model2, model3, model4, sep='\n')

Word2Vec(vocab=515265, size=50, alpha=0.025)
Word2Vec(vocab=515265, size=50, alpha=0.025)
Word2Vec(vocab=515265, size=100, alpha=0.025)
Word2Vec(vocab=515265, size=50, alpha=0.025)


### 4. Hit rate evaluation

Calculate the hit rate on the leave-one-out data to decide which model is better.

In [59]:
%%time
# Hit rate of model1, computed by the project helper from the left-out
# training frame, the user -> left-out artist mapping and the id -> name mapping.
hit_rate1 = hit_rate_evaluate(model1, train_lo, left_out_dict, people_dict)

Recommendations calculated
Starting hit rate calculation
CPU times: user 47min 1s, sys: 1.7 s, total: 47min 2s
Wall time: 12min 17s


In [60]:
%%time
# Hit rate of model2, computed with the same helper and inputs as for model1.
hit_rate2 = hit_rate_evaluate(model2, train_lo, left_out_dict, people_dict)

Recommendations calculated
Starting hit rate calculation
CPU times: user 38min 38s, sys: 572 ms, total: 38min 38s
Wall time: 9min 46s


In [61]:
%%time
# Hit rate of model3, computed with the same helper and inputs as for model1.
hit_rate3 = hit_rate_evaluate(model3, train_lo, left_out_dict, people_dict)

Recommendations calculated
Starting hit rate calculation
CPU times: user 53min 46s, sys: 823 ms, total: 53min 47s
Wall time: 13min 27s


In [57]:
%%time
# Hit rate of model4, computed with the same helper and inputs as for model1.
hit_rate4 = hit_rate_evaluate(model4, train_lo, left_out_dict, people_dict) 

Recommendations calculated
Starting hit rate calculation
CPU times: user 39min 27s, sys: 776 ms, total: 39min 28s
Wall time: 10min 3s


In [62]:
# Hit rates of models 1-4, one per line, for side-by-side comparison.
print(hit_rate1, hit_rate2, hit_rate3, hit_rate4, sep='\n')

0.0034893381334810303
0.005234007200221546
0.0022431459429520908
0.004707837164220438


model2 (size=50, window=30) gave the best hit rate (≈0.0052).

In [63]:
# Show the summary of the model with the best hit rate.
print(model2)

Word2Vec(vocab=515265, size=50, alpha=0.025)


### 5. Make recommendations for test set and calculate MAPk, precision and recall

In [64]:
%%time
# Recommendations for the test set from model2 via the project helper;
# it returns the recommended names and the recommended ids.
top_20_names, top_20_ids = recommend(model2, test_grouped, people_dict)

CPU times: user 12min 12s, sys: 1.66 s, total: 12min 14s
Wall time: 3min 21s


In [65]:
# Ground truth (artist-id lists per test row) and model2's recommended ids.
y_true, y_pred = test_grouped['persons_lst'], top_20_ids

In [66]:
# One MAPk value per (true list, predicted list) pair.
mapk_scores = [MAPk(actual, predicted) for actual, predicted in zip(y_true, y_pred)]

In [67]:
# Average MAPk over all test rows (model2).
np.mean(mapk_scores)

0.03134228564094112

In [68]:
# One precision value per (true list, predicted list) pair.
prec = [precision(actual, predicted) for actual, predicted in zip(y_true, y_pred)]

In [69]:
# Average precision value over all test rows (model2).
np.mean(prec)

0.018057553956834532

In [70]:
# One recall value per (true list, predicted list) pair.
rec = [recall(actual, predicted) for actual, predicted in zip(y_true, y_pred)]

In [71]:
# Average recall value over all test rows (model2).
np.mean(rec)

0.053591719382843475

Repeat the same evaluation for the model with the second-best hit rate (model4).

In [74]:
%%time
# Recommendations for the test set from model4 via the same project helper;
# it returns the recommended names and the recommended ids.
top_20_names2, top_20_ids2 = recommend(model4, test_grouped, people_dict)

CPU times: user 12min 15s, sys: 523 ms, total: 12min 16s
Wall time: 3min 14s


In [84]:
# Ground truth (artist-id lists per test row) and model4's recommended ids.
y_true2, y_pred2 = test_grouped['persons_lst'], top_20_ids2

In [86]:
# One MAPk value per (true list, predicted list) pair for model4.
mapk_scores2 = [MAPk(actual, predicted) for actual, predicted in zip(y_true2, y_pred2)]

In [87]:
# Average MAPk over all test rows (model4).
np.mean(mapk_scores2)

0.043636070223545265

In [88]:
# One precision value per (true list, predicted list) pair for model4.
prec2 = [precision(actual, predicted) for actual, predicted in zip(y_true2, y_pred2)]

In [89]:
# Average precision value over all test rows (model4).
np.mean(prec2)

0.032302158273381294

In [90]:
# One recall value per (true list, predicted list) pair for model4.
rec2 = [recall(actual, predicted) for actual, predicted in zip(y_true2, y_pred2)]

In [91]:
# Average recall value over all test rows (model4).
np.mean(rec2)

0.07597972239961429

MAPk, precision and recall are all better for model4, the second-best model by hit rate, so I decided to use the model4 embeddings.

In [92]:
# Bundle model4's per-row metric lists under their metric names.
scores_dict = dict(
    mapk=mapk_scores2,
    precision=prec2,
    recall=rec2,
)

In [93]:
# Persist model4's per-row metric lists. The context manager closes the file
# even if pickling raises, which the manual open()/close() pair did not.
with open("item2vec_savings/scores_dict.pkl", "wb") as scores_file:
    pickle.dump(scores_dict, scores_file)

### 6. List top-20 most similar artists for some artists

In [94]:
# Pop example: display name of the artist with id '29692'.
people_dict['29692']

'Ariana+Grande'

In [95]:
# Most similar artists to the embedding of id '29692' according to model4.
# The vector is read through `model4.wv[...]`: indexing the Word2Vec model
# directly is deprecated in gensim 3.x and removed in gensim 4.
names, ids = get_similar_artists(model4, model4.wv['29692'], people_dict)
names

[('Becky+G', 0.9078719019889832),
 ('Meghan+Trainor', 0.9060198664665222),
 ('Jessie+J.', 0.9058918952941895),
 ('Fifth+Harmony', 0.9012002944946289),
 ('%C2%8Bk%C2%8Cc%C2%91%C2%BE', 0.8878846168518066),
 ('Iggy+Azalea+ft.+Charli+XCX', 0.8873310685157776),
 ('Sigma.', 0.8866012692451477),
 ('G.R.L.', 0.8837899565696716),
 ('Iggy+Azalea+(Featuing+M%C3%98)', 0.8813971281051636),
 ('Magic!+&+Zedd', 0.8811827898025513),
 ('Clean+Bandit+f%2FJess+Glynne', 0.8790188431739807),
 ('Born+This+Way', 0.8757053017616272),
 ('Jessie+J,+Ariana+Grande+&+Nicki+Minaj', 0.8739939332008362),
 ('Jessie+J+ft.+Ariana+Grande+&+Nick+Minaj', 0.873116135597229),
 ('Mr.+Probz,+Robin+Schulz', 0.871794581413269),
 ('Matisyahu,+Richello', 0.8713920712471008),
 ('Beyonc%C3%A9+ft.+Chimamanda+Ngozi+Adiche', 0.8678398728370667),
 ('Olly+Murs+feat.+Aliz%C3%A9e', 0.8677894473075867),
 ('Nicki+Minaj+vs+Gretchen+feat.+Garotas+da+Laje', 0.8674243688583374),
 ('Jessie+J+feat.+Ariana+Grande+&+Nicki+Minaj', 0.8665834069252014)]

**************

In [130]:
# Rock example: display name of the artist with id '157384'.
people_dict['157384']

'Freddie+Mercury'

In [131]:
# Most similar artists to the embedding of id '157384' according to model4.
# The vector is read through `model4.wv[...]`: indexing the Word2Vec model
# directly is deprecated in gensim 3.x and removed in gensim 4.
names, ids = get_similar_artists(model4, model4.wv['157384'], people_dict)
names

[('Queen+&+The+Muppets', 0.8720742464065552),
 ('George+Michael+and+Queen', 0.8444976806640625),
 ('The+Cross', 0.8327374458312988),
 ('A+Caverna', 0.8240128755569458),
 ('Queen+%252B+Wyclef+Jean', 0.8156170845031738),
 ('Aerosmith', 0.8089699745178223),
 ('Kimnowak', 0.8084118366241455),
 ('Queen+%252B+Elton+John', 0.803917407989502),
 ('Vladimir+Vysotsky,+Melodia+&+G.+Garanyan', 0.799826979637146),
 ('Uriah+Heep', 0.7972726225852966),
 ('Amanda+Plummer', 0.7937545776367188),
 ('Tia+Carrere', 0.7937341332435608),
 ('Kennedy+Center+Honors+2012', 0.7930720448493958),
 ('Freddie+Mercury+%252B+Monserrat+Caball%C3%A9', 0.7897298336029053),
 ('Duran+Duran', 0.7892658710479736),
 ('Lenny+Kravitzz', 0.7886962294578552),
 ('Whitesnake', 0.7855861186981201),
 ('Jim+Horn', 0.7854337692260742),
 ('6.+Jay+Timberly', 0.7841383218765259),
 ('Def+Leopard', 0.783647894859314)]

*****************

In [141]:
# Rap example: display name of the artist with id '211094'.
people_dict['211094']

'Mac+Miller+feat.+Action+Bronson'

In [142]:
# Most similar artists to the embedding of id '211094' according to model4.
# The vector is read through `model4.wv[...]`: indexing the Word2Vec model
# directly is deprecated in gensim 3.x and removed in gensim 4.
names, ids = get_similar_artists(model4, model4.wv['211094'], people_dict)
names

[('Mac+Miller%2FEarl+Sweatshirt%2FVinny+Radio', 0.9166097640991211),
 ('A$AP+Ferg+featuring+A$AP+Rocky,+French+Montana,+Trinidad+James+&+Schoolboy+Q',
  0.886242151260376),
 ('Black+Hippy', 0.8783423900604248),
 ('3-6+Mafia%7CTrillville%7CLord+Infamous%7CProject+Pat', 0.8768203258514404),
 ('MellowHype', 0.875748336315155),
 ('Ace+Hood%2FFuture%2FRick+Ross', 0.8739904761314392),
 ('Kid+Cudi+vs+Crookers', 0.8737793564796448),
 ('Kid+Cudi%2FKing+Chip%2FA$AP+Rocky', 0.8727040886878967),
 ('A$AP+Ferg+feat.+Shabba+Ranks,+Busta+Rhymes+&+Migos', 0.8694416284561157),
 ('MellowHigh+feat.+Earl+Sweatshirt+and+Remy+Banks', 0.8684039115905762),
 ('Pusha+T', 0.8651683926582336),
 ('Oskar+Koch', 0.8645967245101929),
 ('Earl+Sweatshirt,+Matthew+Tavares,+Alex+Sowinski+&+Chester+Handsen',
  0.8623257875442505),
 ('Schoolboy+Q.', 0.8622783422470093),
 ('Franchise', 0.8619042634963989),
 ('Taj-He-Spitz', 0.8605881929397583),
 ('Kendrick+Lamar%2FBJ', 0.8603178858757019),
 ('Asher+Roth', 0.8582335710525513)