In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy import sparse
import gensim
import tqdm

### reformat data

In [None]:
%%time
class DatasetToW2V(object):
    def __init__(self, file_path):
        with open(file_path, 'rb') as f:
            dataset = pickle.load(f)
            self.dataset = sparse.csr_matrix(dataset.to_coo())
            self.names = dataset.columns

    def __iter__(self):
        for user_items in tqdm.tqdm(self.dataset):
            indxs = user_items.nonzero()[1]
            yield list(self.names[indxs])


music_collections = DatasetToW2V('../data/final.pkl')

### fit model

In [None]:
%%time
estimator = gensim.models.Word2Vec(music_collections,
                                   min_count=30,  # number of users who read book
                                   workers=4,
                                  )

In [None]:
with open('../data/model_w2v_names.pkl', 'wb') as f:
    pickle.dump(estimator, f)

### test

In [2]:
estimator = gensim.models.Word2Vec.load('../data/model.w2v')

In [25]:
with open('../data/model_w2v.pkl', 'wb') as f:
    pickle.dump(estimator, f)

In [26]:
ls -lht ../data/

total 7784968
-rw-r--r--   1 alex  staff    32M Oct 20 18:39 model_w2v.pkl
-rw-r--r--   1 alex  staff    22M Oct 20 13:33 model_w2v_names.pkl
-rw-r--r--@  1 alex  staff    31M Oct 20 13:22 model.w2v
drwxr-xr-x@  5 alex  staff   160B Jan 30  2018 [1m[36mlastfm-dataset-360K[m[m/
-rw-r--r--   1 alex  staff   260B Jan 26  2018 dataset_sources.txt
-rw-r--r--   1 alex  staff   243M Jan 21  2018 model.pkl
drwxr-xr-x  99 alex  staff   3.1K Jan 21  2018 [1m[36mtmp[m[m/
-rw-r--r--   1 alex  staff   516M Jan 21  2018 final.pkl
-rw-r--r--@  1 alex  staff   2.8G Dec 19  2011 train_triplets.txt
-rw-r--r--@  1 alex  staff    80M Jan 25  2011 unique_tracks.txt


In [16]:
r = estimator.predict_output_word(['Flipsyde', 'Disturbed', 'Eminem'])

In [23]:
[a for a, _ in r]

['Eminem / Dina Rae',
 'Eminem / Hailie Jade',
 'Eminem / Dr. Dre / 50 Cent',
 'Eminem / Nate Dogg',
 'Eminem / Bizarre',
 'Eminem / DMX / Obie Trice',
 'Eminem / Dr. Dre',
 'Dr. Dre / Eminem',
 'Eminem / Royce Da 5-9',
 'Evanescence']

In [21]:
np.array(r)[:2, 0]

array(['Eminem / Dina Rae', 'Eminem / Hailie Jade'], dtype='<U26')

In [9]:
for i in estimator.vocabulary:
    print(i)

TypeError: 'Word2VecVocab' object is not iterable