In [None]:
import pickle
import numpy as np
import pandas as pd
from scipy import sparse
import gensim
import tqdm

### Reformat data: one list of item names per user

In [None]:
%%time
class DatasetToW2V(object):
    """Re-iterable corpus built from a pickled user-by-item table.

    Each row of the table becomes one "sentence": the list of column labels
    whose entries in that row are non-zero. ``__iter__`` starts a fresh pass
    every time it is called, so the object can be iterated more than once.
    """

    def __init__(self, file_path):
        """Load the pickled table and keep it as a CSR matrix plus column labels.

        Parameters
        ----------
        file_path : str
            Path to a pickle file holding an object that exposes ``columns``
            and either ``to_coo()`` or ``sparse.to_coo()``.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file; only point this at files you created yourself.
        with open(file_path, 'rb') as f:
            dataset = pickle.load(f)

        # The file is closed here: the conversion below does not need it.
        # Prefer the object's own to_coo(); fall back to the pandas `.sparse`
        # accessor for frames that only expose the conversion there.
        to_coo = getattr(dataset, 'to_coo', None)
        if to_coo is None:
            to_coo = dataset.sparse.to_coo
        self.dataset = sparse.csr_matrix(to_coo())
        self.names = dataset.columns

    def __iter__(self):
        """Yield, for every row, the list of column labels with non-zero entries."""
        # Sparse matrices raise TypeError on len(), so tqdm cannot infer the
        # row count by itself; pass it explicitly to get a percentage and ETA.
        n_rows = self.dataset.shape[0]
        for user_items in tqdm.tqdm(self.dataset, total=n_rows):
            indxs = user_items.nonzero()[1]  # column positions of non-zero cells
            yield list(self.names[indxs])


# Build the re-iterable corpus from the pickled dataset. The path is relative,
# so it is resolved against the kernel's current working directory.
music_collections = DatasetToW2V('../data/final.pkl')

### Fit the Word2Vec model

In [None]:
%%time
# Train Word2Vec on the corpus, where each yielded list is one sentence.
# NOTE(review): `iter=` is the gensim 3.x keyword (gensim 4 calls it `epochs`);
# it is kept because the rest of the notebook uses the 3.x `wv.vocab` API.
estimator = gensim.models.Word2Vec(
    music_collections,
    window=10,
    min_count=30,  # ignore names that occur fewer than 30 times in the corpus
    sg=1,          # 1 selects the skip-gram training algorithm
    workers=4,
    iter=30,       # number of training passes over the corpus
)

In [None]:
# Persist the fitted estimator to disk with pickle.
with open('../data/model_w2v.pkl', 'wb') as model_file:
    pickle.dump(estimator, model_file)

### Save band popularity (vocabulary counts)

In [None]:
# Popularity of a band = the count stored for it in the model's vocabulary.
singers = estimator.wv.vocab.keys()
popularity = {name: estimator.wv.vocab[name].count for name in singers}

# Store the name -> count mapping next to the other data artifacts.
with open('../data/popularity.pkl', 'wb') as popularity_file:
    pickle.dump(popularity, popularity_file)