In [46]:
import pandas as pd
from tqdm import tqdm
import gensim
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity


In [2]:
class SongSentence(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        for line in tqdm(open(self.dirname)):
            line = list(map(lambda x: str(songid_to_name.get(x, None)), line.split()))
            line = [x for x in line if x is not None]
            yield line

In [2]:
data_member = pd.read_csv("./data/members.csv")
data_song_info = pd.read_csv("./data/song_extra_info.csv")
data_song = pd.read_csv("./data/songs.csv")
data_listen = pd.read_csv("./data/train.csv")

data_song = pd.merge(data_song, data_song_info, on="song_id", how="left")

In [50]:
data_song.shape

(2296320, 9)

In [47]:
set(data_listen.source_system_tab)

{nan,
 'discover',
 'search',
 'listen with',
 'notification',
 'explore',
 'radio',
 'my library',
 'settings'}

In [4]:
songid_to_name = dict(zip(data_song.song_id, data_song.name))

In [7]:
song2vec = gensim.models.Word2Vec.load("song2vec.model")

In [10]:
song_list = list(song2vec.wv.vocab.keys())

In [37]:
def get_songvec(song_id):
    name = songid_to_name.get(song_id, None)
    if name:
        if name in song_list:
            return song2vec.wv[name]
    return [np.nan]*100

In [6]:
date_columns = ['expiration_date', 'registration_init_time']
user_data = pd.read_csv('./data/members.csv', parse_dates=date_columns)
user_data.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,2011-08-20,2017-09-20
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,,7,2015-06-28,2017-06-22
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,,4,2016-04-11,2017-07-12
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,,9,2015-09-06,2015-09-07
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,,4,2017-01-26,2017-06-13


In [7]:
sentences = SongSentence("./files/song_corpus.txt") # a memory-friendly iterator

songid_to_name = dict(zip(data_song.song_id, data_song.name))

In [8]:
model = gensim.models.Word2Vec(sentences, workers=4)

652092it [00:17, 37434.81it/s]
652092it [00:22, 29034.28it/s]
652092it [00:20, 31310.44it/s]
652092it [00:22, 29327.13it/s]
652092it [00:22, 28683.14it/s]
652092it [00:23, 27454.75it/s]


You can run the following cell to see the result

In [9]:
input_name = input()
print("similar songs   |   similarity")
try:
    for song in model.wv.most_similar(input_name):
        print("{}   |   {}".format(song[0], song[1]))
except KeyError as e:
    print("the song is not in the song2vec vocabulary")

晴天
similar songs   |   similarity
算什麼男人   |   0.8898013234138489
聽見下雨的聲音   |   0.86936354637146
牽心萬苦   |   0.8036636114120483
手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋   |   0.7750793099403381
你他我   |   0.7747445106506348
我還是愛著你【三立華劇[幸福兌換券]片尾曲】   |   0.7680702209472656
這是最後一次 (This Is The Last Time)   |   0.7650311589241028
多遠都要在一起 (Long Distance)   |   0.7604715824127197
兜圈   |   0.7575459480285645
別說沒愛過 (Dont' Say)   |   0.7567638158798218


In [11]:
model.save("song2vec.model")

In [12]:
model2 = gensim.models.Word2Vec.load("song2vec.model")

#### It seems the song2vec result is OK. We can do some 2D visualization later. How about Artist2vec?

In [15]:
# create artist to song dictionary
artist_song_dict = dict()
for artist, song in tqdm(zip(data_song.artist_name.values, data_song.name.values)):
    artist_song_dict[artist] = artist_song_dict.get(artist, []) + [song]

2296320it [06:51, 5581.89it/s]


In [16]:
artist2vec = dict()
no_song_count = 0
for artist in art2vec_keys:
    songs = artist_song_dict[artist]
    song_vectors = list(map(lambda x: model.wv[x] if x in model.wv else None, songs))
    song_vectors = [x for x in song_vectors if x is not None]
    if len(song_vectors) == 0:
        no_song_count += 1
    else:
        artist2vec[artist] = sum(song_vectors) / len(song_vectors)

In [19]:
import pickle
with open("artist2vec.txt", "wb") as f:
    pickle.dump(artist2vec, f)

In [52]:
artist_set = list(artist2vec.keys())
print("finally we obtain vectors for {} artists".format(len(artist_set)))

finally we obtain vectors for 32114 artists


In [54]:
data_song[data_song.name == ""]

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc
95276,swwNe+zgflQtic46OQ3ytU1QvNf71XRZzvCHSwAqcto=,187141,465,牟茗,,,3.0,南山南,QMDA61534977
167421,/Sx6lLX2XhU/myW3/XtW6KLZmFdMZWNecA3cyJfpNVs=,324963,829|458,馬頔 (Di Ma),馬頔,馬頔,3.0,南山南,CNA231402073
1203597,TrA3A27qJFi7axOXvN+zypkjhACz7CBCL7qVzVvC4Qc=,167497,458,陳信喆,堯十三,馬頔,3.0,南山南,TWU711601656


In [17]:
cosine_similarity([artist2vec["馬頔 (Di Ma)"], artist2vec["宋冬野 (Dongye Song)"], artist2vec["Lady Gaga"]
                  , artist2vec["周杰倫 (Jay Chou)"], artist2vec["林俊傑 (JJ Lin)"]])

array([[ 1.        ,  0.36542746,  0.20905924, -0.02671132,  0.09035478],
       [ 0.36542746,  1.0000001 ,  0.13714375,  0.4680647 ,  0.41523182],
       [ 0.20905924,  0.13714375,  1.0000001 ,  0.24226278,  0.12728406],
       [-0.02671132,  0.4680647 ,  0.24226278,  0.9999999 ,  0.4938362 ],
       [ 0.09035478,  0.41523182,  0.12728406,  0.4938362 ,  1.        ]],
      dtype=float32)

In [64]:
cosine_similarity([artist2vec["馬頔 (Di Ma)"], artist2vec["周杰倫 (Jay Chou)"]])[0][1]

-0.02729522

In [68]:
cosine_similarity([artist2vec["馬頔 (Di Ma)"], artist2vec["Lady Gaga"]])[0][1]

0.22212684

In [70]:
cosine_similarity([artist2vec["林俊傑 (JJ Lin)"], artist2vec["周杰倫 (Jay Chou)"]])[0][1]

0.48841667