In [57]:
import pandas as pd
from tqdm import tqdm
import gensim

from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import matplotlib.style as style
style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
class SongSentence(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        for line in tqdm(open(self.dirname)):
            line = list(map(lambda x: str(songid_to_name.get(x, None)), line.split()))
            line = [x for x in line if x is not None]
            yield line

In [4]:
data_member = pd.read_csv("./data/members.csv")
data_song_info = pd.read_csv("./data/song_extra_info.csv")
data_song = pd.read_csv("./data/songs.csv")
data_listen = pd.read_csv("./data/train.csv")

data_song = pd.merge(data_song, data_song_info, on="song_id", how="left")

In [83]:
data_song

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0,焚情,TWB531410010
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0,PLAYING WITH FIRE,
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0,SORRY| SORRY,
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0,愛我的資格,TWC950206108
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0,Mary Had a Little Lamb,
5,kKJ2JNU5h8rphyW21ovC+RZU+yEHPM+3w85J37p7vEQ=,235520,864|857|850|843,貴族精選,Joe Hisaishi,Hayao Miyazaki,17.0,となりのトトロ,
6,N9vbanw7BSMoUgdfJlgX1aZPE1XZg8OS1wf88AQEcMc=,226220,458,伍佰 & China Blue,Jonathan Lee,,3.0,夢醒時分,TWH951100012
7,GsCpr618xfveHYJdo+E5SybrpR906tsjLMeKyrCNw8s=,276793,465,光良 (Michael Wong),光良,彭資閔,3.0,記得我愛你,TWA450582110
8,oTi7oINPX+rxoGp+3O6llSltQTl80jDqHoULfRoLcG4=,228623,465,林俊傑 (JJ Lin),JJ Lin,Wu Qing Feng,3.0,裂縫中的陽光 (Before Sunrise),TWA531398021
9,btcG03OHY3GNKWccPP0auvtSbhxog/kllIIOx5grE/k=,232629,352|1995,Kodaline,Stephen Garrigan| Mark Prendergast| Vincent Ma...,Stephen Garrigan| Mark Prendergast| Vincent Ma...,52.0,The One,GBARL1401580


In [39]:
sentences = SongSentence("./files/song_corpus.txt") # a memory-friendly iterator

songid_to_name = dict(zip(data_song.song_id, data_song.name))

In [40]:
model = gensim.models.Word2Vec(sentences, workers=4)

652092it [00:15, 41655.10it/s]
652092it [00:21, 30207.83it/s]
652092it [00:21, 30789.60it/s]
652092it [00:21, 30433.21it/s]
652092it [00:21, 30156.67it/s]
652092it [00:21, 30563.51it/s]


In [31]:
input_name = input()
print("similar songs   |   similarity")
try:
    for song in model.wv.most_similar(input_name):
        print("{}   |   {}".format(song[0], song[1]))
except KeyError as e:
    print("the song is not in the song2vec vocabulary")

晴天
similar songs   |   similarity
算什麼男人   |   0.8919458389282227
聽見下雨的聲音   |   0.8635944128036499
牽心萬苦   |   0.7978859543800354
你他我   |   0.7772645354270935
手心的薔薇 (Beautiful) feat. G.E.M.鄧紫棋   |   0.7686334848403931
我還是愛著你【三立華劇[幸福兌換券]片尾曲】   |   0.767096757888794
這是最後一次 (This Is The Last Time)   |   0.7652390003204346
別說沒愛過 (Dont' Say)   |   0.7619626522064209
兜圈   |   0.7534346580505371
多遠都要在一起 (Long Distance)   |   0.7526910305023193


In [78]:
model.wv.vocab.keys()



#### It seems the song2vec result is OK. We can do some 2D visualization later. How about Artist2vec?

In [41]:
# can remove this part actually
artist_count = data_song.artist_name.value_counts()
art2vec_keys = list(artist_count[artist_count.values >= 10].index)

In [37]:
# create artist to song dictionary
artist_song_dict = dict()
for artist, song in tqdm(zip(data_song.artist_name.values, data_song.name.values)):
    artist_song_dict[artist] = artist_song_dict.get(artist, []) + [song]

2296320it [07:00, 5455.70it/s]


In [38]:
artist_song_dict["梁靜茹 (Fish Leong)"]

['第三者',
 '最爛的理由',
 '天燈',
 '序',
 '101',
 '等我跌倒',
 '對不起我愛你',
 '向左轉向右轉',
 '愛你不是兩三天',
 '彩虹',
 '給還沒有遇見的你',
 '風笛手',
 '我還記得',
 '快樂一整天',
 '小愛情',
 '我是真的愛你',
 '分手快樂',
 '只想單純在一起',
 '如果有一天',
 '我們就到這',
 'Way back into Love',
 '分手快樂',
 '屬於',
 '如果有一天',
 '小手拉大手',
 '給從前的愛',
 '最後',
 '愛久見人心',
 '瘦瘦的',
 '幸福洋菓子店',
 'PK',
 '只能抱著你',
 '我是愛你的',
 '孤單北半球',
 '原來你也唱過我的歌',
 '親親',
 '下一秒鐘',
 '勇氣',
 '情歌',
 '分手快樂',
 '直覺',
 'Whoever Finds This，I Love You',
 '昨天',
 '敗犬女王',
 '愛計較',
 'C’est La Vie',
 '中間',
 '我喜歡',
 'Talking+勇氣',
 '會呼吸的痛',
 '只能抱著你',
 'Talking+無條件為你',
 '花錢找男人',
 '如果有一天+Talking',
 '別人的天長地久',
 '親親',
 '一家一',
 '三吋日光',
 '愛很簡單',
 '昨日情書',
 '如果冰箱會說話',
 '我和自己的約會',
 '情歌沒有告訴你',
 '迷路',
 'LA LA LA LA (La La La La)',
 '喜悅',
 'Sunrise',
 '一個人淋雨',
 "C'est la vie",
 '憨過頭',
 '我都知道',
 '這一天 我們都健康年輕',
 '崇拜',
 '為我好',
 '可惜不是你',
 '美麗人生',
 '可以的話',
 '聽不到',
 '南海姑娘+Talking',
 '要條件要感覺',
 '生命中不可承受的輕',
 '分手快樂',
 '至少愛',
 '沒有水的游泳池',
 '用力抱著',
 '愛情之所以為愛情',
 '為你而:P',
 '旅程',
 '滿滿的都是愛',
 '有你在',
 '茉莉花',
 '絲路',
 '很久以後',
 '小心眼',
 'Talking+為我好',
 '我決

In [51]:
artist2vec = dict()
no_song_count = 0
for artist in art2vec_keys:
    songs = artist_song_dict[artist]
    song_vectors = list(map(lambda x: model.wv[x] if x in model.wv else None, songs))
    song_vectors = [x for x in song_vectors if x is not None]
    if len(song_vectors) == 0:
        no_song_count += 1
    else:
        artist2vec[artist] = sum(song_vectors) / len(song_vectors)

In [52]:
artist_set = list(artist2vec.keys())
print("finally we obtain vectors for {} artists".format(len(artist_set)))

finally we obtain vectors for 32114 artists


In [53]:
artist_set

['Ty Segall',
 'Yangpa',
 'J. Ralph',
 'Gotthard',
 'Spooman',
 'Yuri Temirkanov',
 'The King Blues',
 'Yall',
 'Oleg Kagan',
 'The Ultimate Storyteller',
 'Bloodsimple',
 'JoJo',
 'Ra Ra Riot',
 'Hip Hop All-Stars| Instrumental Hip Hop Beats Crew| The Hip Hop Nation',
 'Rebel',
 'Dj Pablo',
 'Erik Berglund',
 'spa relaxation',
 'Nobuyuki Nakajima',
 'Al Hirt & Strings',
 '最幸福電視原聲帶_遇見幸福300天',
 '李克勤 (Hacken Lee)',
 'Chorok Table',
 "St. Paul's Cathedral Choir",
 'Disco Fever| Music Factory',
 'Mario Vinuela',
 'Guitarra Clásica Española| Spanish Classic Guitar| Spanish Guitar',
 'Camille Saint-Saens',
 'Eugene Yu',
 'Cookie Duster',
 'St Leonards',
 'John Lee Hooker | Muddy Waters |B.B King |Ray Charles',
 '阿雅 (Aya)',
 'Kids Praise Kids',
 'Piso 21',
 'Kobushi Factory (こぶしファクトリー)',
 '陳予新 (Cindy Chen)',
 'Jabberwocky',
 'Gabrielle',
 'Hope',
 'John McLaughlin',
 'Katie Melua',
 'Plug In Stereo',
 'Percy Faith & His Orchestra',
 'Relaxation',
 'Wax Tailor',
 'Recondite',
 '10|000 Maniacs'

In [54]:
data_song[data_song.name == ""]

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc
95276,swwNe+zgflQtic46OQ3ytU1QvNf71XRZzvCHSwAqcto=,187141,465,牟茗,,,3.0,南山南,QMDA61534977
167421,/Sx6lLX2XhU/myW3/XtW6KLZmFdMZWNecA3cyJfpNVs=,324963,829|458,馬頔 (Di Ma),馬頔,馬頔,3.0,南山南,CNA231402073
1203597,TrA3A27qJFi7axOXvN+zypkjhACz7CBCL7qVzVvC4Qc=,167497,458,陳信喆,堯十三,馬頔,3.0,南山南,TWU711601656


In [81]:
cosine_similarity([artist2vec["馬頔 (Di Ma)"], artist2vec["宋冬野 (Dongye Song)"], artist2vec["Lady Gaga"]
                  , artist2vec["周杰倫 (Jay Chou)"], artist2vec["林俊傑 (JJ Lin)"]])

array([[ 0.9999997 ,  0.388234  ,  0.22212684, -0.02729522,  0.05872068],
       [ 0.388234  ,  1.0000002 ,  0.12477301,  0.45808777,  0.40064442],
       [ 0.22212684,  0.12477301,  0.9999998 ,  0.25038624,  0.12246218],
       [-0.02729522,  0.45808777,  0.25038624,  0.99999994,  0.48841667],
       [ 0.05872068,  0.40064442,  0.12246218,  0.48841667,  1.0000002 ]],
      dtype=float32)

In [64]:
cosine_similarity([artist2vec["馬頔 (Di Ma)"], artist2vec["周杰倫 (Jay Chou)"]])[0][1]

-0.02729522

In [68]:
cosine_similarity([artist2vec["馬頔 (Di Ma)"], artist2vec["Lady Gaga"]])[0][1]

0.22212684

In [70]:
cosine_similarity([artist2vec["林俊傑 (JJ Lin)"], artist2vec["周杰倫 (Jay Chou)"]])[0][1]

0.48841667