In [1]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

In [2]:
with open('data/train.json',encoding='utf-8-sig') as f:
    train_dict = json.load(f)
    
with open('data/song_meta.json',encoding='utf-8-sig') as f:
    song_dict = json.load(f)
    
with open('data/genre_gn_all.json',encoding='utf-8-sig') as f:
    genre_dict = json.load(f)
    
train_df = pd.DataFrame.from_dict(train_dict)
song_df = pd.DataFrame.from_dict(song_dict)

In [3]:
train_df.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000


In [4]:
train_df['tags_cnt'] = train_df['tags'].map(lambda x : len(x))
train_df.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,tags_cnt
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,1
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,2
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,2
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000,10
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000,1


In [5]:
from itertools import chain
from collections import Counter
tags_all = chain.from_iterable(train_df['tags'].tolist())
tags_unique = list(set(tags_all))
tags_counter = dict(Counter(tags_all))

In [30]:
len(tags_unique)

29160

In [6]:
songs_all = chain.from_iterable(train_df['songs'].tolist())
songs_unique = sorted(list(set(songs_all)))
songs_counter = dict(Counter(songs_all))

In [7]:
songs_unique[:3]

[0, 3, 4]

In [8]:
song_to_id = dict(zip(songs_unique,range(len(songs_unique))))
id_to_song = dict(zip(range(len(songs_unique)),songs_unique))

In [9]:
def song_id_trans(x):
    return [song_to_id[v] for v in x]

train_df['new_song_id'] = train_df['songs'].map(song_id_trans)

In [10]:
train_df.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,tags_cnt,new_song_id
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,1,"[456704, 112732, 333158, 488440, 258853, 12127..."
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,2,"[375894, 587314, 431997, 104605, 338568, 21226..."
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,2,"[72132, 240434, 144495, 161861, 307991, 222934..."
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000,10,"[342495, 169897, 469393, 250235, 383169, 87161..."
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000,1,"[138494, 481095, 4399, 560460, 255877, 87448, ..."


In [11]:
songs_all = chain.from_iterable(train_df['new_song_id'].tolist())
songs_unique = sorted(list(set(songs_all)))
songs_counter = dict(Counter(songs_all))

In [12]:
#  노래id에 달린 태그들을 넣을 딕셔너리 초기화
song_id_to_tags_dict = {i:[] for i in range(len(songs_unique))}

In [13]:
# 태그_id에 달린 장르id들을 list로 넣어주기
for new_song_id , tags in zip(train_df['new_song_id'].tolist(),train_df['tags'].tolist()):
    for id in new_song_id:
        song_id_to_tags_dict[id].extend(tags)
        
song_id_to_tags_dict

{0: ['비오는날', '드라이브', '비오는날', '추억', '회상'],
 1: ['1',
  '감성',
  '듣기좋은',
  '드라이브',
  'Pop',
  '여행',
  '기분전환',
  '팝',
  '신나는',
  '클럽',
  '매장음악',
  '드라이브',
  'EDM모음',
  '페스티벌',
  '기분전환',
  '스트레스',
  '신나는',
  'dance',
  'house',
  '여름',
  'tropical',
  '일렉',
  '편집숍',
  '힙스터',
  '기분전환',
  '운동',
  '축제',
  '클럽',
  '운동',
  'Festival',
  'EDM',
  '불금',
  '페스티벌',
  '파티',
  '즐거운',
  '신나는'],
 2: ['뉴에이지', '뉴에이지'],
 3: ['비오는날',
  '재즈',
  '락',
  '발라드',
  '휴식',
  '리메이크',
  '재즈',
  '힐링송',
  '힐링',
  '기분전환',
  '기분_전환',
  '음악'],
 4: ['클래식', '산책', '조깅'],
 5: ['클래식', '코로나'],
 6: ['위로', '발라드', '감성', '발라드'],
 7: ['재즈힙합',
  '인디',
  '감성',
  '유럽여행',
  '유럽',
  '여행',
  'Chill',
  '감성',
  '밤',
  '편안한',
  'JAYJE',
  '아날로그',
  'Lofi',
  '로파이',
  '공부할때',
  '배경음악',
  'Chill',
  '멜로우비트',
  '재즈',
  '멜로우힙합',
  'Lofihiphop',
  '재즈힙합',
  '빈티지',
  '빈티지감성',
  '로파이',
  'Lofi',
  '새벽감성',
  'Lofihiphop',
  '로파이힙합',
  '로파이',
  'Summer',
  '여름',
  'Pop',
  '힙합',
  '마이너감성',
  '재즈',
  'Lofi',
  '재즈힙합',
  '로파이',
  '휴식',
  '알앤비',
  '그루브

In [14]:
tags_result = list(song_id_to_tags_dict.values())
tags_result[0:3]

[['비오는날', '드라이브', '비오는날', '추억', '회상'],
 ['1',
  '감성',
  '듣기좋은',
  '드라이브',
  'Pop',
  '여행',
  '기분전환',
  '팝',
  '신나는',
  '클럽',
  '매장음악',
  '드라이브',
  'EDM모음',
  '페스티벌',
  '기분전환',
  '스트레스',
  '신나는',
  'dance',
  'house',
  '여름',
  'tropical',
  '일렉',
  '편집숍',
  '힙스터',
  '기분전환',
  '운동',
  '축제',
  '클럽',
  '운동',
  'Festival',
  'EDM',
  '불금',
  '페스티벌',
  '파티',
  '즐거운',
  '신나는'],
 ['뉴에이지', '뉴에이지']]

In [15]:
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()

In [16]:
w2v_model = Word2Vec(min_count=10,
                     window=3,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [17]:
w2v_model.build_vocab(tags_result, progress_per=10000)

In [18]:
from time import time

t = time()
w2v_model.train(tags_result, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 5.91 mins


In [19]:
w2v_model.init_sims(replace=True)

  w2v_model.init_sims(replace=True)


In [24]:
w2v_model.wv.most_similar(positive=["락"],topn=5)

[('록', 0.5531038045883179),
 ('밴드', 0.5198248624801636),
 ('팝', 0.48328331112861633),
 ('Rock', 0.4813997745513916),
 ('사실락만있는거아님', 0.45989060401916504)]

In [23]:
w2v_model.wv.most_similar(positive=["기분좋은"],topn=5)

[('행복', 0.5596479773521423),
 ('데이트', 0.5584471821784973),
 ('봄', 0.5303997993469238),
 ('달달', 0.5176151990890503),
 ('달달한', 0.5170160531997681)]

In [25]:
w2v_model.wv.most_similar(positive=["느긋한"],topn=5)

[('심심한', 0.5409355759620667),
 ('한가한', 0.5254039764404297),
 ('느긋느긋', 0.4906960725784302),
 ('나긋나긋', 0.4282887578010559),
 ('잔잔한그루브', 0.41279932856559753)]

In [36]:
len(w2v_model.wv)

28733

In [40]:
word_vectors = w2v_model.wv

In [53]:
word_vectors.key_to_index['락']

36

In [45]:
word_vectors.vectors[0:3]

array([[ 3.81622277e-02, -1.14798866e-01,  1.28130704e-01,
        -4.91289385e-02, -5.75583288e-03,  7.88114369e-02,
         2.82828938e-02,  4.22432609e-02,  5.75586110e-02,
         5.10748960e-02,  1.03406971e-02,  8.57046098e-02,
        -7.03828931e-02,  1.02196865e-01,  2.15622596e-02,
         4.25093248e-02, -7.51991346e-02,  1.76750403e-02,
         3.00971642e-02,  3.45339626e-02, -2.65563056e-02,
        -7.60709401e-03, -4.86635230e-02,  2.51385476e-02,
        -6.43797442e-02,  3.97076607e-02, -6.45420030e-02,
        -3.96940410e-02,  4.20970470e-02,  6.62545580e-03,
        -9.16856341e-03, -5.01071662e-02, -2.33782381e-02,
        -6.81793094e-02, -4.57443967e-02, -7.34456405e-02,
         1.40392510e-02,  4.49817963e-02, -7.61924917e-03,
        -6.92717209e-02, -3.90361473e-02, -2.10036244e-02,
         6.92797080e-02, -5.06780185e-02, -4.12618928e-03,
        -2.88148895e-02, -1.01817027e-01, -6.05974570e-02,
        -3.84988040e-02,  7.30151427e-04, -3.50716487e-0

In [46]:
from sklearn import cluster
from sklearn import metrics

kmeans = cluster.KMeans(n_clusters=100)
kmeans.fit(word_vectors.vectors)

KMeans(n_clusters=100)

In [49]:
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for inputted data")
print (labels)
print ("Centroids data")
print (centroids)
 
print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(word_vectors.vectors))
 
silhouette_score = metrics.silhouette_score(word_vectors.vectors, labels, metric='euclidean')
 
print ("Silhouette_score: ")
print (silhouette_score)

Cluster id labels for inputted data
[86 86 86 ... 80 80 38]
Centroids data
[[-1.4229143e-02  5.4488074e-02 -2.9263087e-02 ... -4.0387928e-02
  -6.1845523e-03  2.7498739e-02]
 [-3.6121428e-02  5.3280719e-02  1.0692533e-03 ... -2.9216185e-02
  -9.8252585e-03 -5.8435071e-02]
 [-1.3149026e-03  7.2861195e-02 -2.5657862e-02 ... -8.4208325e-05
  -2.5520120e-02  2.3659438e-02]
 ...
 [-5.2355332e-03  1.4409461e-02 -3.6562506e-02 ... -1.5763134e-02
  -1.5947366e-02  4.6983985e-03]
 [-1.1164323e-02  3.6268219e-02 -7.9611905e-02 ...  3.7073713e-02
  -3.6960125e-02  1.3007020e-02]
 [ 1.1379392e-02  7.3280573e-02 -1.9985575e-02 ... -2.4762683e-02
  -7.8482740e-04 -4.8240643e-02]]
Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):
-22674.505859375
Silhouette_score: 
0.004092729


In [52]:
word_vectors.get_vecattr()

TypeError: get_vecattr() missing 1 required positional argument: 'attr'

In [None]:
kmeans.predict()