In [25]:
import pandas as pd
import numpy as np
import scipy
import gensim
import time
from gensim.models import word2vec
from konlpy.tag import Kkma, Twitter
# Shared tagger instance; the tokenisation code further down calls t.pos(...) on each lyric.
t = Twitter()

In [77]:
# Show the part-of-speech tag names this tagger exposes, each mapped to its Korean description.
t.tagset

{'Adjective': '형용사',
 'Adverb': '부사',
 'Alpha': '알파벳',
 'Conjunction': '접속사',
 'Determiner': '관형사',
 'Eomi': '어미',
 'Exclamation': '감탄사',
 'Foreign': '외국어, 한자 및 기타기호',
 'Hashtag': '트위터 해쉬태그',
 'Josa': '조사',
 'KoreanParticle': '(ex: ㅋㅋ)',
 'Noun': '명사',
 'Number': '숫자',
 'PreEomi': '선어말어미',
 'Punctuation': '구두점',
 'ScreenName': '트위터 아이디',
 'Suffix': '접미사',
 'Unknown': '미등록어',
 'Verb': '동사'}

In [11]:
# Load the song dataset; the cells below read its 'lyric' column.
df = pd.read_csv('df.csv')

## Building Word2Vec with Korean Lyric Data

In [78]:
# Turn every lyric into one space-separated line of content words
# (nouns, verbs, adjectives, adverbs) and report elapsed time every 1000 songs.
results = []
start_time = time.time()
i = 0

for i, lyric in enumerate(df['lyric'], start=1):
    # (surface, tag) pairs, normalised and stemmed by the tagger.
    lyric_pos = t.pos(lyric, norm=True, stem=True)
    token = [surface for surface, tag in lyric_pos
             if tag in ("Noun", "Verb", "Adjective", "Adverb")]
    lyric_txt = " ".join(token).strip()
    results.append(lyric_txt)
    if i % 1000 == 0:
        process_time = time.time() - start_time
        print(i, 'th process time = %.3f secs' % (process_time))

1000 th process time = 15.064 secs
2000 th process time = 30.827 secs
3000 th process time = 45.972 secs
4000 th process time = 62.040 secs
5000 th process time = 79.437 secs
6000 th process time = 96.627 secs


In [94]:
# Dump the tokenised lyrics to disk, one song per line, so that they can be
# streamed back line by line when training the embedding.
data_file = "lyric_corpus.data"
corpus_text = "\n".join(results)
with open(data_file, mode='w', encoding='utf-8') as corpus_fp:
    corpus_fp.write(corpus_text)

In [80]:
# Stream the corpus file written above (one tokenised song per line) and train
# a skip-gram Word2Vec model on it, then persist the trained model.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` and `iter` to
# `epochs`; these keyword names need updating if gensim is upgraded.
data = word2vec.LineSentence(data_file)
embedding_model = word2vec.Word2Vec(
    data,
    size=200,      # dimensionality of each word vector
    window=5,      # how many neighbouring tokens to look at on each side
    min_count=50,  # drop words occurring fewer than 50 times in the corpus
    workers=2,     # number of worker threads
    iter=100,      # number of training passes over the corpus
    sg=1,          # 1 = skip-gram, 0 = CBOW
)
embedding_model.save('lyric_w2b.model')

#### Word2Vec argument explained


* size: 포스태깅 된 컨텐츠를 몇 차원의 벡터로 변환할 것인지  
* window: 앞 뒤로 몇 개씩 볼 것인지  
* min_count: 코퍼스 내 출현 빈도가 min_count 미만인 것은 제외  
* workers: 사용하는 코어 개수  
* iter: 학습 횟수  
* sg: CBOW(0)와 skip-gram(1) 중 선택  

In [55]:
embedding_model.wv.most_similar(positive=["사랑"], topn=10) # scores are cosine similarities between word vectors (not an L2 distance)

[('이별', 0.49479424953460693),
 ('아프다', 0.4455459713935852),
 ('하다', 0.44501277804374695),
 ('말', 0.43216997385025024),
 ('아끼다', 0.37557464838027954),
 ('영원하다', 0.3679729104042053),
 ('헤어지다', 0.35831543803215027),
 ('미워하다', 0.35714852809906006),
 ('정말', 0.3563750982284546),
 ('후회', 0.35318320989608765)]

In [54]:
# Ten vocabulary words most similar to the single query word.
embedding_model.wv.most_similar(positive=["봄"], topn=10)

[('겨울', 0.4736689329147339),
 ('꽃', 0.4298463463783264),
 ('피다', 0.3984295725822449),
 ('계절', 0.38550102710723877),
 ('벚꽃', 0.3690962791442871),
 ('가을', 0.36882591247558594),
 ('따뜻하다', 0.3416163921356201),
 ('여름', 0.33199405670166016),
 ('설레다', 0.3315635919570923),
 ('핀', 0.3310455083847046)]

In [53]:
# Analogy-style query: words close to the three positive season words (w1)
# while being pushed away from the negative one (w2, winter).
w1 = ["봄",'여름','가을']
w2 = ['겨울']
embedding_model.wv.most_similar (positive=w1, negative=w2, topn=10)

[('꽃', 0.3980866074562073),
 ('계절', 0.3328343629837036),
 ('바람', 0.31058329343795776),
 ('코', 0.3080652356147766),
 ('핀', 0.3000771999359131),
 ('벚꽃', 0.2998371720314026),
 ('돌다', 0.2986708879470825),
 ('피다', 0.2936900854110718),
 ('낙엽', 0.28930819034576416),
 ('BABY', 0.28138700127601624)]

In [51]:
# Ask the model which of the given words fits least well with the others.
embedding_model.wv.doesnt_match(["침대","사랑","이별"])

'침대'

In [95]:
# Export the learned word vectors (the weight matrix) in word2vec text format
# to 'lyric_w2b_result.txt'; the file is read back with pandas further down.
embedding_model.wv.save_word2vec_format('lyric_w2b_result.txt')

## Building Term-Binary Matrix

#### Term-Document Matrix (TDM)

: each term is represented as a row and each document as a column

* Term-Frequency Matrix
* Term-Binary Matrix


#### Document-Term Matrix (DTM)

: transpose of the TDM, so each document is a row and each term is a column

In [64]:
# The 'lyric' column of df; iterated as (index, text) pairs when counting terms below.
lyric = df['lyric']

In [65]:
def split(post):
    """Return the content-word tokens of one document as a list.

    The text is run through the shared tagger ``t`` with normalisation and
    stemming enabled; only tokens tagged Noun, Verb, Adjective or Adverb are
    kept, in their original order.
    """
    kept_tags = ('Noun', 'Verb', 'Adjective', 'Adverb')
    tagged = t.pos(post, norm=True, stem=True)
    return [surface for surface, tag in tagged if tag in kept_tags]
    
def count_occurrences(post, term):
    """Add one to the tally kept for ``term`` in the dict ``post``.

    ``post`` is modified in place; a term that has not been seen yet starts
    from zero. Nothing is returned.
    """
    post[term] = post.get(term, 0) + 1

In [None]:
# Build {song index: {term: occurrences}} by tokenising each lyric and
# tallying its content words.
count_corpus = {}

for doc_id, text in sorted(lyric.items()):
    term_counts = {}
    for token in split(text):
        count_occurrences(term_counts, token)
    count_corpus[doc_id] = term_counts

In [66]:
# One entry was stored per song, so the dict size is the number of songs processed.
print(len(count_corpus)) # total number of songs

6037


In [67]:
# Term-frequency matrix. Building a DataFrame from the dict of per-song dicts
# puts the outer keys (songs) on the columns and the inner keys (terms) on the
# row index; a term absent from a song comes out as NaN, which fillna(0) turns
# into a zero count.
tf_matrix = pd.DataFrame(count_corpus).fillna(0) # 21116 rows (terms) × 6037 columns (documents)
tf_matrix # 21116 rows × 6037 columns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6027,6028,6029,6030,6031,6032,6033,6034,6035,6036
가,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가가,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가감,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가게,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가격,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가격표,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가결,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가고일,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가구,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
가글,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
import numpy as np

# Total occurrences of each term across all songs, stored in a 'count' column.
# The guard keeps this cell idempotent: without it, re-running the cell folds
# the already-present 'count' column into the new row sums and inflates every
# total.
if 'count' not in tf_matrix.columns:
    tf_matrix['count'] = tf_matrix.sum(axis=1)

# Keep only terms seen at least 50 times, so that the rows match the size of
# the Word2Vec weight matrix (the model was trained with min_count=50).
tf_matrix = tf_matrix[tf_matrix['count'] >= 50]
len(tf_matrix)

In [103]:
# Dimensions after filtering: kept terms × (songs plus the added 'count' column).
tf_matrix.shape

(1952, 6038)

In [92]:
# To check the presence of a specific word in each document
binary_matrix = scipy.sign(tf_matrix)
binary_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6028,6029,6030,6031,6032,6033,6034,6035,6036,count
가게,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
가기,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
가까워지다,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
가까이,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
가깝다,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
가끔,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
가나,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
가내,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
가다,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
가도,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [93]:
# Persist the binary matrix as CSV, with the terms as the row index.
# NOTE(review): rows are terms and columns are songs, so despite the file name
# this is laid out as a term-document matrix, not a document-term matrix.
binary_matrix.to_csv('dtm.csv')

## Getting Scores by Document

In [99]:
# Read the exported word-vector text file back as a plain space-separated table.
# skiprows=1 drops the first line of the file (presumably the word2vec header
# holding vocabulary size and vector dimension — confirm against the file), and
# header=None stops pandas from using a data row as column names. In the
# resulting frame column 0 is the word and the remaining columns are its vector.
weight = pd.read_csv('lyric_w2b_result.txt', header = None, delimiter=" ", skiprows=1)
weight.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,하다,-0.039003,-0.065523,0.110325,-0.021174,-0.010616,0.036855,-0.083747,-0.026479,-0.262602,...,0.040909,-0.013998,-0.065067,0.021941,0.162499,-0.063254,0.050513,-0.165895,-0.170395,-0.029509
1,너,0.004102,0.012946,0.098809,0.199014,0.041821,0.016565,0.098897,0.102164,-0.030896,...,-0.022967,0.174935,-0.044151,0.100858,0.017879,0.118563,-0.13373,-0.346831,-0.261436,-0.071432
2,내,-0.011916,-0.068657,0.135732,-0.037444,-0.101359,-0.145474,0.02379,-0.096634,0.04413,...,-0.005168,-0.021905,0.166041,0.135467,-0.126927,0.154657,-0.063973,-0.119855,-0.144186,-0.079829
3,사랑,-0.131322,-0.039075,-0.159848,-0.034248,-0.008182,0.038198,-0.124286,-0.10109,-0.206545,...,0.060518,-0.026009,-0.006136,0.056102,-0.14394,0.13433,-0.039372,-0.024839,-0.391033,0.180343
4,없다,0.06651,-0.024071,0.023928,0.03088,0.057526,0.227692,-0.047973,-0.215295,0.160763,...,-0.042296,-0.019305,0.0765,0.213532,0.249257,0.216945,0.046343,0.104557,-0.070518,0.074968


In [102]:
# One row per vocabulary word; columns are the word itself followed by its vector components.
weight.shape

(1952, 201)

In [105]:
# Compare with weight.shape: the row counts should agree, since both were cut at 50 occurrences.
tf_matrix.shape

(1952, 6038)