# gensim
* 홈페이지 https://radimrehurek.com/gensim/
* word2vev https://radimrehurek.com/gensim/models/word2vec.html

In [1]:
!pip install gensim

[31mdistributed 1.22.0 requires msgpack, which is not installed.[0m
[31mtensorboard 1.8.0 has requirement bleach==1.5.0, but you'll have bleach 2.1.3 which is incompatible.[0m
[31mbleach 2.1.3 has requirement html5lib!=1.0b1,!=1.0b2,!=1.0b3,!=1.0b4,!=1.0b5,!=1.0b6,!=1.0b7,!=1.0b8,>=0.99999999pre, but you'll have html5lib 0.9999999 which is incompatible.[0m


In [2]:
from gensim import corpora, similarities
from gensim.models import Word2Vec

import os
import multiprocessing

input_filename = '/Users/ryanshin/Downloads/namuwiki180326/namuwiki_20180326_mini_pos_tagged_corpus.txt'
model_path = '/Users/ryanshin/Downloads/namuwiki180326/model'

class SentenceReader(object):
    def __init__(self, input_filename):
        self.input_filename = input_filename
    def __iter__(self):
        for line in open(input_filename):
            yield line.split(' ')
            
sentences_vocab = SentenceReader(input_filename) # a memory-friendly iterator
sentences_train = SentenceReader(input_filename) # a memory-friendly iterator

config = {
    'min_count': 10, # 등장 회수가 10 이하인 단어는 무시
    'size': 300, # 300차원짜리 벡터스페이스에 embedding
    'sg': 1, # 0 이면 CBOW, 1이면 skip-gram을 사용한다.
    'batch_words': 10000,  # 사전을 구축할때 한번에 읽을 단어 수
    'iter': 10,  # 보통 딥러닝에서 말하는 epoch과 비슷한, 반복 횟수
    'workers': multiprocessing.cpu_count(),
}
word2vec_model = Word2Vec(**config)

In [3]:
token_count = sum([len(sentence) for sentence in sentences_vocab])
print(token_count)

word2vec_model.build_vocab(sentences_vocab)
word2vec_model.train(sentences_train, total_examples = token_count, epochs=word2vec_model.iter)

10113764


  """


(83282582, 101137640)

# CBOW vs Skip-Gram
* CBOW : 데이터 양이 많으면서 문장이 짧을때. 주변 단어를 가지고 가운데 들어갈 단어를 예측.
* Skip-Gram : 데이터 양이 적은 대신 문장이 길때. 단어 하나를 가지고 주변에 나올 단어를 예측

In [4]:
word2vec_model.wv['컴퓨터/Noun']

array([ 1.78576931e-01,  7.49315843e-02, -3.11306894e-01,  1.19309694e-01,
       -6.93457872e-02, -3.79123688e-01, -4.79722619e-02,  2.18156844e-01,
        3.35759819e-01, -2.68509746e-01, -2.53962390e-02, -5.39642945e-02,
       -2.28669476e-02, -2.34709874e-01,  4.06296551e-01, -5.95942259e-01,
       -9.14394557e-02,  1.95941761e-01, -6.05779216e-02,  1.14610098e-01,
       -1.58663094e-01, -1.28940165e-01, -2.04267800e-01, -7.71410912e-02,
        7.36862868e-02, -3.85297276e-02, -2.03342229e-01, -2.00717952e-02,
       -3.30716908e-01,  2.24526748e-02, -1.12944432e-01,  2.43033752e-01,
       -7.16991955e-04, -3.24769169e-01,  9.21481475e-02, -5.10026291e-02,
       -1.29870310e-01, -9.29385349e-02, -3.60544771e-01, -3.88689578e-01,
       -2.72048175e-01,  2.34957650e-01,  1.18865073e-03, -2.16316789e-01,
       -2.88423330e-01, -1.31094605e-01,  2.02253327e-01, -6.49777278e-02,
        3.16089541e-01,  1.80454493e-01,  4.86447103e-03,  1.29069835e-01,
        1.29886597e-01,  

* 컴퓨터와 비슷한 단어 찾기(학습한 모델)

In [5]:
word2vec_model.most_similar(['컴퓨터/Noun'])

  """Entry point for launching an IPython kernel.


[('메모리/Noun', 0.4880021810531616),
 ('주변기기/Noun', 0.4782713055610657),
 ('사운드카드/Noun', 0.47793954610824585),
 ('CMOS/Alpha', 0.4768950045108795),
 ('데스크톱/Noun', 0.4731979966163635),
 ('기계어/Noun', 0.46986082196235657),
 ('데이터베이스/Noun', 0.46964263916015625),
 ('DAW/Alpha', 0.46937817335128784),
 ('하드디스크/Noun', 0.4691876173019409),
 ('모니터/Noun', 0.4667254686355591)]

In [6]:
word2vec_model.most_similar(['오락실/Noun'])

  """Entry point for launching an IPython kernel.


[('게임파크/Noun', 0.5379852652549744),
 ('노량진/Noun', 0.506752610206604),
 ('동구/Noun', 0.4893345832824707),
 ('삼천포/Noun', 0.4800158143043518),
 ('메가박스/Noun', 0.47919347882270813),
 ('게임/Noun', 0.478000670671463),
 ('호점/Noun', 0.47393080592155457),
 ('유성구/Noun', 0.4692496359348297),
 ('상당구/Noun', 0.4685227572917938),
 ('부천시/Noun', 0.4666518568992615)]

In [7]:
word2vec_model.most_similar(positive = ['서울/Noun', '미국/Noun'], negative=['한국/Noun'])

  """Entry point for launching an IPython kernel.


[('영국/Noun', 0.40712958574295044),
 ('댈러스/Noun', 0.39196091890335083),
 ('로스앤젤레스/Noun', 0.3905356526374817),
 ('컬럼비아/Noun', 0.38957080245018005),
 ('국회의사당/Noun', 0.38929930329322815),
 ('교향악단/Noun', 0.38789844512939453),
 ('용산구/Noun', 0.3861015737056732),
 ('장충/Noun', 0.38562703132629395),
 ('캘커타/Noun', 0.38522636890411377),
 ('상암동/Noun', 0.3841337561607361)]

In [8]:
# 모델저장
word2vec_model.save(model_path)

In [14]:
word2vec_model.most_similar(positive = ['철권/Noun','게임/Noun'], negative=['오락/Noun'])

  """Entry point for launching an IPython kernel.


[('엘더스크롤/Noun', 0.4033607840538025),
 ('넥슨/Noun', 0.3769017457962036),
 ('크리드/Noun', 0.35956358909606934),
 ('달심/Noun', 0.3508110046386719),
 ('CAPCOM/Alpha', 0.3502022922039032),
 ('XIII/Alpha', 0.34997621178627014),
 ('싱글플레이/Noun', 0.34686872363090515),
 ('액션게임/Noun', 0.34419918060302734),
 ('GTA/Alpha', 0.3440825939178467),
 ('마블/Noun', 0.34189626574516296)]

In [15]:
word2vec_model.most_similar(['쓰르라미/Noun'])

  """Entry point for launching an IPython kernel.


[('괭이갈매기/Noun', 0.808375895023346),
 ('테일즈런너/Noun\n', 0.666684627532959),
 ('즈미/Noun', 0.6530146598815918),
 ('코믹파티/Noun\n', 0.6499896049499512),
 ('나미/Noun\n', 0.6497930288314819),
 ('크루세이더즈/Noun\n', 0.6486780643463135),
 ('아키/Noun\n', 0.6463680863380432),
 ('세키레이/Noun\n', 0.646142303943634),
 ('스케치북/Noun\n', 0.6440739035606384),
 ('하급생/Noun\n', 0.6438494920730591)]

In [16]:
word2vec_model.most_similar(['도재욱/Noun'])

  """Entry point for launching an IPython kernel.


[('박경락/Noun', 0.7864271998405457),
 ('박용욱/Noun', 0.7656582593917847),
 ('변길섭/Noun', 0.7602831125259399),
 ('이병렬/Noun', 0.7600435018539429),
 ('임정현/Noun', 0.7480770945549011),
 ('박령우/Noun', 0.7472872734069824),
 ('김대엽/Noun', 0.7383952140808105),
 ('최종혁/Noun', 0.7383594512939453),
 ('강도경/Noun', 0.7374047636985779),
 ('장윤철/Noun', 0.734906792640686)]

In [24]:
word2vec_model.most_similar(['손흥민/Noun'])

  """Entry point for launching an IPython kernel.


[('구자철/Noun', 0.7721368074417114),
 ('이청용/Noun', 0.7673439383506775),
 ('기성용/Noun', 0.7145845890045166),
 ('곽태휘/Noun', 0.7007303237915039),
 ('남태희/Noun', 0.6983648538589478),
 ('박주영/Noun', 0.6928110718727112),
 ('이정협/Noun', 0.6916085481643677),
 ('김주영/Noun', 0.6899188756942749),
 ('이근호/Noun', 0.6858675479888916),
 ('김진현/Noun', 0.6849050521850586)]

In [25]:
word2vec_model.most_similar(['밤/Noun'])

  """Entry point for launching an IPython kernel.


[('저녁/Noun', 0.5666530728340149),
 ('아침/Noun', 0.503180205821991),
 ('화요일/Noun', 0.4936125874519348),
 ('낮/Noun', 0.4763809144496918),
 ('보름달/Noun', 0.4653308391571045),
 ('새벽/Noun', 0.4627934396266937),
 ('평일/Noun', 0.4562031030654907),
 ('불빛/Noun', 0.4561612010002136),
 ('정각/Noun', 0.4544278383255005),
 ('밤/Noun\n', 0.4459311366081238)]

In [29]:
# 모델 불러오기
word2vec_model = Word2Vec.load(model_path)