<br></br>
# **gensim | doc2vec**
<br></br>
## **1 네이버 단어 전처리**
네이버 영화리뷰 단어모델 만들기

In [1]:
from konlpy.tag import Okt
# Shared KoNLPy Okt tagger instance; tokenize() below calls its pos() method.
# NOTE(review): the variable is named `twitter` although it holds an Okt
# object — presumably a leftover from an older tagger name; confirm before renaming.
twitter = Okt()

def read_data(filename):
    """Read a tab-separated text file and return a random ~10% sample of rows.

    Every line is split on tab characters. The first line (index 0) is never
    sampled, so a header row is excluded from the result. Rows are drawn
    independently, so the same row may appear more than once in the sample.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of ``len(rows) // 10`` rows, each a list of the tab-separated
        fields of one line. Empty when the file has fewer than 10 lines.
    """
    from random import randint

    # NOTE(review): assumes the corpus is UTF-8 encoded; stating it explicitly
    # avoids depending on the platform default when reading Korean text.
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]

    sample_size = len(data) // 10
    # randint() includes both endpoints, so the upper bound must be the last
    # valid index; using len(data) here could raise IndexError.
    last_index = len(data) - 1
    return [data[randint(1, last_index)] for _ in range(sample_size)]

def tokenize(doc):
    """Return the part-of-speech tagged tokens of *doc* as joined strings.

    The module-level ``twitter`` tagger is called with normalization and
    stemming enabled; each element it yields is joined with '/' into a single
    string.
    """
    tagged = twitter.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))

In [2]:
%%time
from collections import namedtuple

# Simple record with `words` and `tags` fields for each training document.
TaggedDocument    = namedtuple('TaggedDocument', 'words tags')

train_data        = read_data('data/ratings_train.txt')
# read_data() only samples rows from index 1 onward, so the header line is
# never part of train_data; slicing with [1:] here would discard a real sample.
# Column 1 of each row is tokenized into the document's words and column 2
# becomes its single tag.
train_docs        = [(tokenize(row[1]), row[2]) for row in train_data]
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

CPU times: user 50.2 s, sys: 294 ms, total: 50.5 s
Wall time: 39.9 s


In [3]:
from pprint import pprint
pprint(tagged_train_docs[0])

TaggedDocument(words=['가위손/Noun', '을/Josa', '가지다/Verb', "'/Punctuation", '특수하다/Adjective', "'/Punctuation", '사람/Noun', '을/Josa', '묘사/Noun', '하다/Verb', '보다/Verb', '내내/Noun', '사랑/Noun', '에/Josa', '서툴다/Adjective', "'/Punctuation", '평범하다/Adjective', "'/Punctuation", '사람/Noun', '들/Suffix', '이/Josa', '많이/Adverb', '생각나다/Verb'], tags=['1'])


<br></br>
## **2 doc2vec 파라미터 설정 및 학습**

In [4]:
%%time
from gensim.models import doc2vec
# Create a Doc2Vec model with 300-dimensional vectors and a fixed seed.
# alpha and min_alpha start at the same value; the loop below lowers both by hand.
doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

# Call train() 10 times, reducing alpha by 0.002 after each call and keeping
# min_alpha equal to alpha.
# NOTE(review): gensim's documentation discourages calling train() repeatedly
# with manual alpha arithmetic in favour of a single train() call that decays
# alpha itself — confirm against the installed gensim version before changing.
for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs, 
                         total_examples = doc_vectorizer.corpus_count, 
                         epochs = doc_vectorizer.epochs)
    doc_vectorizer.alpha -= 0.002
    doc_vectorizer.min_alpha = doc_vectorizer.alpha 

# Save the trained model's data to disk
doc_vectorizer.save('data/doc2vec.model')

CPU times: user 55.4 s, sys: 3.41 s, total: 58.8 s
Wall time: 28.9 s


<br></br>
## **3 doc2vec 모델 활용**

In [5]:
from gensim.models import doc2vec
from pprint import pprint
# Reload the model from the same path the training cell saved it to.
doc_vectorizer = doc2vec.Doc2Vec.load('data/doc2vec.model')

In [6]:
# Pretty-print the vocabulary tokens most similar to '공포/Noun' ("horror"),
# using the same 'token/tag' format that tokenize() produces.
pprint(doc_vectorizer.wv.most_similar('공포/Noun'))

[('스릴러/Noun', 0.4632594585418701),
 ('호러/Noun', 0.4518583416938782),
 ('복수/Noun', 0.449480801820755),
 ('자유/Noun', 0.41642245650291443),
 ('임팩트/Noun', 0.3882853090763092),
 ('어/Noun', 0.38356247544288635),
 ('무언가/Noun', 0.38294756412506104),
 ('극도/Noun', 0.37472257018089294),
 ('코메디/Noun', 0.37199169397354126),
 ('의사/Noun', 0.3696790635585785)]


  if np.issubdtype(vec.dtype, np.int):


In [7]:
# Similarity score between two 'token/tag' vocabulary entries.
doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')

  if np.issubdtype(vec.dtype, np.int):


-0.13704649

In [8]:
# Analogy-style query: tokens similar to '여자/Noun' ("woman") and
# '공포/Noun' ("horror") while pushed away from '남자/Noun' ("man").
pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'], 
                                      negative=['남자/Noun']))

[('복수/Noun', 0.37363946437835693),
 ('코메디/Noun', 0.3669866919517517),
 ('의사/Noun', 0.34402570128440857),
 ('어/Noun', 0.3352278172969818),
 ('자유/Noun', 0.33320215344429016),
 ('스릴러/Noun', 0.3313915431499481),
 ('지치다/Verb', 0.3213391602039337),
 ('재난영화/Noun', 0.3176781237125397),
 ('호러/Noun', 0.3121632933616638),
 ('무협/Noun', 0.30806395411491394)]


  if np.issubdtype(vec.dtype, np.int):


In [9]:
# Infer a document vector for a list of 'token/tag' strings and show its
# first 10 components.
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]

array([-0.00107805,  0.00166835, -0.00612529,  0.00831873,  0.00147961,
       -0.00286954, -0.00450142, -0.00979538,  0.0006821 ,  0.0032758 ],
      dtype=float32)

In [10]:
# Sum of all components of the vector inferred for the same token list.
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()

0.08101934