<br></br>
# **gensim | doc2vec**
<br></br>
## **1 네이버 단어 전처리**
네이버 영화리뷰 단어모델 만들기

In [1]:
from konlpy.tag import Okt
# Module-level Okt analyzer instance; tokenize() calls its pos() method.
twitter = Okt()

def read_data(filename):
    """Load a tab-separated file and return a random sample of its rows.

    Every line of ``filename`` is split on tab characters. The first line
    (index 0) is never sampled. Rows are drawn independently, so the same
    row may appear more than once in the result.

    Args:
        filename: Path to the tab-separated text file.

    Returns:
        A list with ``len(rows) // 10`` entries, each a list of the column
        strings of one sampled row. Empty when the file has fewer than
        ten lines.
    """
    from random import randrange

    # Explicit encoding so decoding does not depend on the platform locale.
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]

    sample_size = len(data) // 10
    # randrange() excludes its upper bound; randint(1, len(data)) could
    # return len(data) itself and raise IndexError.
    return [data[randrange(1, len(data))] for _ in range(sample_size)]

def tokenize(doc):
    """Return the tagged tokens of ``doc`` as slash-joined strings.

    ``doc`` is passed to ``twitter.pos`` with ``norm=True`` and
    ``stem=True``; each element of the result is joined with ``'/'``
    (the notebook output shows values such as ``'힘/Noun'``).
    """
    tagged = twitter.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))

In [2]:
%%time
from collections import namedtuple
train_data        = read_data('../data/ratings_train.txt')
train_docs        = [(tokenize(row[1]), row[2]) for row in train_data[1:]]
TaggedDocument    = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

CPU times: user 54.4 s, sys: 305 ms, total: 54.7 s
Wall time: 42.2 s


In [3]:
from pprint import pprint
# Show the first tagged document to check the words/tags layout.
pprint(tagged_train_docs[0])

TaggedDocument(words=['쪼/Noun', '는/Josa', '힘/Noun', '이/Josa', '대단하다/Adjective', './Punctuation', '박스오피스/Noun', '1/Number', '위/Noun', '하다/Verb', '만해/Noun'], tags=['1'])


<br></br>
## **2 doc2vec 파라미터 설정 및 학습**

In [4]:
%%time
from gensim.models import doc2vec
doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs, 
                         total_examples = doc_vectorizer.corpus_count, 
                         epochs = doc_vectorizer.epochs)
    doc_vectorizer.alpha -= 0.002
    doc_vectorizer.min_alpha = doc_vectorizer.alpha 

# 학습이 완료된 모델의 데이터를 저장한다
doc_vectorizer.save('../data/doc2vec.model')

CPU times: user 55.7 s, sys: 3.55 s, total: 59.2 s
Wall time: 29.8 s


<br></br>
## **3 doc2vec 모델 활용**

In [5]:
from gensim.models import doc2vec
from pprint import pprint
# Load the Doc2Vec model stored at ../data/doc2vec.model.
doc_vectorizer = doc2vec.Doc2Vec.load('../data/doc2vec.model')

In [6]:
# Print the most_similar() results for a single vocabulary token.
pprint(doc_vectorizer.wv.most_similar('공포/Noun'))

[('코미디/Noun', 0.4617426097393036),
 ('로맨스/Noun', 0.4353543519973755),
 ('경계/Noun', 0.40590929985046387),
 ('보이/Noun', 0.40513288974761963),
 ('스릴러/Noun', 0.4018145203590393),
 ('독일/Noun', 0.39838650822639465),
 ('코메디/Noun', 0.39736342430114746),
 ('재난영화/Noun', 0.3963260352611542),
 ('이어지다/Verb', 0.39346280694007874),
 ('스무살/Noun', 0.3892365097999573)]


  if np.issubdtype(vec.dtype, np.int):


In [7]:
# Similarity score between two vocabulary tokens (value shown as cell output).
doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')

  if np.issubdtype(vec.dtype, np.int):


-0.08924653

In [8]:
# Analogy-style query: most_similar() with two positive tokens and one negative token.
pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'], 
                                      negative=['남자/Noun']))

[('풍자/Noun', 0.3298577070236206),
 ('극한/Noun', 0.32718753814697266),
 ('예술/Noun', 0.3229265809059143),
 ('한계/Noun', 0.3211601674556732),
 ('추격/Noun', 0.3196162283420563),
 ('..!/Punctuation', 0.31661269068717957),
 ('마스터/Noun', 0.31202661991119385),
 ('기사/Noun', 0.3075469732284546),
 ('작위/Noun', 0.30395781993865967),
 ('전이/Noun', 0.2954846918582916)]


  if np.issubdtype(vec.dtype, np.int):


In [9]:
# Infer a document vector for the given token list and show its first 10 components.
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]

array([-0.00221595, -0.01148298,  0.00479257,  0.00838931,  0.00047217,
       -0.00418797,  0.01052418, -0.00523602,  0.00747818,  0.00087289],
      dtype=float32)

In [10]:
# Infer a vector for the same token list again and sum its components.
# NOTE(review): this is a separate infer_vector() call, so the vector may differ
# from the one in the previous cell -- confirm whether inference is deterministic here.
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()

0.1386233