# 2-2. Doc2vec Model Building & Training
doc2vec은 단어와 레이블에 대한 표현을 동시에 학습합니다.

In [1]:
import pickle
import pandas as pd
from pandas import DataFrame as df
from collections import namedtuple
from gensim.models import doc2vec

#### 앞서 저장한 train, test 데이터를 불러옵니다.

In [2]:
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): unpickling can execute code embedded in the file; these files are
# presumably the ones written by the earlier preprocessing step -- confirm the source.
train = _load_pickle('train.txt')
test = _load_pickle('test.txt')

In [3]:
# Number of items in the train split, then the test split, one per line.
print(len(train), len(test), sep='\n')

7756
1940


#### namedtuple 형태로 변환해줍니다.

In [4]:
# `namedtuple` is already imported in the notebook's import cell, so it is not
# imported a second time here.
# Record type with the two fields `words` and `tags`, used to wrap every document.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')

# Each item of `train` / `test` is unpacked as a (tokens, label) pair; the label
# is wrapped in a one-element list to form the document's `tags`.
tagged_train = [TaggedDocument(tokens, [label]) for tokens, label in train]
tagged_test = [TaggedDocument(tokens, [label]) for tokens, label in test]

In [5]:
# Preview only the first 10 tagged test documents instead of materialising a
# DataFrame for the whole test set just to display it.
pd.DataFrame(tagged_test[:10])

Unnamed: 0,words,tags
0,"[눈/Noun, 내리다/Verb, 서울/Noun, 용산구/Noun, 이태원로/Nou...",[2]
1,"[알다/Verb, 장기/Noun, 되다/Verb, 경기/Noun, 침체/Noun, ...",[2]
2,"[밝다/Verb, 표정/Noun, 악수/Noun, 하다/Verb, 하다/Verb, ...",[0]
3,"[중국/Noun, 환구/Noun, 시보/Noun, 인/Noun, 민망/Noun, 다...",[4]
4,"[월미은하레일/Noun, 연합뉴스/Noun, 자료/Noun, 사진/Noun, 월미은...",[3]
5,"[차/Noun, 산업혁명/Noun, 선도/Noun, 기반/Noun, 구축/Noun,...",[5]
6,"[김포공항/Noun, 환경미화원/Noun, 뉴스/Noun, News/Alpha, 박...",[2]
7,"[향원정/Noun, 취향/Noun, 교/Noun, 연합뉴스/Noun, 자료/Noun...",[3]
8,"[국내/Noun, 예약/Noun, 판매/Noun, 반응/Noun, 시들다/Verb,...",[5]
9,"[고동진/Noun, 삼성/Noun, 전자/Noun, 무선/Noun, 사업/Noun,...",[5]


## Doc2vec model building

#### hyper parameters
* *alpha = 0.025*  :  the initial learning rate
* *min_alpha = 0.001* :  min learning-rate
* *min_count = 5* : 5번 미만 나온 단어 무시
* *sample=1e-5* : Subsampling Frequent Words (너무 많이 나온 단어 빈도 줄이기)
* *hs=0, negative = 5* : negative sampling
* *iter = 10* : 20보다 10일 때 더 성능이 좋았음
* *dbow_words=1* : 1이면 word-vector와 DBOW doc-vector를 함께 training 한다. default is 0 (faster training of doc-vectors only).
* selected_POS = (Noun, Verb, Adjective, Alpha)

In [6]:
import multiprocessing

# CPU count reported by the system; used as the `workers` argument of the
# Doc2Vec models built later in the notebook.
cores = multiprocessing.cpu_count()

## (1) PV-DM

In [7]:
# Build the PV-DM models (dm=1): six variants over size {100, 300} x window {5, 10, 20}.
#
# Settings shared by every variant are declared once so the six constructor
# calls cannot drift apart; each call only states what differs.
# NOTE(review): `size` and `iter` are the argument names of older gensim releases;
# gensim 4 renamed them to `vector_size` and `epochs` -- confirm the installed
# version before upgrading.
# NOTE(review): gensim documents that a fixed `seed` is fully reproducible only
# with a single worker thread; with workers=cores runs may differ -- confirm
# whether exact reproducibility is required.
DM_COMMON = dict(
    dm=1,             # PV-DM
    iter=10,
    min_count=5,
    alpha=0.025,
    min_alpha=0.001,
    seed=1234,
    workers=cores,    # one worker thread per reported CPU
    sample=1e-5,
    hs=0,
    negative=5,       # if > 0, negative sampling will be used
)

# `window` is the maximum distance between the predicted word and context words.
# DM_model1 is the only variant that does not pass dbow_words (unchanged from before).
DM_model1 = doc2vec.Doc2Vec(size=100, window=5, **DM_COMMON)
DM_model2 = doc2vec.Doc2Vec(size=100, window=10, dbow_words=1, **DM_COMMON)
DM_model3 = doc2vec.Doc2Vec(size=100, window=20, dbow_words=1, **DM_COMMON)
DM_model4 = doc2vec.Doc2Vec(size=300, window=5, dbow_words=1, **DM_COMMON)
DM_model5 = doc2vec.Doc2Vec(size=300, window=10, dbow_words=1, **DM_COMMON)
DM_model6 = doc2vec.Doc2Vec(size=300, window=20, dbow_words=1, **DM_COMMON)

In [8]:
# Build each PV-DM model's vocabulary from the tagged training documents.
for dm_model in (DM_model1, DM_model2, DM_model3, DM_model4, DM_model5, DM_model6):
    dm_model.build_vocab(tagged_train)

In [9]:
# Train every PV-DM model on the tagged training set, in order, each for the
# number of epochs stored on the model itself.
n_train_docs = len(tagged_train)
dm_train_results = [
    dm_model.train(tagged_train, total_examples=n_train_docs, epochs=dm_model.iter)
    for dm_model in (DM_model1, DM_model2, DM_model3, DM_model4, DM_model5, DM_model6)
]

# Keep the cell output unchanged: display what DM_model6.train returned.
dm_train_results[-1]

13248914

In [10]:
# Persist each trained PV-DM model to its own file in the working directory.
dm_save_targets = [
    (DM_model1, "DM_model1.model"),
    (DM_model2, "DM_model2.model"),
    (DM_model3, "DM_model3.model"),
    (DM_model4, "DM_model4.model"),
    (DM_model5, "DM_model5.model"),
    (DM_model6, "DM_model6.model"),
]
for dm_model, dm_path in dm_save_targets:
    dm_model.save(dm_path)

## (2) PV-DBOW

In [11]:
# Build the PV-DBOW models (dm=0): six variants over size {100, 300} x window {5, 10, 20}.
#
# Settings shared by every variant are declared once so the six constructor
# calls cannot drift apart; each call only states what differs.
# NOTE(review): `size` and `iter` are the argument names of older gensim releases;
# gensim 4 renamed them to `vector_size` and `epochs` -- confirm the installed
# version before upgrading.
# NOTE(review): gensim documents that a fixed `seed` is fully reproducible only
# with a single worker thread; with workers=cores runs may differ -- confirm
# whether exact reproducibility is required.
DBOW_COMMON = dict(
    dm=0,             # PV-DBOW
    dbow_words=1,     # passed to all six variants
    iter=10,
    min_count=5,
    alpha=0.025,
    min_alpha=0.001,
    seed=1234,
    workers=cores,    # one worker thread per reported CPU
    sample=1e-5,
    hs=0,
    negative=5,       # negative sampling
)

DBOW_model1 = doc2vec.Doc2Vec(size=100, window=5, **DBOW_COMMON)
DBOW_model2 = doc2vec.Doc2Vec(size=100, window=10, **DBOW_COMMON)
DBOW_model3 = doc2vec.Doc2Vec(size=100, window=20, **DBOW_COMMON)
DBOW_model4 = doc2vec.Doc2Vec(size=300, window=5, **DBOW_COMMON)
DBOW_model5 = doc2vec.Doc2Vec(size=300, window=10, **DBOW_COMMON)
DBOW_model6 = doc2vec.Doc2Vec(size=300, window=20, **DBOW_COMMON)

In [12]:
# Build each PV-DBOW model's vocabulary from the tagged training documents.
for dbow_model in (DBOW_model1, DBOW_model2, DBOW_model3,
                   DBOW_model4, DBOW_model5, DBOW_model6):
    dbow_model.build_vocab(tagged_train)

In [13]:
# Train every PV-DBOW model on the tagged training set, in order, each for the
# number of epochs stored on the model itself.
n_train_docs = len(tagged_train)
dbow_train_results = [
    dbow_model.train(tagged_train, total_examples=n_train_docs, epochs=dbow_model.iter)
    for dbow_model in (DBOW_model1, DBOW_model2, DBOW_model3,
                       DBOW_model4, DBOW_model5, DBOW_model6)
]

# Keep the cell output unchanged: display what DBOW_model6.train returned.
dbow_train_results[-1]

13249105

In [14]:
# Persist each trained PV-DBOW model to its own file in the working directory.
dbow_save_targets = [
    (DBOW_model1, "DBOW_model1.model"),
    (DBOW_model2, "DBOW_model2.model"),
    (DBOW_model3, "DBOW_model3.model"),
    (DBOW_model4, "DBOW_model4.model"),
    (DBOW_model5, "DBOW_model5.model"),
    (DBOW_model6, "DBOW_model6.model"),
]
for dbow_model, dbow_path in dbow_save_targets:
    dbow_model.save(dbow_path)

#### 각 모델의 test code는 'D2V_models_test' 폴더에 저장되어있습니다.