# Build and Save Word Embedding Vectors & Vocabulary Dictionaries

In [1]:
from kor_model.data_crawler import crawler
from kor_model.data_crawler import mecab
from kor_model.data_embed_model import build_data
from kor_model.config import config
from kor_model.ner_model.lstmcrf_model import NERModel
from kor_model.general_utils import get_logger
from kor_model.data_embed_model import data_utils
from kor_model.data_embed_model.data_utils import CoNLLDataset
from kor_model.data_embed_model import word2vec
from kor_model.data_embed_model import data_utils
import os

In [2]:
# Cell purpose: tokenize the training file, train word embeddings, derive the
# word / tag / character vocabularies, and write every artefact to the paths
# configured in `config` so that later cells can reload them.

# (1) Run the Mecab tokenizer on the training file (sentence splitting and
#     morphological analysis, per the original author's note); the second
#     argument is `config.pos_path`.
mecab.tockenizer(config.train_filename, config.pos_path)

# (2) Train word-level embedding vectors with Word2Vec.
embed_model = word2vec.train_w2v(config)


# (3) Create the dataset objects (iterators) for each split. No processing
#     callables are passed here.
dev   = CoNLLDataset(config.dev_filename, max_iter=config.max_iter)
test  = CoNLLDataset(config.test_filename, max_iter=config.max_iter)
train = CoNLLDataset(config.train_filename, max_iter=config.max_iter)

# (4) Extract the distinct word and tag values from all three splits.
vocab_words, vocab_tags = data_utils.get_vocabs([train, dev, test])

# (5) Keep only the words that appear both in the embedding model's vocabulary
#     and in the datasets, then add the unknown-word token.
# NOTE(review): if `embed_model` is a gensim model, `wv.index2word` is the
# pre-4.0 attribute name (renamed `index_to_key` in gensim 4) — confirm the
# gensim version this project pins.
vocab = vocab_words & set(embed_model.wv.index2word)
vocab.add(data_utils.UNK)

# (6) Extract the character vocabulary from the training data only.
vocab_chars = data_utils.get_char_vocab(train)

# (7) Save every vocabulary list and vector file.
# These are the data needed to convert chars, words and tags into vectors.
data_utils.write_char_embedding(vocab_chars, config.charembed_filename)
data_utils.write_vocab(vocab_chars, config.chars_filename)
data_utils.write_vocab(vocab, config.words_filename)
data_utils.write_vocab(vocab_tags, config.tags_filename)
# Despite the helper's "glove" name, the vectors exported here come from the
# Word2Vec model trained in step (2).
data_utils.export_trimmed_glove_vectors(vocab, embed_model, config.trimmed_filename)

tockenizing start
tockenizing done
word2vec train start
word2vec train done
Building vocab...
- done. 8 tokens
Writing vocab...
- done. 20 tokens
Writing vocab...
- done. 20 tokens
Writing vocab...
- done. 8 tokens
Writing vocab...
- done. 5 tokens


# Data Object Preparation & Model Training

In [3]:
# Cell purpose: reload the saved vocabularies and embedding files, wrap the
# datasets with word/tag processing callables, then build, train and evaluate
# the NER model.

# (8) Load the files written by the previous cell.
# Note: the names vocab_words / vocab_tags / vocab_chars are rebound here to
# whatever `load_vocab` returns, replacing the objects built earlier.
embeddings = data_utils.get_trimmed_glove_vectors(config.trimmed_filename)
char_embedding = data_utils.get_trimmed_glove_vectors(config.charembed_filename)
vocab_words = data_utils.load_vocab(config.words_filename)
vocab_tags = data_utils.load_vocab(config.tags_filename)
vocab_chars = data_utils.load_vocab(config.chars_filename)

# (9) Build the callables used to process (filter) the data: one for words,
#     configured with the word and char vocabularies plus the lowercase/chars
#     options, and one for tags with lowercasing disabled.
processing_word = data_utils.get_processing_word(vocab_words,
                                                 vocab_chars,
                                                 lowercase=config.lowercase,
                                                 chars=config.chars)
processing_tag = data_utils.get_processing_word(vocab_tags,
                                                lowercase=False)

# Dataset objects (iterators) that are finally used for training; unlike the
# earlier ones, these are given the word and tag processing callables.
dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_tag, config.max_iter)

# Build the model from the loaded word and char embeddings; the tag and char
# counts come from the loaded vocabularies. No logger is supplied (logger=None).
model = NERModel(config, embeddings, ntags=len(vocab_tags),nchars=len(vocab_chars), logger=None, char_embed=char_embedding)
model.build()
# Train on `train` with `dev` passed alongside, then evaluate on `test`.
model.train(train, dev, vocab_tags)
model.evaluate(test, vocab_tags)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Epoch 1 out of 50




- dev acc 4.55 - f1 0.00
- new best score!
Epoch 2 out of 50




- dev acc 4.55 - f1 0.00
- new best score!
Epoch 3 out of 50




- dev acc 31.82 - f1 0.00
- new best score!
Epoch 4 out of 50




- dev acc 31.82 - f1 0.00
- new best score!
Epoch 5 out of 50




- dev acc 31.82 - f1 8.00
- new best score!
Epoch 6 out of 50




- dev acc 40.91 - f1 45.16
- new best score!
Epoch 7 out of 50




- dev acc 50.00 - f1 45.16
- new best score!
Epoch 8 out of 50




- dev acc 50.00 - f1 45.16
- new best score!
Epoch 9 out of 50




- dev acc 54.55 - f1 45.16
- new best score!
Epoch 10 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 11 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 12 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 13 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 14 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 15 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 16 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 17 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 18 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 19 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 20 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 21 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 22 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 23 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 24 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 25 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 26 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 27 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 28 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 29 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 30 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 31 out of 50




- dev acc 59.09 - f1 45.16
- new best score!
Epoch 32 out of 50




- dev acc 59.09 - f1 40.00
Epoch 33 out of 50




- dev acc 59.09 - f1 40.00
Epoch 34 out of 50




- dev acc 59.09 - f1 40.00
- early stopping 3 epochs without improvement
Testing model over test set


INFO:tensorflow:Restoring parameters from results/crf/model.weights/


Restoring parameters from results/crf/model.weights/
- test acc 59.09 - f1 45.16


# Prediction Test

In [4]:
# Smoke-test the trained tagger on a few sample queries, including reordered
# variants of the same words. Each query is passed to model.predict together
# with the tag vocabulary and the word-processing callable, in the order listed.
sample_queries = (
    "김승우 이메일 검색",
    "김승우 전화 걸어줘",
    "김승우 이미지 검색",
    "김승우 검색 이미지",
    "김승우 걸어줘 전화",
)
for query in sample_queries:
    model.predict(vocab_tags, processing_word, query)

INFO:tensorflow:Restoring parameters from results/crf/model.weights/


Restoring parameters from results/crf/model.weights/


['김승우', '이메일', '검색']
['B-PERSON', 'B-TARGET', 'B-TARGET']
INFO:tensorflow:Restoring parameters from results/crf/model.weights/


Restoring parameters from results/crf/model.weights/


['김승우', '전화', '걸어줘']
['B-PERSON', 'O', 'B-TARGET']
INFO:tensorflow:Restoring parameters from results/crf/model.weights/


Restoring parameters from results/crf/model.weights/


['김승우', '이미지', '검색']
['B-PERSON', 'B-TARGET', 'B-TARGET']
INFO:tensorflow:Restoring parameters from results/crf/model.weights/


Restoring parameters from results/crf/model.weights/


['김승우', '검색', '이미지']
['B-PERSON', 'O', 'B-TARGET']
INFO:tensorflow:Restoring parameters from results/crf/model.weights/


Restoring parameters from results/crf/model.weights/


['김승우', '걸어줘', '전화']
['B-PERSON', 'O', 'B-TARGET']
