# Chosun News Tokenizer and Vector Space Models

### corpus information

~11M lines of Chosun Ilbo news articles

### Google sentencepiece tokenizer model

trained with vocab-size of 16,000

### character, mecab-morpheme and word-piece word2vec models

trained with `gensim` over the tokenized corpus

A generator-based iterable was used to stream sentences from disk; the training function is included below for reference.

In [1]:
import json
import pickle
import sentencepiece as spm
from gensim.models import Word2Vec

In [2]:
# create embeddings with gensim
# file_name : filepath to text file
def create_embeddings(file_name,
                      save_path='embeddings/my_model',
                      do_train=True,
                      **params):
    """Optionally train a gensim Word2Vec model, then export its vocabulary.

    Args:
        file_name: Path to a UTF-8 text file with one sentence per line and
            tokens separated by whitespace. Only read when ``do_train`` is
            true.
        save_path: Path prefix for the outputs. The model is stored at
            ``save_path + '.gensimmodel'`` and the vocabulary mapping at
            ``save_path + '.json'``.
        do_train: When true, train a new model from ``file_name`` and save
            it. When false, reuse the model already saved under
            ``save_path``.
        **params: Extra keyword arguments forwarded to ``Word2Vec``.

    Returns:
        A ``(model, vocab)`` tuple: ``model`` is the Word2Vec model loaded
        back from disk and ``vocab`` maps each token to its integer index.
    """
    model_path = save_path + '.gensimmodel'

    if do_train:
        class SentenceGenerator(object):
            """Iterable of token lists; reopens the file on every pass."""

            def __init__(self, filename):
                self.filename = filename

            def __iter__(self):
                # The built-in open() replaces codecs.open(..., 'rU'):
                # `codecs` was never imported, the 'U' flag is rejected by
                # Python 3.11+, and the handle was never closed.
                with open(self.filename, 'r', encoding='utf-8') as corpus:
                    for line in corpus:
                        yield line.strip().split()

        sentences = SentenceGenerator(file_name)

        print("training", save_path, "model...")
        model = Word2Vec(sentences, **params)
        print("saving", save_path, "model...")
        model.save(model_path)

    # Loaded from disk in both cases; a freshly trained model was just
    # written to this same path.
    model = Word2Vec.load(model_path)
    print("saving", save_path, "vocab...")
    # http://stackoverflow.com/questions/35596031/gensim-word2vec-find-number-of-words-in-vocabulary
    word_vectors = model.wv
    # gensim >= 4 exposes `key_to_index`; older releases only have `vocab`,
    # whose entries carry the index as an attribute.
    key_to_index = getattr(word_vectors, 'key_to_index', None)
    if key_to_index is not None:
        vocab = {token: int(index) for token, index in key_to_index.items()}
    else:
        vocab = {token: entry.index
                 for token, entry in word_vectors.vocab.items()}
    with open(save_path + '.json', 'w', encoding='utf-8') as f:
        json.dump(vocab, f)

    return model, vocab

In [3]:
def load_vocab(vocab_path='embeddings/mapping.json'):
    """Load a token-to-index JSON mapping and build its inverse.

    Args:
        vocab_path: Path to a JSON file holding a single object that maps
            each token to its index.

    Returns:
        A ``(word2idx, idx2word)`` tuple: ``word2idx`` is the mapping as
        stored in the file and ``idx2word`` is the same mapping with keys
        and values swapped.
    """
    # Tokens are Korean text, so decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding.
    with open(vocab_path, 'r', encoding='utf-8') as f:
        word2idx = json.load(f)
    idx2word = {index: word for word, index in word2idx.items()}
    return word2idx, idx2word

In [4]:
# Path prefixes of the saved embeddings, one per tokenization scheme
# (mecab morphemes, sentence pieces, characters). The '.gensimmodel' and
# '.json' suffixes are appended where the files are loaded.
w2v_mecabfile, w2v_sentpfile, w2v_charafile = (
    'embeddings/chosun_mecab_embeddings',
    'embeddings/chosun_sentp_embeddings',
    'embeddings/chosun_chara_embeddings',
)

### mecab-tokenized Word2Vec model

In [5]:
# Load the saved mecab-morpheme model and its token -> index mapping
# (the inverse mapping is discarded), then display the vocabulary size.
mecab_w2v_model = Word2Vec.load(w2v_mecabfile+'.gensimmodel')
mecab_w2v_vocab, _ = load_vocab(w2v_mecabfile+'.json')
len(mecab_w2v_vocab)

267900

In [6]:
# Query the mecab-morpheme model for the tokens most similar to the query token.
mecab_w2v_model.wv.most_similar('한국')

[('일본', 0.6532888412475586),
 ('미국', 0.6065130233764648),
 ('독일', 0.5748053789138794),
 ('중국', 0.5691663026809692),
 ('국내', 0.5494322776794434),
 ('유럽', 0.526393711566925),
 ('우리나라', 0.508824348449707),
 ('영국', 0.5084823369979858),
 ('호주', 0.5040222406387329),
 ('브라질', 0.49898624420166016)]

### sentence-piece tokenization and word2vec model

In [7]:
# Create a SentencePiece processor and load the tokenizer model file;
# Load's return value is what the cell displays.
sp = spm.SentencePieceProcessor()
sp.Load("embeddings/chosun-16k.model")

True

In [8]:
# EncodeAsPieces always returns the sub-word strings; plain Encode returns
# integer ids by default in newer sentencepiece releases.
sp.EncodeAsPieces("hello world!")

['▁h', 'ell', 'o', '▁', 'w', 'or', 'ld', '!']

In [9]:
# EncodeAsPieces always returns the sub-word strings; plain Encode returns
# integer ids by default in newer sentencepiece releases.
sp.EncodeAsPieces("저는 한국말을 말할 수 있어요")

['▁저는', '▁한국', '말', '을', '▁말할', '▁수', '▁있어요']

In [10]:
# Load the saved sentence-piece model and its token -> index mapping
# (the inverse mapping is discarded), then display the vocabulary size.
wordp_w2v_model = Word2Vec.load(w2v_sentpfile+'.gensimmodel')
wordp_w2v_vocab, _ = load_vocab(w2v_sentpfile+'.json')
len(wordp_w2v_vocab)

27112

In [11]:
# Same query as for the mecab model, against the sentence-piece model.
wordp_w2v_model.wv.most_similar('한국')

[('일본', 0.6959158182144165),
 ('미국', 0.6691223382949829),
 ('중국', 0.6461528539657593),
 ('우리나라', 0.6356861591339111),
 ('대한민국', 0.6356850862503052),
 ('국내', 0.6188637018203735),
 ('영국', 0.584417462348938),
 ('우리', 0.5596229434013367),
 ('▁한국', 0.5501009821891785),
 ('북한', 0.5448728799819946)]

### characters

In [12]:
# Load the saved character-level model and its token -> index mapping
# (the inverse mapping is discarded), then display the vocabulary size.
chara_w2v_model = Word2Vec.load(w2v_charafile+'.gensimmodel')
chara_w2v_vocab, _ = load_vocab(w2v_charafile+'.json')
len(chara_w2v_vocab)

7320

In [13]:
# Query the character-level model for the characters most similar to a single character.
chara_w2v_model.wv.most_similar('한')

[('했', 0.4814152717590332),
 ('된', 0.45396751165390015),
 ('하', 0.4360974431037903),
 ('해', 0.4352923631668091),
 ('룬', 0.4349149465560913),
 ('할', 0.43141090869903564),
 ('깬', 0.40105998516082764),
 ('쓴', 0.39839059114456177),
 ('킨', 0.3684242367744446),
 ('뗀', 0.3615623414516449)]