## install mecab on mac
```
brew install mecab mecab-ipadic  
pip install mecab-python3
```

## install neolog
```
git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
cd mecab-ipadic-neologd
./bin/install-mecab-ipadic-neologd -n
```

## get Japanese articles
livedoor ニュースコーパス

```
wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
tar xvzf ldcc-20140209.tar.gz
```

In [1]:
import MeCab
from urllib import request 
from pathlib import Path
from gensim import corpora, models
import tqdm

In [2]:
mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")

In [3]:
[line.split("\t") for line in mecab.parse("今日も1日がんばるぞい！1000").split("\t\t\n")]

[['今日', 'キョウ', '今日', '名詞-副詞可能'],
 ['も', 'モ', 'も', '助詞-係助詞'],
 ['1日', 'ツイタチ', '1日', '名詞-固有名詞-一般'],
 ['がん', 'ガン', 'がん', '名詞-一般'],
 ['ばる', 'バル', 'バル', '名詞-一般'],
 ['ぞい', 'ゾイ', 'ぞい', '名詞-接尾-一般'],
 ['！', '！', '！', '記号-一般'],
 ['1000', '1000', '1000', '名詞-数'],
 ['EOS\n']]

In [4]:
doc_dir = Path("./text/")
dirs = [i for i in doc_dir.iterdir() if i.is_dir()]
dirs

[PosixPath('text/dokujo-tsushin'),
 PosixPath('text/it-life-hack'),
 PosixPath('text/kaden-channel'),
 PosixPath('text/livedoor-homme'),
 PosixPath('text/movie-enter'),
 PosixPath('text/peachy'),
 PosixPath('text/smax'),
 PosixPath('text/sports-watch'),
 PosixPath('text/topic-news')]

In [7]:
articles = [a for categ in dirs for a in categ.iterdir()]

In [8]:
len(articles)

7376

In [9]:
res = request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt")
stopwords = [line.decode("utf-8").strip() for line in res]
#stopwords

In [10]:
def tokenizer(text):
    l = [line.split("\t") for line in mecab.parse(text).split("\t\t\n")]
    res = [i[0] for i in l 
                   if len(i)==4 
                       and "名詞" in i[3] 
                       and "数" not in i[3] 
                       and i[0] not in stopwords
            ]
    return res
tokenizer("青葉「ま・・・まさか！正社員ってお給料を安くするための法の抜け穴・・・」1000")

['青葉', '正社員', 'お', '給料', '抜け穴']

In [11]:
docs = []
for a in tqdm.tqdm(articles):
    with a.open() as f:
        f.readline()
        f.readline()
        docs.append(tokenizer(f.read()))

100%|██████████| 7376/7376 [00:25<00:00, 293.51it/s]


In [66]:
docs[0][:10]

['友人', '代表', 'スピーチ', '独女', 'ジューン・ブライド', '独女', 'お祝い', '貧乏', '状態', '出席']

In [49]:
no_below = 5
no_above = 0.2
d = corpora.Dictionary(docs)
d.filter_extremes(no_below, no_above)
d.compactify()

In [71]:
dic_num = len(d)
dic_num

20849

In [57]:
d.doc2bow(docs[0][:10])

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1)]

(id, count)

In [73]:
d[7] # get word from id

'出席'

In [13]:
corpus = [d.doc2bow(w) for w in docs]

In [70]:
corpus_words = sum(count for doc in corpus for id, count in doc)
corpus_words

1110758

In [15]:
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=d, num_topics=10, iterations=100, passes=5)

In [16]:
lda.print_topics(5)

[(6,
  '0.011*"仕事" + 0.010*"女性" + 0.006*"男性" + 0.005*"転職" + 0.005*"会社" + 0.005*"livedoor" + 0.004*"結婚" + 0.004*"の" + 0.004*"年収" + 0.003*"相手"'),
 (0,
  '0.015*"氏" + 0.010*"放送" + 0.010*"韓国" + 0.008*"番組" + 0.008*"選手" + 0.007*"声" + 0.007*"批判" + 0.007*"試合" + 0.006*"監督" + 0.006*"同"'),
 (3,
  '0.007*"ネット掲示板" + 0.007*"アナ" + 0.006*"ファッション" + 0.005*"自殺" + 0.004*"球団" + 0.004*"ボール" + 0.004*"女子" + 0.004*"物議" + 0.004*"大" + 0.004*"足"'),
 (1,
  '0.013*"更新" + 0.013*"対応" + 0.012*"D" + 0.011*"搭載" + 0.010*"スマートフォン" + 0.010*"S" + 0.009*"ソフトウェア" + 0.009*"機能" + 0.008*"MAX" + 0.007*"利用"'),
 (4,
  '0.025*"映画" + 0.014*"公開" + 0.009*"監督" + 0.007*"作品" + 0.006*"本作" + 0.005*"世界" + 0.005*"役" + 0.005*"大" + 0.004*"特集" + 0.004*"主演"')]

## see on tensorboard

http://projector.tensorflow.org/  
ref: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Tensorboard_visualizations.ipynb


In [18]:
all_topics = lda.get_document_topics(corpus, minimum_probability=0)
all_topics[0]

[(0, 0.24285481438366582),
 (1, 0.00075203810829471934),
 (2, 0.12574257757405069),
 (3, 0.00075203970888466584),
 (4, 0.00075196348648189158),
 (5, 0.0007520392488661945),
 (6, 0.50031870821471058),
 (7, 0.00075199303843009706),
 (8, 0.056999363162606884),
 (9, 0.070324463074008664)]

In [29]:
with open('doc_lda_tensor.tsv','w') as w:
    for doc_topics in all_topics:
        for topics in doc_topics:
            w.write(str(topics[1])+ "\t")
        w.write("\n")    

In [30]:
meta = [str(a).split("/") for a in articles]

In [31]:
meta[0]

['text', 'dokujo-tsushin', 'dokujo-tsushin-4778030.txt']

In [35]:
with open('doc_lda_metadata.tsv','w') as w:
    w.write('Titles\tGenres\n')
    for m in meta:
        w.write("%s\t%s\n" % (m[1][:2], m[1]))

In [37]:
lda.log_perplexity(corpus)

-8.3996888956204021

Estimate the variational bound of documents from corpus: E_q[log p(corpus)] - E_q[log q(corpus)]

In [81]:
bound = lda.bound(corpus)/corpus_words
bound #= lda.log_perplexity(corpus)

-8.3996936772887612

In [40]:
import numpy as np

In [79]:
perplexity = np.exp2(-bound ) # 2^(-bound per words)

337.72179225914755

In [84]:
lda.top_topics(corpus=corpus, num_words=10)

[([(0.01328046851550143, '更新'),
   (0.013236753901677648, '対応'),
   (0.012495687604693734, 'D'),
   (0.010980747308656643, '搭載'),
   (0.010283676558696307, 'スマートフォン'),
   (0.010155979436757315, 'S'),
   (0.0091380964103196013, 'ソフトウェア'),
   (0.0087274904173353825, '機能'),
   (0.0078818169260381817, 'MAX'),
   (0.0066500305949472151, '利用')],
  -39.204929033239843),
 ([(0.016670690028008028, 'ソフトバンク'),
   (0.016637290559786119, 'MAX'),
   (0.01361713936911171, 'S'),
   (0.010345780515399874, 'Twitter'),
   (0.0090625231312763347, '向け'),
   (0.0084070588020175412, 'アプリ'),
   (0.0083205884269381119, 'smaxjp'),
   (0.0079523490846985807, 'Android'),
   (0.0076016741217253096, 'on'),
   (0.0070336532209106717, 'HTC')],
  -53.1537712093875),
 ([(0.024672881972101948, '映画'),
   (0.014404324557891146, '公開'),
   (0.0092255439305609036, '監督'),
   (0.0074308503763662144, '作品'),
   (0.0064192282337798253, '本作'),
   (0.0052587949562587994, '世界'),
   (0.004572903582255333, '役'),
   (0.0045680349179048