## Install MeCab on macOS
```
brew install mecab mecab-ipadic  
pip install mecab-python3
```

## Install the mecab-ipadic-NEologd dictionary
```
git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
cd mecab-ipadic-neologd
./bin/install-mecab-ipadic-neologd -n
```

## get Japanese articles
livedoor ニュースコーパス

```
wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
tar xvzf ldcc-20140209.tar.gz
```

In [None]:
import MeCab
from urllib import request 
from pathlib import Path
from gensim import corpora, models
import numpy as np
import tqdm

In [None]:
# Shared MeCab tagger: ChaSen-style tab-separated output (-Ochasen), using the
# NEologd dictionary passed via -d.
# NOTE(review): the dictionary path is hard-coded; presumably the Homebrew
# install location on the author's machine -- adjust for other environments.
mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")

In [None]:
# Inspect the raw tagger output for a sample sentence: one list of
# tab-separated fields per output line.
# Lines are split on "\n" (as `tokenizer` does); the delimiter "\t\t\n" only
# matches lines whose last two fields are empty, so any other line would stay
# glued to the following one.
[line.split("\t") for line in mecab.parse("今日も1日がんばるぞい！").split("\n")]

In [None]:
# Root of the extracted corpus; each sub-directory is kept, plain files
# sitting directly under the root are ignored.
doc_dir = Path("./text/")
dirs = list(filter(Path.is_dir, doc_dir.iterdir()))
dirs

In [None]:
# Flatten the per-directory listings into a single list of article paths.
articles = [
    article_path
    for category_dir in dirs
    for article_path in category_dir.iterdir()
]

In [None]:
# Total number of article paths collected above.
len(articles)

In [None]:
# Download the SlothLib Japanese stop-word list and keep one stripped entry
# per line of the response.
# The response is used as a context manager so the underlying HTTP connection
# is closed as soon as the list has been read.
with request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt") as res:
    stopwords = [line.decode("utf-8").strip() for line in res]
#stopwords

In [None]:
def tokenizer(text):
    """Split text into content-word tokens using the module-level MeCab tagger.

    Every line of the tagger output is split on tabs. A token is kept when:

    * the line has at least four tab-separated fields,
    * field 3 (the part-of-speech string) contains 名詞, 動詞 or 形容詞,
    * field 3 contains neither 数 nor 助動詞 (the latter would otherwise
      match the 動詞 substring), and
    * field 0 (the surface form) is not one of the module-level ``stopwords``.

    Args:
        text: Raw text to analyse.

    Returns:
        list[str]: Surface forms of the kept tokens, in output order.
    """
    # Build the lookup set once per call: testing against the stop-word list
    # directly is a linear scan for every single token.
    stopword_set = set(stopwords)
    wanted_pos = ("名詞", "動詞", "形容詞")
    unwanted_pos = ("数", "助動詞")

    tokens = []
    for line in mecab.parse(text).split("\n"):
        fields = line.split("\t")
        if len(fields) < 4:
            # Too few columns to carry a part-of-speech field.
            continue
        surface, pos = fields[0], fields[3]
        if not any(tag in pos for tag in wanted_pos):
            continue
        if any(tag in pos for tag in unwanted_pos):
            continue
        if surface in stopword_set:
            continue
        tokens.append(surface)
    return tokens


tokenizer("認めたくないものだな。自分自身の若さ故の過ちというものを。")

In [None]:
# Show the raw tab-separated fields MeCab returns for the sample sentence, to
# compare against what `tokenizer` keeps.
[line.split("\t") for line in mecab.parse("認めたくないものだな、自分自身の若さ故の過ちというものを").split("\n")]

In [None]:
# Tokenize every article; docs[i] is the token list for articles[i].
docs = []
for a in tqdm.tqdm(articles):
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 on every system.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with a.open(encoding="utf-8") as f:
        # Skip the first two lines of each file.
        # NOTE(review): presumably per-article header lines (URL / timestamp)
        # -- confirm against the corpus README.
        f.readline()
        f.readline()
        docs.append(tokenizer(f.read()))

In [None]:
# Peek at the first ten tokens of the first document.
docs[0][:10]

In [None]:
# Build the token <-> id vocabulary from the tokenized documents, then prune
# it with the two document-frequency thresholds below.
d = corpora.Dictionary(docs)

no_below = 5    # lower document-count threshold passed to filter_extremes
no_above = 0.2  # upper document-fraction threshold passed to filter_extremes
d.filter_extremes(no_below=no_below, no_above=no_above)

# Reassign ids so that they are contiguous after pruning.
d.compactify()

In [None]:
# Vocabulary size after filtering.
dic_num = len(d)
dic_num

In [None]:
# Bag-of-words encoding of the first ten tokens of the first document.
d.doc2bow(docs[0][:10])

Each entry of the result is a `(token id, count)` pair.

In [None]:
d[7] # look up the token whose id is 7

In [None]:
# Encode each tokenized document as a bag-of-words vector.
corpus = [d.doc2bow(tokens) for tokens in docs]

In [None]:
# Total number of token occurrences across the corpus: the sum of every count
# in every bag-of-words vector.
corpus_words = sum(freq for bow in corpus for _, freq in bow)
corpus_words

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
# NOTE(review): no random seed is passed, so results presumably differ
# between runs -- confirm whether reproducibility matters here.
lda = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=d,
    num_topics=10,
    iterations=100,
    passes=5,
)

In [None]:
# Show five of the fitted topics with their highest-weighted words.
lda.print_topics(5)

## Visualize with the TensorBoard Embedding Projector

http://projector.tensorflow.org/  
ref: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Tensorboard_visualizations.ipynb


In [None]:
# Topic distribution of every document. minimum_probability=0 is intended to
# keep low-weight topics too, so each document yields a full-length vector.
# NOTE(review): confirm the installed gensim does not still drop near-zero
# entries.
all_topics = lda.get_document_topics(corpus, minimum_probability=0)
all_topics[0]

In [None]:
# Dump one row per document for the projector: each topic weight followed by
# a tab, then a newline.
with open('doc_lda_tensor.tsv','w') as w:
    for doc_topics in all_topics:
        # entry[1] is the weight of a (topic, weight) entry.
        row = "".join(str(entry[1]) + "\t" for entry in doc_topics)
        w.write(row + "\n")

In [None]:
# Path components of every article, e.g. [root, category, filename] for the
# relative corpus root used here.
# `Path.parts` yields the same components as splitting the string on "/" but
# does not depend on the platform's path separator.
meta = [list(a.parts) for a in articles]

In [None]:
# Path components of the first article.
meta[0]

In [None]:
# Projector metadata: a header line, then one row per article in the same
# order as the tensor file. Column 1 is the first two characters of the second
# path component, column 2 is that component in full.
with open('doc_lda_metadata.tsv','w') as w:
    w.write('Titles\tGenres\n')
    w.writelines("%s\t%s\n" % (parts[1][:2], parts[1]) for parts in meta)

## perplexity

In [None]:
# Per-word likelihood bound of the model, evaluated on the training corpus.
lda.log_perplexity(corpus)

Estimate the variational bound of documents from corpus: E_q[log p(corpus)] - E_q[log q(corpus)]

In [None]:
# The same quantity computed by hand: the corpus-level variational bound
# divided by the total number of token occurrences.
bound = lda.bound(corpus)/corpus_words
bound # NOTE(review): expected to equal lda.log_perplexity(corpus) -- confirm

In [None]:
import numpy as np

In [None]:
perplexity = np.exp2(-bound ) # perplexity = 2 ** (-per-word bound)

In [None]:
# Topics with their top 10 words each, as returned by gensim's `top_topics`.
# NOTE(review): some gensim releases name this parameter `topn` instead of
# `num_words` -- confirm against the installed version.
lda.top_topics(corpus=corpus, num_words=10)

## 類似度

In [None]:
# Persist the bag-of-words corpus in Blei's LDA-C format.
# Passing id2word writes the real tokens to ./corpus.blei.vocab; without it
# gensim derives a placeholder vocabulary from the corpus, so the model trained
# on the reloaded corpus would show ids instead of words in its topics.
corpora.BleiCorpus.serialize("./corpus.blei", corpus, id2word=d)

In [None]:
# Reload the serialized corpus together with its vocabulary file.
blei_corpus = corpora.BleiCorpus('./corpus.blei', './corpus.blei.vocab')

In [None]:
# Second, finer-grained LDA model (100 topics) trained on the reloaded corpus;
# its document-topic vectors feed the similarity search below.
model = models.ldamodel.LdaModel(
    corpus=blei_corpus,
    id2word=blei_corpus.id2word,
    num_topics=100,
)

In [None]:
# Infer the sparse topic mixture of every document.
topics = [model[bow] for bow in blei_corpus]

In [None]:
# Sparse topic mixture of the first document.
topics[0]

In [None]:
# Expand the sparse (topic index, weight) lists into a dense
# documents x topics matrix; topics absent from a document's list stay 0.
# The width comes from the model rather than a literal, so it cannot drift
# out of sync with the `num_topics` the model was trained with.
dense = np.zeros((len(topics), model.num_topics), float)
for ti, t in enumerate(topics):
    for tj, v in t:
        dense[ti, tj] = v

In [None]:
# Inspect the dense documents x topics matrix.
dense

In [None]:
from scipy.spatial import distance

# Cosine distance between every pair of rows of `dense`: `pdist` returns the
# condensed form, which `squareform` expands into a full square matrix.
pairwise = distance.squareform(
    distance.pdist(dense, metric="cosine")
)

In [None]:
# Push every self-distance above the largest real distance so that a document
# is never selected as its own nearest neighbour.
largest = pairwise.max()
np.fill_diagonal(pairwise, largest + 1)

In [None]:
def closest_to(doc_id):
    """Return the index of the smallest entry in row ``doc_id`` of ``pairwise``.

    With the diagonal of ``pairwise`` raised above every real distance, this
    is the index of the document closest to ``doc_id``.
    """
    distances = pairwise[doc_id]
    return distances.argmin()

In [None]:
def read_doc(doc_id):
    """Print the full text of the article at index ``doc_id`` in ``articles``.

    The file is decoded explicitly as UTF-8 instead of relying on the platform
    default encoding, which is not UTF-8 on every system.
    NOTE(review): assumes the corpus files are UTF-8 -- confirm.

    Args:
        doc_id: Position of the article in the module-level ``articles`` list.
    """
    with articles[doc_id].open(encoding="utf-8") as f:
        print(f.read())

In [None]:
# Index of the document nearest to document 3.
closest_to(3)

In [None]:
# NOTE(review): hard-coded index, presumably taken from an earlier run's
# output -- it may not correspond to the same article on another run.
read_doc(4772)

In [None]:
# NOTE(review): hard-coded index, presumably taken from an earlier run's
# output -- it may not correspond to the same article on another run.
read_doc(4442)