## install mecab on mac
```
brew install mecab mecab-ipadic  
pip install mecab-python3
```

## install neolog
```
git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
cd mecab-ipadic-neologd
./bin/install-mecab-ipadic-neologd -n
```

## get Japanese articles
livedoor ニュースコーパス

```
wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
tar xvzf ldcc-20140209.tar.gz
```

In [1]:
import MeCab
from urllib import request 
from pathlib import Path
from gensim import corpora, models
import numpy as np
import tqdm

In [2]:
mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")

In [4]:
[line.split("\t") for line in mecab.parse("今日も1日がんばるぞい！").split("\t\t\n")]

[['今日', 'キョウ', '今日', '名詞-副詞可能'],
 ['も', 'モ', 'も', '助詞-係助詞'],
 ['1日', 'ツイタチ', '1日', '名詞-固有名詞-一般'],
 ['がん', 'ガン', 'がん', '名詞-一般'],
 ['ばる', 'バル', 'バル', '名詞-一般'],
 ['ぞい', 'ゾイ', 'ぞい', '名詞-接尾-一般'],
 ['！', '！', '！', '記号-一般'],
 ['EOS\n']]

In [5]:
doc_dir = Path("./text/")
dirs = [i for i in doc_dir.iterdir() if i.is_dir()]
dirs

[PosixPath('text/dokujo-tsushin'),
 PosixPath('text/it-life-hack'),
 PosixPath('text/kaden-channel'),
 PosixPath('text/livedoor-homme'),
 PosixPath('text/movie-enter'),
 PosixPath('text/peachy'),
 PosixPath('text/smax'),
 PosixPath('text/sports-watch'),
 PosixPath('text/topic-news')]

In [6]:
articles = [a for categ in dirs for a in categ.iterdir()]

In [7]:
len(articles)

7376

In [8]:
res = request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt")
stopwords = [line.decode("utf-8").strip() for line in res]
#stopwords

In [44]:
def tokenizer(text):
    l = [line.split("\t") for line in mecab.parse(text).split("\n")]
    res = [i[0] for i in l 
                   if len(i) >=4 
                       and ("名詞" in i[3] or "動詞" in i[3] or "形容詞" in i[3] )
                       and "数" not in i[3] and "助動詞" not in i[3] 
                       and i[0] not in stopwords
            ]
    return res
tokenizer("認めたくないものだな、自分自身の若さ故の過ちというものを")

['認め', '自分自身', '若さ故の過ち']

In [40]:
[line.split("\t") for line in mecab.parse("認めたくないものだな、自分自身の若さ故の過ちというものを").split("\n")]

[['認め', 'ミトメ', '認める', '動詞-自立', '一段', '連用形'],
 ['たく', 'タク', 'たい', '助動詞', '特殊・タイ', '連用テ接続'],
 ['ない', 'ナイ', 'ない', '助動詞', '特殊・ナイ', '基本形'],
 ['もの', 'モノ', 'もの', '名詞-非自立-一般', '', ''],
 ['だ', 'ダ', 'だ', '助動詞', '特殊・ダ', '基本形'],
 ['な', 'ナ', 'な', '助詞-終助詞', '', ''],
 ['、', '、', '、', '記号-読点', '', ''],
 ['自分自身', 'ジブンジシン', '自分自身', '名詞-固有名詞-一般', '', ''],
 ['の', 'ノ', 'の', '助詞-連体化', '', ''],
 ['若さ故の過ち', 'ワカサユエノアヤマチ', '若さ故の過ち', '名詞-固有名詞-一般', '', ''],
 ['という', 'トイウ', 'という', '助詞-格助詞-連語', '', ''],
 ['もの', 'モノ', 'もの', '名詞-非自立-一般', '', ''],
 ['を', 'ヲ', 'を', '助詞-格助詞-一般', '', ''],
 ['EOS'],
 ['']]

In [11]:
docs = []
for a in tqdm.tqdm(articles):
    with a.open() as f:
        f.readline()
        f.readline()
        docs.append(tokenizer(f.read()))

100%|██████████| 7376/7376 [00:24<00:00, 297.63it/s]


In [12]:
docs[0][:10]

['友人', '代表', 'スピーチ', '独女', 'ジューン・ブライド', '独女', 'お祝い', '貧乏', '状態', '出席']

In [13]:
no_below = 5
no_above = 0.2
d = corpora.Dictionary(docs)
d.filter_extremes(no_below, no_above)
d.compactify()

In [14]:
dic_num = len(d)
dic_num

20849

In [15]:
d.doc2bow(docs[0][:10])

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1)]

(id, count)

In [16]:
d[7] # get word from id

'出席'

In [17]:
corpus = [d.doc2bow(w) for w in docs]

In [None]:
corpus_words = sum(count for doc in corpus for id, count in doc)
corpus_words

In [None]:
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=d, num_topics=10, iterations=100, passes=5)

In [None]:
lda.print_topics(5)

## see on tensorboard

http://projector.tensorflow.org/  
ref: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Tensorboard_visualizations.ipynb


In [None]:
all_topics = lda.get_document_topics(corpus, minimum_probability=0)
all_topics[0]

In [None]:
with open('doc_lda_tensor.tsv','w') as w:
    for doc_topics in all_topics:
        for topics in doc_topics:
            w.write(str(topics[1])+ "\t")
        w.write("\n")    

In [None]:
meta = [str(a).split("/") for a in articles]

In [None]:
meta[0]

In [None]:
with open('doc_lda_metadata.tsv','w') as w:
    w.write('Titles\tGenres\n')
    for m in meta:
        w.write("%s\t%s\n" % (m[1][:2], m[1]))

## perplexity

In [None]:
lda.log_perplexity(corpus)

Estimate the variational bound of documents from corpus: E_q[log p(corpus)] - E_q[log q(corpus)]

In [None]:
bound = lda.bound(corpus)/corpus_words
bound #= lda.log_perplexity(corpus)

In [None]:
import numpy as np

In [None]:
perplexity = np.exp2(-bound ) # 2^(-bound per words)

In [None]:
lda.top_topics(corpus=corpus, num_words=10)

## 類似度

In [18]:
corpora.BleiCorpus.serialize("./corpus.blei", corpus)

In [30]:
blei_corpus = corpora.BleiCorpus('./corpus.blei', './corpus.blei.vocab')

In [32]:
model = models.ldamodel.LdaModel(
blei_corpus,
num_topics=100,
id2word=blei_corpus.id2word)

In [21]:
topics = [model[c] for c in blei_corpus]

In [22]:
topics[0]

[(3, 0.39135479635292925),
 (6, 0.28314725913005617),
 (7, 0.158951553032459),
 (8, 0.16203422527592143)]

In [23]:
dense = np.zeros( (len(topics), 100), float)
for ti,t in enumerate(topics):
    for tj,v in t:
        dense[ti,tj] = v

In [25]:
dense

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.06862076,  0.        ,  0.18922655, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.49557075,  0.02240851,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.14817951, ...,  0.        ,
         0.        ,  0.        ]])

In [24]:
from scipy.spatial import distance
pairwise = distance.squareform(distance.pdist(dense,"cosine"))

In [None]:
largest = pairwise.max()
for ti in range(len(topics)):
    pairwise[ti,ti] = largest + 1

In [None]:
def closest_to(doc_id):
    return pairwise[doc_id].argmin()

In [None]:
def read_doc(doc_id):
    with articles[doc_id].open() as f:
        print(f.read())

In [None]:
closest_to(3)

In [None]:
read_doc(4772)

In [None]:
read_doc(4442)