In [2]:
import itertools
import random
import MeCab
from urllib import request 
from pathlib import Path
import re
import numpy as np
from scipy.special import digamma

try:
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        print("notebook")
        from tqdm import tqdm_notebook as tqdm
    else:
        raise RuntimeError
except (NameError, RuntimeError):
    from tqdm import tqdm



notebook


In [3]:
mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")

In [4]:
res = request.urlopen("http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt")
stopwords = [line.decode("utf-8").strip() for line in res]
print(len(stopwords))

330


In [5]:
class Tokenizer:
    def __init__(self, stopwords, parser=None, include_pos=None, exclude_posdetail=None, exclude_reg=None):
    
        self.stopwords = stopwords
        self.include_pos = include_pos if include_pos else  ["名詞", "動詞", "形容詞"]
        self.exclude_posdetail = exclude_posdetail if exclude_posdetail else ["接尾", "数"]
        self.exclude_reg = exclude_reg if exclude_reg else r"$^"  # no matching reg
        if parser:
            self.parser = parser
        else:
            mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/")
            self.parser = mecab.parse
            

    def tokenize(self, text, show_pos=False):
        text = re.sub(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", "", text)    #URL
        text = re.sub(r"\"?([-a-zA-Z0-9.`?{}]+\.jp)\"?" ,"", text)  # xxx.jp 
        text = text.lower()
        l = [line.split("\t") for line in self.parser(text).split("\n")]
        res = [
            i[2] if not show_pos else (i[2],i[3]) for i in l 
                if len(i) >=4 # has POS.
                    and i[3].split("-")[0] in self.include_pos
                    and i[3].split("-")[1] not in self.exclude_posdetail
                    and not re.search(r"(-|−)\d", i[2])
                    and not re.search(self.exclude_reg, i[2])
                    and i[2] not in self.stopwords          
            ]
        return res
t = Tokenizer(stopwords + ["…。"] , mecab.parse, exclude_reg=r"\d(年|月|日)")

In [6]:
doc_path = "./text/"
doc_dir = Path(doc_path)
dirs = [i for i in doc_dir.iterdir() if i.is_dir()]
dirs

[PosixPath('text/movie-enter'),
 PosixPath('text/it-life-hack'),
 PosixPath('text/kaden-channel'),
 PosixPath('text/topic-news'),
 PosixPath('text/livedoor-homme'),
 PosixPath('text/peachy'),
 PosixPath('text/sports-watch'),
 PosixPath('text/dokujo-tsushin'),
 PosixPath('text/smax')]

In [7]:
articles = [a for categ in dirs for a in categ.iterdir()]

In [8]:
len(articles)

7376

In [9]:
articles = articles[:50]

In [10]:
class Doc_manager():
    def __init__(self, docs):
        self.docs = docs
        
    def read_doc(self, doc_id):
        with self.docs[doc_id].open() as f:
            print(f.read())

In [11]:
dm = Doc_manager(articles)

In [13]:
docs = []
for a in tqdm(articles):
    with a.open() as f:
        f.readline()
        f.readline()
        docs.append(t.tokenize(f.read()))




In [14]:
dictionary = {}
vocab = set(w for d in docs for w in d)
word2id = dict(zip(vocab, itertools.count()))
id2word = dict(list(enumerate(vocab)))

In [15]:
corpus = []
for d in docs:
    corpus.append([word2id[w] for w in d])

In [16]:
random.shuffle(corpus)

In [17]:
id2word[0]

'職場'

In [18]:
test_size = int(len(corpus) * 0.3)
test_corpus = corpus[:test_size]
train_corpus= corpus[test_size:]

In [19]:
M = len(train_corpus)
V =  len(vocab)
k = 20

In [20]:
print(M,V,k)

35 4764 20


In [23]:
gamma_ = np.random.rand(M, k)
lambda_ = np.random.rand(V, k)
q_ = np.random.rand(M,V,k)

In [24]:
print(gamma_.shape)
print(lambda_.shape)
print(q_.shape)

(35, 20)
(4764, 20)
(35, 4764, 20)


In [25]:
from scipy.special import digamma

In [26]:
gamma_ = np.random.rand(M, k)
lambda_ = np.random.rand(V, k)
q_ = np.random.rand(M,V,k)

def get_per(corpus, alpha, eta, n_itr=100):
    perplexity = 0.0
    N = 0
    for d in corpus:
        N += len(d)
    for _ in range(n_itr):
        theta = np.array([np.random.dirichlet(a) for a in alpha])
        beta = np.array([np.random.dirichlet(e) for e in eta.T])
        m = np.inner(theta, beta.T)
        log_p = 0.0
        for i, d in enumerate(corpus):
            log_p += np.log(m[i][d]).sum()
        perplexity += np.exp(-log_p/N)
    perplexity = perplexity/n_itr
    return perplexity

In [27]:
alpha = gamma_
eta = lambda_
get_per(test_corpus, alpha, eta, n_itr=100)

5576.0176935432455

In [28]:
N = 0
for d in test_corpus:
        N += len(d)
print(N)

4239


In [29]:
for itr in tqdm(range(50)):
    
    for d in range(M):
        doc = train_corpus[d]
        N_d = len(doc)
        for n in range(N_d):
            gamma_sum = gamma_.sum(axis=1)
            lambda_sum = lambda_.sum(axis=0)
            w = int(doc[n])

            q_[d,w] = np.exp(digamma(gamma_[d]) - digamma(gamma_sum)[d] + digamma(lambda_[w]) - digamma(lambda_sum))
            q_ = q_/q_.sum()
            gamma_[d] += q_.sum(axis=1)[d]
            lambda_[w] += q_.sum(axis=0)[w]
            

    alpha = gamma_ - q_.sum(axis=1)
    eta = lambda_ - q_.sum(axis=0)
    
    perplexity = get_per(test_corpus, alpha, eta, n_itr=200)
    print(itr, ": ", perplexity)


0 :  5314.943479750492
1 :  5195.217753467785
2 :  5111.026171381576
3 :  5047.939355209994
4 :  4973.880596674787
5 :  4893.566145257753
6 :  4834.691972501662
7 :  4748.105801850343
8 :  4678.647368751252
9 :  4602.4488802464675
10 :  4521.260414804595
11 :  4445.274752476373
12 :  4365.377202171497
13 :  4289.469088488443
14 :  4207.438404841112
15 :  4136.1289712688285
16 :  4068.548895399984
17 :  3998.350912163145
18 :  3940.502327263738
19 :  3883.769334117448
20 :  3841.410098102611
21 :  3806.064044578289
22 :  3786.6186937682055
23 :  3775.0701246693106
24 :  3780.2993390204924
25 :  3796.926603845982
26 :  3830.1465634823817
27 :  3875.9739458304725
28 :  3928.8825577777475
29 :  4007.586888187217
30 :  4093.5786286930706
31 :  4213.5131537868465
32 :  4327.414587680403
33 :  4478.269655954258
34 :  4627.5842399329795
35 :  4799.646767563641
36 :  4975.575916964332
37 :  5164.2798536396685
38 :  5358.374112847642
39 :  5563.220545263122
40 :  5780.550797380748
41 :  5997.298

In [None]:
k=4
14 :  5338.01566202
    
k=5
20 :  4814.45180145
    
k=6
20 :  4735.64121761
    
k=7    
20 :  4694.68933233

k=8
21 :  4635.48873153
    
k=9
24 :  4474.1900581
    
k=10
29 :  4421.17836746

In [31]:
theta = np.array([np.random.dirichlet(a) for a in alpha])
beta = np.array([np.random.dirichlet(e) for e in eta])

In [32]:
for i in range(k):
    print("# ", i)
    for t in sorted([(i,rate) for (i,rate) in enumerate(beta.T[i])], key=lambda t: t[1],reverse=True)[:10]:
        print(id2word[t[0]]," : ",t[1])
    print("\n")

#  0
気味  :  0.6239974196633218
仮面ライダーディケイド  :  0.550002625377603
フェンシング  :  0.5305533897272142
経験  :  0.5258745563426709
配役  :  0.5247843349045276
費  :  0.5129119445826025
連載小説  :  0.49354545769959257
映画界  :  0.48986686585353545
行き着い  :  0.48729532344139814
チェック  :  0.48125785886106737


#  1
先日  :  0.675954595493469
視野  :  0.5600913147302473
肉体  :  0.5040535279812864
競演  :  0.48290636687128646
ok  :  0.48160095040546885
収容  :  0.48013825323949866
兼ね備え  :  0.4694183366394489
前田旺志郎  :  0.46309297820766804
閉鎖  :  0.45856396857489934
ガッツ  :  0.4542355524795075


#  2
脚本  :  0.5062366285166927
バース  :  0.5054053307613474
草木  :  0.4828877913019305
燃料  :  0.47465927828613214
答える  :  0.47309643170500354
異なる  :  0.4643383977757699
早  :  0.44657344995565007
お前  :  0.4450108161323171
編集部  :  0.4371959397225338
薄く  :  0.4354819977563358


#  3
bad  :  0.6970073333462452
頂ける  :  0.5463353912014385
反応  :  0.5123431849390175
圧倒的  :  0.5073984612560681
モト冬樹  :  0.4761974464726004
衣装  :  0.463232439136