<a href="https://colab.research.google.com/github/ShinAsakawa/ShinAsakawa.github.io/blob/master/notebooks/2021_0429onomatopia_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# オノマトペの音韻表現を得る

In [1]:
# onomatopea_list.txt is the data file holding the words, out of a list of
# about 4500 onomatopoeia, that had an entry in wikipedia_ja.
# Load the data.

from google.colab import files
files.upload()  # interactive upload; presumably places the file in the working directory
# The file contains Japanese text, so decode it explicitly as UTF-8 instead
# of relying on the platform default encoding.
with open('onomatopea_list.txt', 'r', encoding='utf-8') as f:
    a = f.readlines()
onmtp_list = [w.strip() for w in a]  # strip the trailing newline from every line

!wget https://raw.githubusercontent.com/ShinAsakawa/ShinAsakawa.github.io/master/2020ccap/ja_util.py
# ja_util.py

--2021-04-29 04:55:44--  https://raw.githubusercontent.com/ShinAsakawa/ShinAsakawa.github.io/master/2020ccap/ja_util.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25198 (25K) [text/plain]
Saving to: ‘ja_util.py.2’


2021-04-29 04:55:44 (24.0 MB/s) - ‘ja_util.py.2’ saved [25198/25198]



In [3]:
#!pip install mecab-python3==0.996.3



In [4]:
# ja_util が  mora_wakati() が開発中のため，再読み込みが必要なので autoreload 2
%load_ext autoreload
%autoreload 2

import sys
sys.path.append(".")
import ja_util

import numpy as np
import json
!pip install jaconv
import jaconv



In [5]:
# Build the dataset: split every word into morae and collect the mora vocabulary.
# One tokenizer is reused for all words (the original rebuilt it per word).
# NOTE(review): assumes mora_wakati().parse keeps no per-call state - confirm in ja_util.
wakati = ja_util.mora_wakati()

onmtp_dict = {}      # word -> morae returned by the tokenizer
seen_morae = set()   # distinct morae; set membership replaces the O(n) list scan
max_word_len = 0     # longest word, counted in characters (not morae)
for word in onmtp_list:
    max_word_len = max(max_word_len, len(word))
    morae = wakati.parse(word)
    seen_morae.update(morae)
    onmtp_dict[word] = morae

# Vocabulary layout: '<eow>' (end of word) is index 0, the sorted morae follow,
# and '<sow>' (start of word) is the last index.
mora_list = ['<eow>'] + sorted(seen_morae) + ['<sow>']

idx2mora = dict(enumerate(mora_list))
mora2idx = {m: i for i, m in idx2mora.items()}

In [6]:
# Encode every word as one row of mora indices: <sow>, the morae, <eow>.
# Unused trailing cells stay 0, which is also the index of '<eow>'.
# dtype=int replaces np.int, an alias that newer NumPy releases no longer provide.
X = np.zeros((len(onmtp_list), max_word_len+2), dtype=int)
for i, word in enumerate(onmtp_list):
    morae = onmtp_dict[word]
    X[i, 0] = mora2idx['<sow>']
    for j, mora in enumerate(morae, start=1):
        X[i, j] = mora2idx[mora]
    # Position the end marker from the mora count rather than from the leaked
    # loop variable, which is stale (or undefined) for a word with no morae.
    X[i, len(morae) + 1] = mora2idx['<eow>']

In [7]:
# Expose the index matrix and the mora vocabulary under the generic names
# (`data`, `chars`) used by the RNN cells.
data, chars = X, mora_list

# n_data: number of rows of X (one per word).
# n_vocab: number of mora symbols, including the <sow>/<eow> markers.
n_vocab = len(mora_list)
n_data = X.shape[0]

In [8]:
# Hyperparameters
n_hid = 20             # number of hidden units
seq_len = X.shape[1]   # number of time steps the RNN is unrolled for
lr = 1e-1              # learning rate

# Weight matrices start as small Gaussian noise, biases as zeros.
# The three random draws are made in the order Wxh, Whh, Why.
Wxh = 0.01 * np.random.randn(n_hid, n_vocab)   # input -> hidden
Whh = 0.01 * np.random.randn(n_hid, n_hid)     # hidden -> hidden
Why = 0.01 * np.random.randn(n_vocab, n_hid)   # hidden -> output
bh, by = np.zeros((n_hid, 1)), np.zeros((n_vocab, 1))   # hidden bias, output bias

In [9]:
def loss_f(inputs, targets, hprev):
    """
    Run one forward and backward pass of the RNN over a sequence.

    Reads the module-level parameters Wxh, Whh, Why, bh, by and n_vocab.

    Args:
        inputs: sequence of integer token indices, one per time step.
        targets: sequence of integer token indices to predict; same length
            as inputs.
        hprev: (H, 1) array holding the initial hidden state.

    Returns:
        Tuple (loss, dWxh, dWhh, dWhy, dbh, dby, h_last): the cross-entropy
        loss summed over the steps, the gradients of the five parameters
        (each clipped element-wise to [-5, 5]), and the hidden state after
        the last step (a copy of hprev when inputs is empty).
    """
    x_state, h_state, prob_state = {}, {}, {}
    h_state[-1] = np.copy(hprev)
    loss = 0

    # forward pass
    for t in range(len(inputs)):
        x_state[t] = np.zeros((n_vocab, 1))  # 1-of-k encoding of the input token
        x_state[t][inputs[t]] = 1
        h_state[t] = np.tanh(np.dot(Wxh, x_state[t]) + np.dot(Whh, h_state[t-1]) + bh)
        y = np.dot(Why, h_state[t]) + by     # unnormalised log probabilities
        # Subtract the largest logit before exponentiating: the softmax value
        # is unchanged, but np.exp can no longer overflow to inf (-> NaN).
        exp_y = np.exp(y - np.max(y))
        prob_state[t] = exp_y / np.sum(exp_y)
        loss += -np.log(prob_state[t][targets[t], 0])  # cross-entropy

    # backward pass: accumulate gradients from the last step to the first
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # Shaped after the initial state so that an empty sequence does not fail
    # on a missing h_state[0].
    dhnext = np.zeros_like(h_state[-1], dtype=float)
    for t in reversed(range(len(inputs))):
        # gradient of softmax + cross-entropy w.r.t. the logits: p - onehot(target)
        dy = np.copy(prob_state[t])
        dy[targets[t]] -= 1
        dWhy += np.dot(dy, h_state[t].T)
        dby += dy

        # into the hidden state: from the output layer and from step t+1
        dh = np.dot(Why.T, dy) + dhnext

        # through the tanh non-linearity
        delta = (1 - h_state[t] * h_state[t]) * dh
        dbh += delta
        dWxh += np.dot(delta, x_state[t].T)
        dWhh += np.dot(delta, h_state[t-1].T)
        dhnext = np.dot(Whh.T, delta)

    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, h_state[len(inputs)-1]

def sample(hprev, seed, n):
    """
    Sample a sequence of token indices from the model.

    Reads the module-level parameters Wxh, Whh, Why, bh, by and n_vocab.

    Args:
        hprev: (H, 1) initial hidden (memory) state.
        seed: index of the token fed in at the first time step.
        n: number of tokens to draw.

    Returns:
        List of the n sampled indices; the seed itself is not included.
    """
    x = np.zeros((n_vocab, 1))
    x[seed] = 1
    hid = hprev
    idxes = []
    for _ in range(n):
        hid = np.tanh(np.dot(Wxh, x) + np.dot(Whh, hid) + bh)
        out = np.dot(Why, hid) + by
        # Shift by the largest logit so np.exp cannot overflow; the resulting
        # distribution is the same softmax.
        exp_out = np.exp(out - np.max(out))
        prob = exp_out / np.sum(exp_out)
        idx = np.random.choice(range(n_vocab), p=prob.ravel())
        # the drawn token becomes the next input
        x = np.zeros((n_vocab, 1))
        x[idx] = 1
        idxes.append(idx)
    return idxes

In [None]:
# Train the mora-level RNN on the onomatopoeia index matrix `data` (= X).
#
# The original cell was copied from the character-level text model: it looked
# characters up in chr2idx / idx2chr and plotted with plt, none of which exist
# yet at this point of the notebook, and it sliced `data` as if it were one
# long string. Here `data` is a 2-D array with one row of mora indices per
# word, so each update trains on one word: <sow> m1 ... mk -> m1 ... mk <eow>.
import matplotlib.pyplot as plt  # not imported by any earlier cell

# Variables starting with m are the Adagrad memories (running sums of squared
# gradients), one per parameter:
# mWxh: input -> hidden weights
# mWhh: recurrent hidden -> hidden weights
# mWhy: hidden -> output weights
# mbh: hidden bias
# mby: output bias
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)

sow_idx, eow_idx = mora2idx['<sow>'], mora2idx['<eow>']


def decode_morae(indices):
    """Join sampled mora indices into a string, stopping at the first <eow>."""
    morae = []
    for ix in indices:
        if ix == eow_idx:
            break
        morae.append(idx2mora[ix])
    return ''.join(morae)


pos = 0  # row of `data` (i.e. word) used for the next update
smooth_loss = -np.log(1.0/n_vocab) * seq_len  # loss value at time 0

max_iter = 10 ** 5
interval = max_iter >> 2   # report four times over the run
n_samples = 10             # words generated at every report
losses = []
for itr in range(max_iter):

    # sweep over the words, wrapping around at the end of the list
    if pos >= n_data:
        pos = 0
    row = data[pos]

    # The word occupies the cells before the first <eow>; the cells after it
    # are padding and are left out of training.
    eow_at = np.flatnonzero(row == eow_idx)
    n_steps = int(eow_at[0]) if eow_at.size else len(row) - 1
    inputs = row[:n_steps].tolist()
    targets = row[1:n_steps + 1].tolist()

    # words are independent sequences, so every word starts from a zero state
    hprev = np.zeros((n_hid,1))

    # sample from the model now and then
    if itr % interval == 0:
        txt = ' '.join(decode_morae(sample(hprev, sow_idx, seq_len))
                       for _ in range(n_samples))
        print(f'--- 反復訓練数={itr} ---\n{txt}\n---')

    # forward the word through the net and fetch the gradients
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = loss_f(inputs, targets, hprev)
    losses.append(loss)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    if itr % interval == 0:
        print(f'反復学習回数:{itr:05d} 損失値:{smooth_loss:.3f}') # print progress

    # parameter update with Adagrad
    for param, grad, cache in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        cache += grad * grad
        param += -lr * grad / np.sqrt(cache + 1e-8)

    pos += 1  # move on to the next word


hprev = np.zeros((n_hid,1)) # reset RNN memory
sample_ix = sample(hprev, sow_idx, seq_len)
txt = decode_morae(sample_ix)
print(f'----\n{txt}\n----')
plt.plot(losses)


In [None]:
# Reload the data for verification.
# NOTE(review): no visible cell writes onomatopea_morae.json - presumably it
# was produced elsewhere; confirm before relying on Run All.
# JSON text is UTF-8, so decode it explicitly rather than with the platform default.
with open('onomatopea_morae.json', 'r', encoding='utf-8') as f:
    onmtp_data = json.load(f)

In [None]:
# Display the first ten items obtained by iterating over onmtp_data.
list(onmtp_data)[:10]

In [None]:
# Print the mora vocabulary in sorted order (a sorted copy; mora_list itself is unchanged).
print(sorted(mora_list))

In [None]:
# Start a fresh mora -> index table that holds only the end-of-word marker.
# This rebinds the notebook-level name mora2idx.
mora2idx = {}
mora2idx['<eow>'] = 0

# a: number of occurrences of each mora
# b: every mora occurrence, in iteration order
# NOTE(review): mora_dict is not defined in any visible cell; the loop treats
# it as a mapping whose values are iterables of morae - confirm its source.
a = {}
b = []
for mora in mora_dict:
    for m in mora_dict[mora]:
        b.append(m)
        # The first occurrence counts as 1; the original initialised it to 0,
        # which made every frequency one too low.
        a[m] = a.get(m, 0) + 1

In [None]:
# Display the current mora -> index table.
mora2idx

In [None]:
# For every distinct mora collected in `b`, taken in sorted order, record its
# rank ('no'), its index in mora2idx ('idx') and its count in `a` ('frq').
m_dict = {}
for i, m in enumerate(sorted(set(b))):
    m_dict[i] = dict(no=i, idx=mora2idx[m], frq=a[m])

In [None]:
# Take the mora <-> index tables from ja_util and add a start-of-word marker.
# NOTE(review): the tables are modified in place, as in the original; whether
# they are shared between mora_wakati instances is not visible here.
wakati = ja_util.mora_wakati()
mora2idx = wakati.mora2idx
idx2mora = wakati.idx2mora

# Reuse an existing '<sow>' index so that re-running this cell does not hand
# out a new index each time.
start_mora = mora2idx.get('<sow>', len(mora2idx))
mora2idx['<sow>'] = start_mora
idx2mora[start_mora] = '<sow>'

end_mora = 0

# Longest word in morae, but at least 9 (the original default). It is computed
# before Z is allocated so that Z is wide enough for every word.
max_mora_len = max([9] + [len(onmtp_data[word]) for word in onmtp_data])

# One row per word: <sow>, the morae, <eow>, hence max_mora_len + 2 columns.
# The original used + 1, which overflows for a word of max_mora_len morae.
# dtype=int replaces np.int, an alias that newer NumPy releases no longer provide.
Z = np.zeros((len(onmtp_data), max_mora_len+2), dtype=int)
for i, word in enumerate(onmtp_data):
    morae = onmtp_data[word]
    Z[i, 0] = start_mora
    for j, mora in enumerate(morae, start=1):
        Z[i, j] = int(mora2idx[mora])
    # placed from the mora count, not from the leaked loop variable
    Z[i, len(morae) + 1] = end_mora

In [None]:
# Print the per-mora count dictionary `a` built by the counting cell.
print(a)
#print(mora_dict)

In [None]:
#max_mora_len = 9
#print(Z[0])
#print(idx2mora)
#print(mora2idx)

In [None]:
# Show the first ten words next to their encoding: every non-zero index in
# the row is printed together with the mora it stands for.
words = list(onmtp_data)
for word, row in zip(words, Z[:10]):
    print(word, end=" ")
    for idx in row[row > 0]:
        print(idx, idx2mora[idx], end=' ')
    print()


In [None]:
import re
#re_mora = re.compile(ja_util.mora_wakati().cond)

# Patterns for multi-character morae and special symbols.
# An earlier c2 line with a typo (a doubled character where a different one
# was meant) was immediately overwritten; only the effective definition is kept.
c1 = '[ウクスツヌフムユルグズヅブプヴ][ァィェォ]' # u-row kana followed by a small a/i/e/o
c2 = '[イキシチニヒミリギジヂビピ][ャュェョ]' # i-row kana (the class does include the plain i kana) followed by a small ya/yu/e/yo
c3 = '[テデ][ィュ]' # te/de followed by a small i or yu
c4 = '[ァィゥェォー]' # a single small vowel kana or the long-vowel mark
c5 = '[，、.。「」]' # punctuation and corner brackets
cond = '(' + '|'.join([c1, c2, c3, c4, c5]) + ')'
re_mora = re.compile(cond)

print(re_mora)

# Examples (a findall call whose result was discarded has been removed).
print(re_mora.findall('ホゲ'))
print(re_mora.findall('キェピャーエィー'))
print(re_mora.findall('ガッキュウホウカイ'))

In [None]:
import numpy as np
from gensim import corpora
from collections import defaultdict
from pprint import pprint

# Nine short example documents.
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

# Lower-case each document, split it on whitespace and drop the stop words.
stoplist = set('for a of the and to in'.split())
texts = []
for document in documents:
    tokens = [word for word in document.lower().split() if word not in stoplist]
    texts.append(tokens)

# Count how often every token occurs over all documents.
frequency = defaultdict(int)
for token in (tok for text in texts for tok in text):
    frequency[token] += 1

# Tokens that occur more than once.
# NOTE(review): texts1 is not used below; the dictionary and the corpus are
# built from the unfiltered `texts` - confirm that this is intended.
texts1 = [[token for token in text if frequency[token] > 1] for text in texts]

dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))


In [None]:
# Data I/O: read the whole of input.txt. The with-block closes the file
# handle, which the original bare open(...).read() left open.
with open('input.txt', 'r') as f:
    data_input = f.read()

# One list of characters per whitespace-separated chunk of the text.
texts = [list(line) for line in data_input.split()]

# Character frequencies over all chunks.
freq = defaultdict(int)
for text in texts:
    for ch in text:
        freq[ch] += 1
print(freq)
dic = corpora.Dictionary()  # empty gensim dictionary

In [None]:
import unicodedata
import os
import sys
import MeCab
import glob

# Load the "Minna no Nihongo" data received from Prof. Iwashita.
# The directory can be overridden with the JLPT_BASE environment variable;
# the default is the author's original local path.
jlpt_base = os.environ.get('JLPT_BASE', '/Users/asakawa/study/2021jlpt')
minnichi_files = sorted(glob.glob(os.path.join(jlpt_base, 'MINNICHI_*.txt')))

# Read the texts: file name without extension -> list of non-empty lines.
minnichi_text = {}
for file in minnichi_files:
    fname = os.path.split(file)[-1].split('.')[0]
    lines = minnichi_text.setdefault(fname, [])

    # The files hold Japanese text; decode them explicitly as UTF-8.
    with open(file, 'r', encoding='utf-8') as f:
        for txt in f:
            txt = txt.strip()
            if len(txt) == 0: continue   # skip blank lines
            # NFC only composes characters canonically. It does not fold
            # full-width symbols to half-width; that would be NFKC, which the
            # author left disabled.
            txt = unicodedata.normalize("NFC", txt)
            lines.append(txt)

print(minnichi_text['MINNICHI_D_005_03'])
#print(minnichi_text['MINNICHI_D_005_03'][5].replace("　","<spc>"))
#minnichi_text['MINNICHI_D_004_02']

In [None]:
%load_ext autoreload
%autoreload 2

# ja_util は 2021 Apr に作った日本語処理関係
import ja_util

In [None]:
#help(ja_util)

In [None]:
# For the first three texts, print each line's reading followed by the
# kana2mora entry of every mora in it.
# The tagger and the tokenizer are built once instead of once per line / per mora.
# NOTE(review): assumes neither object keeps per-call state - confirm.
tagger = MeCab.Tagger('-Oyomi')
wakati = ja_util.mora_wakati()
for fname in list(minnichi_text)[:3]:
    for line in minnichi_text[fname]:
        kata = tagger.parse(line).strip()
        morae = wakati.parse(kata)
        print(f'{kata} ', end=": ")
        for mora in morae:
            print(wakati.kana2mora[mora], end=" ")
        print()
        #print(ja_util.mora_wakati().parse2romaji(kata))

In [None]:
# Write every line of every text to one file, one line of text per output line.
with open('minnichi_all.txt', 'w') as f:
    f.writelines(
        line + '\n'
        for doc in minnichi_text
        for line in minnichi_text[doc]
    )

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Read the whole corpus; the with-block closes the file handle.
with open('minnichi_all.txt', 'r') as f:
    minnichi = f.read()
data = minnichi

# Sort the character set so that the character <-> index mapping is the same
# in every kernel session. Iterating a bare set of strings gives a different
# order from run to run, which makes weights trained in one session unusable
# in another.
chars = sorted(set(data))
n_data, n_vocab = len(data), len(chars)
print(f'#データの総文字数:{n_data}\t文字種:{n_vocab}')
chr2idx = { ch:i for i,ch in enumerate(chars) }
idx2chr = { i:ch for i,ch in enumerate(chars) }

In [None]:
# Collect the dataset description in a single dictionary.
params = dict(
    n_data=n_data,
    n_vocab=n_vocab,
    chr2idx=chr2idx,
    idx2chr=idx2chr,
)

In [None]:
# Hyperparameters: hidden-layer size, number of unrolled time steps, learning rate.
n_hid, seq_len, lr = 100, 25, 1e-1

# Record them in params (inserted in the order n_hid, seq_len, lr).
params.update(n_hid=n_hid, seq_len=seq_len, lr=lr)

# Weight matrices start as small Gaussian noise, biases as zeros.
# The three random draws are made in the order Wxh, Whh, Why.
Wxh = 0.01 * np.random.randn(n_hid, n_vocab)   # input -> hidden
Whh = 0.01 * np.random.randn(n_hid, n_hid)     # hidden -> hidden
Why = 0.01 * np.random.randn(n_vocab, n_hid)   # hidden -> output
bh, by = np.zeros((n_hid, 1)), np.zeros((n_vocab, 1))   # hidden bias, output bias

In [None]:
# Print the keys currently stored in params.
print(list(params))

In [None]:
def loss_f(inputs, targets, hprev):
    """
    Run one forward and backward pass of the RNN over a sequence.

    Reads the module-level parameters Wxh, Whh, Why, bh, by and n_vocab.

    Args:
        inputs, targets: sequences of int token indices of equal length.
        hprev: (H, 1) array (H rows x 1 column) holding the initial hidden
            state.

    Returns:
        Tuple (loss, dWxh, dWhh, dWhy, dbh, dby, h_last): the cross-entropy
        loss summed over the steps, the gradients of the five parameters
        (each clipped element-wise to [-5, 5]), and the hidden state after
        the last step (a copy of hprev when inputs is empty).
    """
    x_state, h_state, prob_state = {}, {}, {}
    h_state[-1] = np.copy(hprev)
    loss = 0

    # forward pass
    for t in range(len(inputs)):
        x_state[t] = np.zeros((n_vocab, 1))  # one-hot (1-of-k) encoding of the input token
        x_state[t][inputs[t]] = 1
        h_state[t] = np.tanh(np.dot(Wxh, x_state[t]) + np.dot(Whh, h_state[t-1]) + bh)  # hidden state
        y = np.dot(Why, h_state[t]) + by     # unnormalised log probabilities
        # Subtract the largest logit before exponentiating: the softmax value
        # is unchanged, but np.exp can no longer overflow to inf (-> NaN).
        exp_y = np.exp(y - np.max(y))
        prob_state[t] = exp_y / np.sum(exp_y)
        loss += -np.log(prob_state[t][targets[t], 0])  # cross-entropy

    # backward pass: accumulate gradients from the last step to the first
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # Shaped after the initial state so that an empty sequence does not fail
    # on a missing h_state[0].
    dhnext = np.zeros_like(h_state[-1], dtype=float)
    for t in reversed(range(len(inputs))):
        # gradient of softmax + cross-entropy w.r.t. the logits: p - onehot(target)
        dy = np.copy(prob_state[t])
        dy[targets[t]] -= 1
        dWhy += np.dot(dy, h_state[t].T)
        dby += dy

        # into the hidden state: from the output layer and from step t+1
        dh = np.dot(Why.T, dy) + dhnext

        # through the tanh non-linearity
        delta = (1 - h_state[t] * h_state[t]) * dh
        dbh += delta
        dWxh += np.dot(delta, x_state[t].T)
        dWhh += np.dot(delta, h_state[t-1].T)
        dhnext = np.dot(Whh.T, delta)

    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, h_state[len(inputs)-1]

def sample(hprev, seed, n):
    """
    Sample a sequence of token indices from the model.

    Reads the module-level parameters Wxh, Whh, Why, bh, by and n_vocab.

    Args:
        hprev: (H, 1) initial hidden (memory) state.
        seed: index of the token fed in at the first time step.
        n: number of tokens to draw.

    Returns:
        List of the n sampled indices; the seed itself is not included.
    """
    x = np.zeros((n_vocab, 1))
    x[seed] = 1
    hid = hprev
    idxes = []
    for _ in range(n):
        hid = np.tanh(np.dot(Wxh, x) + np.dot(Whh, hid) + bh)
        out = np.dot(Why, hid) + by
        # Shift by the largest logit so np.exp cannot overflow; the resulting
        # distribution is the same softmax.
        exp_out = np.exp(out - np.max(out))
        prob = exp_out / np.sum(exp_out)
        idx = np.random.choice(range(n_vocab), p=prob.ravel())
        # the drawn token becomes the next input
        x = np.zeros((n_vocab, 1))
        x[idx] = 1
        idxes.append(idx)
    return idxes

In [None]:
# Data pointer: offset into `data` of the next training window.
pos = 0

# Adagrad memories, one zero array per parameter and of the same shape:
#   mWxh - input -> hidden weights
#   mWhh - recurrent hidden -> hidden weights
#   mWhy - hidden -> output weights
#   mbh  - hidden bias
#   mby  - output bias
mWxh, mWhh, mWhy, mbh, mby = (
    np.zeros_like(p) for p in (Wxh, Whh, Why, bh, by)
)

# Loss at time 0: a uniform prediction over n_vocab symbols for seq_len steps.
smooth_loss = -np.log(1.0/n_vocab) * seq_len

In [None]:
# Train the character-level RNN with Adagrad, sweeping over `data` from left
# to right in consecutive windows of seq_len characters.
max_iter = 10 ** 4
interval = max_iter >> 2   # a quarter of the run between reports
losses = []
for itr in range(max_iter):

    # On the first iteration, or when the next window would run past the end
    # of the text, go back to the start and reset the RNN memory.
    if itr == 0 or pos + seq_len + 1 >= len(data):
        hprev = np.zeros((n_hid,1))
        pos = 0

    # Inputs are seq_len characters; targets are the same window shifted by one.
    window = data[pos:pos + seq_len + 1]
    inputs = [chr2idx[ch] for ch in window[:seq_len]]
    targets = [chr2idx[ch] for ch in window[1:]]

    report = itr % interval == 0

    # sample from the model now and then
    if report:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join(map(idx2chr.__getitem__, sample_ix))
        print(f'--- 反復訓練数={itr} ---\n{txt}\n---')

    # forward the window through the net and fetch the gradients
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = loss_f(inputs, targets, hprev)
    losses.append(loss)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    if report:
        print(f'反復学習回数:{itr:05d} 損失値:{smooth_loss:.3f}')  # progress

    # Adagrad: accumulate squared gradients, then scale each step by their root.
    for weights, grad, cache in zip((Wxh, Whh, Why, bh, by),
                                    (dWxh, dWhh, dWhy, dbh, dby),
                                    (mWxh, mWhh, mWhy, mbh, mby)):
        cache += grad * grad
        weights -= lr * grad / np.sqrt(cache + 1e-8)

    pos += seq_len  # move the data pointer


# After training: reset the memory, sample 200 characters, plot the loss curve.
hprev = np.zeros((n_hid,1))
sample_ix = sample(hprev, inputs[0], 200)
txt = ''.join(map(idx2chr.__getitem__, sample_ix))
print(f'----\n{txt}\n----')
plt.plot(losses)

In [None]:

# Replace the five parameters with arrays saved on disk, then sample from them.
# NOTE(review): presumably these weights were trained with a particular
# character <-> index mapping and hidden size; confirm that chr2idx/idx2chr
# and n_hid match before trusting the sampled text.
Wxh, Whh, Why, bh, by = (
    np.load(path)
    for path in ('2021_0426Wxh.npy',
                 '2021_0426Whh.npy',
                 '2021_0426Why.npy',
                 '2021_0426bh.npy',
                 '2021_0426by.npy')
)

hprev = np.zeros((n_hid,1))  # reset RNN memory
sample_ix = sample(hprev, inputs[0], 200)
txt = ''.join(map(idx2chr.__getitem__, sample_ix))
print(f'----\n{txt}\n----')

In [None]:
# Display the character used as the sampling seed (first index of `inputs`).
idx2chr[inputs[0]]

- source: <https://levelup.gitconnected.com/8-built-in-functions-every-python-programmer-should-know-3552eb768894>

1. hash()
The `hash()` method is used to return the hash value of an object if it has one. 
Hash values are integer numbers that are used to compare dictionary keys during a dictionary lookup.


2. map()
The `map()` function takes a function and an iterable as input and applies the function to each item of the iterable.

6. ord()
This function is used to return the Unicode code point of a given character. 
The ord() function takes a character as input and then returns an integer number representing the given input character’s Unicode code point.

7. dir()
`dir()` is a powerful Python built-in function that returns a list of the valid attributes of the specified object.
It returns all the properties, even built-in properties that are the default for all objects.

In [None]:
import numpy as np
import sys
import os

# Set the number of digits shown when floats in arrays are printed.
#np.set_printoptions(suppress=False, formatter={'float': '{:7.4f}'.format})
np.set_printoptions(suppress=False, formatter={'float': '{:6.3f}'.format})

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import SVG, display
#display(SVG(filename='../figures/2018Roelofs_fig1.svg'))
#display(SVG(filename='../figures/2018Roelofs_fig3.svg'))

#print('概念モデル')
#display(SVG(url='https://raw.githubusercontent.com/project-ccap/project-ccap.github.io/master/figures/2018Roelofs_fig3.svg'))
