<a href="https://colab.research.google.com/github/ShinAsakawa/ShinAsakawa.github.io/blob/master/2022notebooks/2021_0709torch_rnn_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RNN デモ

- date: 2021_0709
- filename: 2021_0709torch_rnn_pynb.ipynb
- author: 浅川伸一
- 概要: オノマトペの音韻表現（ローマ字化した音素列）を RNN 系の言語モデルで学習するデモ

In [None]:
# 形態素分析ライブラリーMeCab と 辞書(mecab-ipadic-NEologd)のインストール 
# reference: https://qiita.com/jun40vn/items/78e33e29dce3d50c2df1
!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab #> /dev/null
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git #> /dev/null 
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n #> /dev/null 2>&1
!pip install mecab-python3 # > /dev/null

# シンボリックリンクによるエラー回避
!ln -s /etc/mecabrc /usr/local/etc/mecabrc

!pip install --upgrade xlrd

In [None]:
# Fetch the required files: the two helper modules and the onomatopoeia spreadsheet.
!wget https://raw.githubusercontent.com/ShinAsakawa/ShinAsakawa.github.io/master/2021code/ja_util.py -O ja_util.py
!wget https://raw.githubusercontent.com/ShinAsakawa/ShinAsakawa.github.io/master/2021code/torch_rnn.py -O torch_rnn.py
!wget https://raw.githubusercontent.com/ShinAsakawa/ShinAsakawa.github.io/master/2021code/2021-0325日本語オノマトペ辞典4500より.xls -O 2021-0325日本語オノマトペ辞典4500より.xls

# note: raw GitHub URLs have the form
# https://raw.github.com/<username>/<repo>/<branch>/some_directory/file.rb

!pip install jaconv

# These imports only succeed after the downloads above have written the .py files.
import ja_util
import torch_rnn

In [None]:
import os
import sys
import numpy as np
# Seed NumPy's global RNG; the np.random.permutation shuffle used for the
# train/valid/test split later in the notebook draws from it.
np.random.seed(42)

import time
import jaconv

import torch
import torch.nn

In [None]:
from google.colab import files
files.upload()  # Upload the file from your own PC.

In [None]:
import pandas as pd
import os

# NOTE: '日本語オノマトペ辞典4500より.xls' cannot be made public for copyright reasons.
# The workaround on Google Colab is to upload it from a local file:
#from google.colab import files
#uploaded = files.upload()  # select `日本語オノマトペ辞典4500より.xls` here to upload it
ccap_base = '.'
onomatopea_excel = '2021-0325日本語オノマトペ辞典4500より.xls'
onmtp2761 = pd.read_excel(os.path.join(ccap_base, onomatopea_excel), sheet_name='2761語')

# De-duplicate first and sort last. The previous list(set(sorted(...))) threw the
# sort away, leaving the words in set-iteration order, which for strings varies
# between interpreter runs. Everything derived from this list (including the
# seeded shuffle and the train/valid/test split) was therefore not reproducible.
onomatopea = sorted(set(onmtp2761['オノマトペ']))

In [None]:
# Data preparation: build two lookup tables, one keyed by the original word and
# one keyed by the word's concatenated romaji string.
mora_dict = {}
roman_dict = {}
for word in onomatopea:
    mora = ja_util.mora_wakati().parse(word)
    roman = ja_util.mora_wakati().parse2romaji(word)
    roman_phon = "".join(roman)
    if len(mora) == 0:
        # parse() produced nothing for this word; leave it out of both tables.
        continue
    mora_dict[word] = dict(mora=mora, roman=roman, roman_phon=roman_phon)
    roman_dict[roman_phon] = dict(word=word, mora=mora, roman=roman)

class Dictionary(object):
    """Bidirectional mapping between tokens and consecutive integer indices."""

    def __init__(self):
        self.wrd2idx = {}  # token -> index
        self.idx2wrd = []  # index -> token (position in the list is the index)

    def add_word(self, word):
        """Register ``word`` if it is new and return its index.

        A new token receives the next free index (the current vocabulary size);
        a known token keeps the index it was first given.
        """
        index = self.wrd2idx.get(word)
        if index is None:
            index = len(self.idx2wrd)
            self.idx2wrd.append(word)
            self.wrd2idx[word] = index
        return index

    def __len__(self):
        """Return the number of registered tokens."""
        return len(self.idx2wrd)


class onomatopea_Corpus(object):
    """Tokenized train/valid/test splits sharing one symbol dictionary.

    Each dataset is an iterable of words; each word is iterated symbol by
    symbol. Index 0 of the dictionary is reserved for the end-of-word marker
    ``'<eow>'``.

    Attributes:
        dictionary: the shared ``Dictionary`` instance.
        train, valid, test: 1-D ``torch.int64`` tensors of token ids, with an
            ``'<eow>'`` id appended after every word.
    """

    def __init__(self, train_dataset, valid_dataset, test_dataset):
        self.dictionary = Dictionary()
        self.dictionary.add_word('<eow>')

        self.train = self.tokenize(train_dataset)
        self.valid = self.tokenize(valid_dataset)
        self.test = self.tokenize(test_dataset)

    def tokenize(self, dataset):
        """Convert a dataset into one flat tensor of token ids.

        Args:
            dataset: iterable of words that can be iterated twice (e.g. a list).

        Returns:
            A 1-D ``torch.int64`` tensor holding, for every word in order, the
            ids of its symbols followed by the ``'<eow>'`` id. An empty dataset
            yields an empty tensor.
        """
        # First pass: make sure every symbol has an index.
        for word in dataset:
            for ch in word:
                self.dictionary.add_word(ch)

        # Second pass: collect all ids in a flat list and build the tensor once.
        # (Concatenating tensors inside the loop re-copied everything per word.)
        wrd2idx = self.dictionary.wrd2idx
        eow = wrd2idx['<eow>']
        ids = []
        for word in dataset:
            ids.extend(wrd2idx[ch] for ch in word)
            ids.append(eow)
        return torch.tensor(ids, dtype=torch.int64)

onmtp_phon = [mora_dict[x]['roman_phon'] for x in mora_dict]

# Shuffle the data (uses NumPy's global RNG).
onmtp_phon = list(np.random.permutation(onmtp_phon))

# Split into ten parts and use them 8:1:1 as train / valid / test.
split = len(onmtp_phon) // 10
ono_test  = onmtp_phon[:split]
ono_valid = onmtp_phon[split:2*split]
ono_train = onmtp_phon[2*split:]

# The constructor takes (train, valid, test). Passing ono_train as the validation
# set made model selection and learning-rate annealing run on training data, and
# left ono_test unused.
onomatopea_corpus = onomatopea_Corpus(ono_train, ono_valid, ono_test)

In [None]:
# Sanity check of the data prepared above: show the first n items.
n = 10
print([(roman_dict[word]['word'],roman_dict[word]['roman']) for word in ono_train[:n]])

# The words shown come from ono_train, so read the token ids from the train split
# as well. .tolist() yields plain ints, so each entry prints as "3:a" rather than
# "tensor(3):a".
for idx in onomatopea_corpus.train[:n].tolist():
    print(f'{idx}:{onomatopea_corpus.dictionary.idx2wrd[idx]}', end=" ")
print()  # end the token line before the next list is printed

print(onomatopea[:n])
print(ono_train[:n])



In [None]:
# -*- coding: utf-8 -*-
# source: study/2020pytorch_examples.git/word_language_model/train.py
import torch_rnn
import math
import random

class onomatopea_rnn_model(object):
    """Train and evaluate a next-token language model on a tokenized corpus.

    The network itself comes from the ``torch_rnn`` module
    (``torch_rnn.RNNModel`` or ``torch_rnn.TransformerModel``); this class
    handles batching, the training loop, validation, checkpointing and testing.

    Options available in the exercises below:

    * corpus:
        Object exposing ``dictionary`` (supports ``len``) and the token
        tensors ``train``, ``valid`` and ``test``.

    * model_name: str
        Model to compare. Default 'LSTM'. Choices:
        - LSTM
        - GRU
        - RNN_TANH
        - RNN_RELU
        - Transformer

    * emsize: int
        Embedding dimension. Default 32.

    * nhid: int
        Number of hidden units. Default 32.

    * nlayers: int
        Number of layers. Default 2.

    * lr: float
        Learning rate. Default 20. Divided by 4 after every epoch in which
        the validation loss does not improve.

    * clip: float
        Gradient-norm clipping threshold against exploding gradients. Default 1.

    * batch_size: int
        Number of parallel token streams the data is divided into. Default 32.

    * epochs: int
        Number of training epochs. Default 1000.

    * bptt: int
        Maximum chunk length processed per optimization step. Default 10.

    * dropout: float
        Dropout rate. Default 0.

    * tied: boolean
        Whether input and output mappings share the same weights. Default False.

    * seed: int
        Random seed, for reproducible results. Default 111.

    * log_interval: int
        Number of epochs between progress reports. When omitted (or 0) a
        report is printed every ``epochs >> 2`` epochs (at least every epoch).

    * save_filename: string
        File the best model is saved to. Default 'onomatopea_model.pt'.

    * nhead: int
        Number of attention heads. Default 2. Used by the Transformer only.

    * cuda: boolean
        Run on the "cuda" device instead of "cpu". Default False.

    * dry_run: boolean
        For debugging: stop each training epoch after the first chunk.
    """

    def __init__(self, corpus, model_name='LSTM', 
                 emsize=32, nhid=32, nlayers=2, lr=20, clip=1,
                 batch_size=32, epochs=10 ** 3, bptt=10, dropout=0,
                 tied=False, seed=111, log_interval=None,
                 save_filename='onomatopea_model.pt',
                 nhead=2, cuda=False, dry_run=False):
        # Seed every RNG this notebook uses so runs are repeatable.
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

        self.corpus = corpus
        self.device = torch.device("cuda" if cuda else "cpu")

        # Batchify with the requested batch size. The data layout must agree
        # with the batch size later passed to init_hidden(); it used to be
        # hard-coded to 32 here, which broke every other batch_size.
        self.batch_size = batch_size
        self.train_data = self.batchify(self.corpus.train, self.batch_size)
        self.val_data = self.batchify(self.corpus.valid, self.batch_size)
        self.test_data = self.batchify(self.corpus.test, self.batch_size)

        # Both spellings are kept as attributes for backward compatibility.
        self.n_tokens = len(self.corpus.dictionary)
        self.ntokens = self.n_tokens
        print(f'n_tokens:{self.n_tokens}')

        self.emsize = emsize
        self.nhid = nhid
        self.nlayers = nlayers
        self.lr = lr
        self.clip = clip
        self.epochs = epochs
        self.bptt = bptt
        self.dropout = dropout
        self.tied = tied
        self.seed = seed
        self.cuda = cuda
        # Honour an explicit log_interval; otherwise keep the reporting cadence
        # train() has always used (epochs >> 2), clamped to at least 1 so that
        # small epoch counts cannot produce a modulo-by-zero.
        if not log_interval:
            self.log_interval = max(1, self.epochs >> 2)
        else:
            self.log_interval = log_interval
        self.save_filename = save_filename
        self.nhead = nhead
        self.dry_run = dry_run
        self.model_name = model_name
        print(f'model_name:{model_name}')

        if self.model_name == 'Transformer':
            self.model = torch_rnn.TransformerModel(self.n_tokens, self.emsize, 
                                                    self.nhead, self.nhid, self.nlayers, 
                                                    self.dropout).to(self.device)
        else:
            self.model = torch_rnn.RNNModel(self.model_name, self.n_tokens, self.emsize, 
                                            self.nhid, self.nlayers, self.dropout, 
                                            self.tied).to(self.device)

        self.criterion = torch.nn.NLLLoss()


    def batchify(self, data, bsz):
        """Reshape a 1-D token tensor into ``bsz`` parallel columns.

        Trailing tokens that do not fill a complete row are dropped. The
        result has shape ``(len(data) // bsz, bsz)`` and lives on ``self.device``.
        """
        # Work out how cleanly we can divide the dataset into bsz parts.
        nbatch = data.size(0) // bsz
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * bsz)
        # Evenly divide the data across the bsz batches.
        data = data.view(bsz, -1).t().contiguous()
        return data.to(self.device)


    def repackage_hidden(self, h):
        """Wraps hidden states in new Tensors, to detach them from their history.

        Accepts a single tensor or an arbitrarily nested iterable of tensors;
        nested containers are returned as tuples.
        """
        if isinstance(h, torch.Tensor):
            return h.detach()
        return tuple(self.repackage_hidden(v) for v in h)


    def get_batch(self, source, i):
        """
        get_batch subdivides the source data into chunks of length bptt.
        If source is equal to the example output of the batchify function, with
        a bptt-limit of 2, we'd get the following two Variables for i = 0:
        ┌ a g m s ┐ ┌ b h n t ┐
        └ b h n t ┘ └ c i o u ┘
        Note that despite the name of the function, the subdivison of data is not
        done along the batch dimension (i.e. dimension 1), since that was handled
        by the batchify function. The chunks are along dimension 0, corresponding
        to the seq_len dimension in the LSTM.

        Returns ``(data, target)``: ``data`` is ``source[i:i+seq_len]`` and
        ``target`` is the same window shifted by one row, flattened to 1-D.
        """
        seq_len = min(self.bptt, len(source) - 1 - i)
        data = source[i:i+seq_len]
        target = source[i+1:i+1+seq_len].view(-1)
        return data, target


    def evaluate(self, dataset):
        """Return the mean loss of the current model over a batchified dataset.

        Each chunk's loss is weighted by its length and the sum is divided by
        ``len(dataset) - 1``. Runs without gradient tracking.
        """
        # Turn on evaluation mode which disables dropout.
        self.model.eval()
        self.total_loss = 0.
        is_transformer = self.model_name == 'Transformer'
        if not is_transformer:
            self.hidden = self.model.init_hidden(self.batch_size)
        with torch.no_grad():
            for i in range(0, dataset.size(0) - 1, self.bptt):
                data, targets = self.get_batch(dataset, i)
                if is_transformer:
                    self.output = self.model(data)
                    self.output = self.output.view(-1, self.n_tokens)
                else:
                    self.output, self.hidden = self.model(data, self.hidden)
                    self.hidden = self.repackage_hidden(self.hidden)
                self.total_loss += len(data) * self.criterion(self.output, targets).item()
        return self.total_loss / (len(dataset) - 1)


    def train(self):
        """Run the full training loop.

        After each epoch the validation loss is computed. An improvement saves
        the model to ``self.save_filename``; otherwise the learning rate is
        divided by 4. Progress is printed every ``self.log_interval`` epochs.
        """
        self.best_val_loss = None
        # Kept as an attribute for backward compatibility; never zero.
        self.interval = max(1, self.log_interval)
        epoch_start_time = time.time()
        for epoch in range(1, self.epochs+1):
            self._train(epoch)
            val_loss = self.evaluate(self.val_data)
            if epoch % self.interval == 0:
                time_elapsed = time.time() - epoch_start_time
                print(f'エポック: {epoch:>3d} ',
                      f'(経過時間:{time_elapsed:5.2f}s) ',
                      f'検証データ損失:{val_loss:5.2f} ',
                      f' 検証データ錯綜度(パープレキシティ):{math.exp(val_loss):8.2f}')
                epoch_start_time = time.time()
            # Save the model if the validation loss is the best we've seen so far.
            # Compare against None explicitly: a best loss of exactly 0.0 is
            # falsy and must not be mistaken for "no best yet".
            if self.best_val_loss is None or val_loss < self.best_val_loss:
                with open(self.save_filename, 'wb') as f:
                    torch.save(self.model, f)
                self.best_val_loss = val_loss
            else:
                # Anneal the learning rate if no improvement has been seen in the validation dataset.
                self.lr /= 4.0


    def test_model(self):
        """Reload the best saved model and report loss/perplexity on the test data.

        The loss is stored in ``self.test_loss`` and printed.
        """
        # Load the best saved model. The checkpoint is a whole pickled module
        # written by train(), so it needs weights_only=False on PyTorch releases
        # where torch.load defaults to weights_only=True. Releases that predate
        # the argument raise TypeError, in which case the plain call is used.
        with open(self.save_filename, 'rb') as f:
            try:
                self.model = torch.load(f, weights_only=False)
            except TypeError:
                f.seek(0)
                self.model = torch.load(f)
            # after load the rnn params are not a continuous chunk of memory
            # this makes them a continuous chunk, and will speed up forward pass
            # Currently, only rnn model supports flatten_parameters function.
            if self.model_name in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
                self.model.rnn.flatten_parameters()

        # Run on test data.
        self.test_loss = self.evaluate(self.test_data)

        # The result lives on self; the bare name `test_loss` was undefined here.
        print(f'テストデータ損失 {self.test_loss:5.2f} ',
              f' | テストデータ錯綜度(パープレキシティ) {math.exp(self.test_loss):8.2f}')


    def _train(self, epoch):
        """Run one training epoch with plain SGD and gradient-norm clipping.

        ``epoch`` is accepted for the caller's convenience and is not used.
        The summed chunk losses are left in ``self.total_loss``.
        """
        # Turn on training mode which enables dropout.
        self.model.train()
        self.total_loss = 0.
        self.start_time = time.time()
        is_transformer = self.model_name == 'Transformer'
        if not is_transformer:
            self.hidden = self.model.init_hidden(self.batch_size)
        for i in range(0, self.train_data.size(0) - 1, self.bptt):
            data, targets = self.get_batch(self.train_data, i)
            self.model.zero_grad()
            if is_transformer:
                self.output = self.model(data)
                self.output = self.output.view(-1, self.n_tokens)
            else:
                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                self.hidden = self.repackage_hidden(self.hidden)
                self.output, self.hidden = self.model(data, self.hidden)
            loss = self.criterion(self.output, targets)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
            for p in self.model.parameters():
                # Parameters that took no part in this step have no gradient.
                if p.grad is not None:
                    p.data.add_(p.grad, alpha=-self.lr)

            self.total_loss += loss.item()

            if self.dry_run:
                break
                

In [None]:
# See the class definition above for the meaning of the training parameters.
model1 = onomatopea_rnn_model(
    onomatopea_corpus,
    model_name='LSTM',
    lr=1,
    epochs=100,
    emsize=32,
    nhid=32,
    nlayers=2,
)
model1.train()