<a href="https://colab.research.google.com/github/ShinAsakawa/ShinAsakawa.github.io/blob/master/2023notebooks/2023_0302terao_speech_erros_vdrj20k_h64.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 寺尾先生の言い誤りデータ `成人　音交換　機械学習.xlsx` を用いて，事前訓練済みモデルを言い誤りデータで微調整

* date: 2023_0306
* filename: 2023_0302terao_speech_erros_vdrj20k_h64.ipynb

## 0.1. 下準備

In [None]:
# Render inline matplotlib figures at retina resolution
%config InlineBackend.figure_format = 'retina'
import torch
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from itertools import chain

from IPython import get_ipython
# True when the string form of the running IPython shell mentions google.colab
isColab =  'google.colab' in str(get_ipython())

import math
import random
import numpy as np
import time
import gzip
import json
import sys
import re
import json

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Install jaconv on demand when it is not importable
try:
    import jaconv
except ImportError:
    !pip install jaconv
    import jaconv

if isColab:

    # On Colab, text is not colorized unless termcolor is downgraded
    !pip install --upgrade termcolor==1.1
    import termcolor

    # Mount Google Drive to save results (currently disabled)
    #import google.colab
    #google.colab.drive.mount('/content/drive/')

    # Show GPU information (currently disabled)
    #!nvidia-smi -L
    #!pip install ipynbname --upgrade > /dev/null
    !pip install japanize_matplotlib

if isColab:
    # To run MeCab on Colab, the C compiler is invoked to build MeCab,
    # so this step takes a while.
    !apt install aptitude
    !aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
    !pip install mecab-python3==0.7
    !pip install jaconv
    
    import MeCab
    # Bound `parse` methods of two taggers: word segmentation and reading (yomi)
    mecab_wakati = MeCab.Tagger('-Owakati').parse
    mecab_yomi = MeCab.Tagger('-Oyomi').parse
    
else:
    # Outside Colab, the same two callables come from the local ccap package
    from ccap.mecab_settings import yomi as mecab_yomi
    from ccap.mecab_settings import wakati as mecab_wakati


# ここから下は，コード実行に関するバージョン情報などの情報源の取得と表示\n",
from termcolor import colored
import platform
HOSTNAME = platform.node().split('.')[0]

import os
HOME = os.environ['HOME']

try:
    import ipynbname
except ImportError:
    !pip install ipynbname
    import ipynbname
FILEPATH = str(ipynbname.path()).replace(HOME+'/','')

import pwd
USER=pwd.getpwuid(os.geteuid())[0]

from datetime import date
TODAY=date.today()

import torch
TORCH_VERSION = torch.__version__

color = 'green'
print('日付:',colored(f'{TODAY}', color=color, attrs=['bold']))
print('HOSTNAME:',colored(f'{HOSTNAME}', color=color, attrs=['bold']))
print('ユーザ名:',colored(f'{USER}', color=color, attrs=['bold']))
print('HOME:',colored(f'{HOME}', color=color,attrs=['bold']))
print('ファイル名:',colored(f'{FILEPATH}', color=color, attrs=['bold']))
print('torch.__version__:',colored(f'{TORCH_VERSION}', color=color, attrs=['bold']))

# 自作ライブラリの読み込み
if isColab:
    !git clone https://github.com/ShinAsakawa/RAM.git 


## 1. `RAM/terao_speech_error` データセットの読み込み

事前訓練済みモデルのパラメータは `RAM/2023_0302vdrj_20k_p2p_h64.pt` に保存されている

In [10]:
# Utilities that build the simulation setup (parameters, models, dataset)
from RAM import set_params_from_file
from RAM import set_params_from_config

from termcolor import colored

# Parameters required by the simulation
configs = {
    'dataset_name'  : 'vdrj',   # ['pyslex71', 'vdrj', 'onechar', 'fushimi1999']
    'traindata_size':  10000,    # number of training items (vocabulary size)
    'traindata_ratio': 0.9,     # train/validation split ratio; ignored for the onechar dataset
    'stop_list': None,
    'epochs': 100,               # number of training epochs
    'lr': 1e-3,                       # learning rate
    
    # Setting `source` and `target` below selects a different task
    'source': 'phon',          # ['orth', 'phon']
    'target': 'phon',          # ['orth', 'phon']
    'hidden_size': 64,        # number of hidden units

    'dropout_p': 0.0,                 # dropout rate
    'teacher_forcing_ratio': 0.5,     # probability of applying teacher forcing
    'optim_func': "torch.optim.Adam",   # optimizer ['torch.optim.Adam', 'torch.optim.SGD', 'torch.optim.AdamW']
    'loss_func' :"torch.nn.NLLLoss",  # negative log-likelihood loss ['torch.nn.NLLLoss()', or 'torch.nn.CrossEntropyLoss()']
    #'loss_func' :torch.nn.NLLLoss(),

    'random_seed': 42,          # RNG seed (a nod to Douglas Adams' "The Hitchhiker's Guide to the Galaxy")
    'pretrained': 'RAM/2023_0302vdrj_20k_p2p_h64.pt',
    #'isTrain'   : True,       # train when True
    'verbose'   : True,
    
    # File name under which the trained model parameters are saved
    'path_saved': '2023_0302tera_speech_errors_vdrj_20k_p2p_h64.pt', 
}


X = set_params_from_config(configs=configs, device=device)
# From here on `configs` is the parameter dict returned by the library,
# not the literal defined above.
configs = X['params']
# The setup above was built with dataset_name 'vdrj'; the name is switched
# afterwards to the speech-error dataset used for fine-tuning.
configs['dataset_name'] = 'terao_speech_error'
encoder = X['encoder']
decoder = X['decoder']
ds = X['dataset']
configs['dataset'] = ds

# Seed every RNG this notebook has in scope, not only torch, so that a
# Restart & Run All is repeatable.
# NOTE(review): this only helps if RAM draws from `random` / NumPy (e.g. for
# teacher forcing) - confirm against the library.
random.seed(configs['random_seed'])
np.random.seed(configs['random_seed'])
torch.manual_seed(configs['random_seed'])

# encoder_optimizer = X['encoder_optimizer']
# decoder_optimizer = X['decoder_optimizer']
N_train = X['N_train']
N_val   = X['N_val']

# Show the effective configuration, one "key value" line per entry
for k, v in sorted(configs.items()):
    print(k, colored(v, 'blue', attrs=['bold']))

dataset [1m[34m<RAM.dataset.VDRJ_Dataset object at 0x7fa444e2e850>[0m
dataset_name [1m[34mterao_speech_error[0m
dropout_p [1m[34m0.0[0m
epochs [1m[34m100[0m
hidden_size [1m[34m64[0m
loss_func [1m[34mNLLLoss()[0m
lr [1m[34m0.001[0m
optim_func [1m[34m<class 'torch.optim.adam.Adam'>[0m
path_saved [1m[34m2023_0302tera_speech_errors_vdrj_20k_p2p_h64.pt[0m
pretrained [1m[34mRAM/2023_0302vdrj_20k_p2p_h64.pt[0m
random_seed [1m[34m42[0m
source [1m[34mphon[0m
stop_list [1m[34mNone[0m
target [1m[34mphon[0m
teacher_forcing_ratio [1m[34m0.5[0m
traindata_ratio [1m[34m0.9[0m
traindata_size [1m[34m10000[0m
verbose [1m[34mTrue[0m


In [11]:
# Load the speech-error dataset used for fine-tuning and evaluation below
from RAM import terao_speech_error_dataset
terao_se_ds = terao_speech_error_dataset()    

In [None]:
from RAM import save_model_and_configs
from RAM import EncoderRNN, AttnDecoderRNN
from RAM import eval_input_seq2seq


def _build_pretrained_model(checkpoint):
    """Create one encoder/decoder pair initialised from `checkpoint`.

    Args:
        checkpoint: mapping with 'encoder' and 'decoder' state dicts, as
            returned by `torch.load` on the pretrained file.

    Returns:
        (encoder, decoder, encoder_optimizer, decoder_optimizer). Both
        modules live on `device`; each Adam optimizer covers all parameters
        of its module with learning rate configs['lr'].
    """
    enc = EncoderRNN(
        n_inp=len(ds.source_list),                # number of input features (source vocabulary size): int
        n_hid=configs['hidden_size']).to(device)  # hidden size, also the size handed to the decoder: int

    dec = AttnDecoderRNN(
        n_hid=configs['hidden_size'],             # decoder hidden size: int
        n_out=len(ds.target_list),                # decoder output size (target vocabulary size): int
        dropout_p=configs['dropout_p'],
        max_length=ds.maxlen).to(device)

    enc_opt = torch.optim.Adam(params=enc.parameters(), lr=configs['lr'])
    dec_opt = torch.optim.Adam(params=dec.parameters(), lr=configs['lr'])
    enc.load_state_dict(checkpoint['encoder'])
    dec.load_state_dict(checkpoint['decoder'])
    return enc, dec, enc_opt, dec_opt


# map_location makes the load work when the checkpoint was saved on a
# different device than the current one (e.g. CUDA tensors on a CPU-only host).
# The file is read once and shared by all three models.
X = torch.load(configs['pretrained'], map_location=device)

# model0: every parameter is fine-tuned
encoder0, decoder0, encoder0_optimizer, decoder0_optimizer = _build_pretrained_model(X)
#_ = eval_input_seq2seq(encoder=encoder0, decoder=decoder0, ds=ds)

# model1: starts from the same weights; parts of it are frozen in a later cell
encoder1, decoder1, encoder1_optimizer, decoder1_optimizer = _build_pretrained_model(X)
#_ = eval_input_seq2seq(encoder=encoder1, decoder=decoder1, ds=ds)

# model2: starts from the same weights; parts of it are frozen in a later cell
encoder2, decoder2, encoder2_optimizer, decoder2_optimizer = _build_pretrained_model(X)
#_ = eval_input_seq2seq(encoder=encoder2, decoder=decoder2, ds=ds)

# Baseline before fine-tuning: feed every hiragana input of the speech-error
# set to model0 and list the items whose output already equals the recorded
# (erroneous) target.
_ds = terao_se_ds
inputs = [entry['ひら'] for entry in _ds.data_dict.values()]
counter = 0
for i, inp in enumerate(inputs):
    tgt = ds.target_ids2tkn(_ds[i][-1])
    out = eval_input_seq2seq(encoder=encoder0, decoder=decoder0, ds=ds, inp_wrd=inp, isPrint=False)
    yesno = out[0] == tgt
    if not yesno:
        color = 'red'
        continue
    color = 'blue'
    counter += 1
    # The last token of each sequence is dropped for display
    # (presumably an end-of-word marker - TODO confirm)
    produced = "".join(out[0][:-1])
    expected = "".join(tgt[:-1])
    print(f'{i:3d}: {inp}->/{produced}/',
          f'{colored(yesno, color,attrs=["bold"])}',
          f' tgt:{expected}')

n_items = len(_ds.data_dict)
p = counter / n_items
print(f'counter:{counter}/{n_items} = {p * 100:6.2f} 正しくいい間違えた割合%')

### 議論: 下の最後の出力の最後の数値が語彙判断課題のシミュレーションに使えるのかもしれない，という妄想はどう思うか？


In [None]:
# Decode with model0 and show the output tokens (last one dropped) together
# with exp() of the second return value
# (presumably log-probabilities - TODO confirm against RAM).
outwrd, l = eval_input_seq2seq(encoder=encoder0, decoder=decoder0, ds=ds, isPrint=False)
_tokens = " ".join(outwrd[:-1])
_scores = np.exp(np.array(l))
print(_tokens, _scores)

## 0 model0 (encoder0, decoder0) を訓練
`model0` は，純粋微調整

In [None]:
from RAM import eval_input_seq2seq, train_epochs

# Fine-tune model0 (encoder0, decoder0) on the speech-error data.
# The same dataset is passed both as training and as validation set.
losses = train_epochs(
    encoder=encoder0,
    decoder=decoder0,
    encoder_optimizer=encoder0_optimizer,
    decoder_optimizer=decoder0_optimizer,
    train_dataset=terao_se_ds,
    val_dataset={'terao_sp_ds': terao_se_ds},
    source_vocab=ds.source_list,
    target_vocab=ds.target_list,
    source_ids=ds.source,
    target_ids=ds.target,
    epochs=configs['epochs'],
    lr=configs['lr'],
    criterion=configs['loss_func'],
    teacher_forcing_ratio=configs['teacher_forcing_ratio'],
    max_length=ds.maxlen,
    params=configs,
    device=device,
    #n_sample=0,
)

plt.plot(losses)

# After training: list the items model0 still fails to reproduce and report
# the percentage of exact matches.
_ds = terao_se_ds
inputs = [entry['ひら'] for entry in _ds.data_dict.values()]
counter = 0
for i, inp in enumerate(inputs):
    tgt = ds.target_ids2tkn(_ds[i][-1])
    out = eval_input_seq2seq(encoder=encoder0, decoder=decoder0, ds=ds, inp_wrd=inp, isPrint=False)
    yesno = out[0] == tgt
    if yesno:
        color = 'blue'
        counter += 1
        continue
    color = 'red'
    # The last token of each sequence is dropped for display
    produced = "".join(out[0][:-1])
    expected = "".join(tgt[:-1])
    print(f'{i:3d}: {inp}->/{produced}/',
          f'{colored(yesno, color,attrs=["bold"])}',
          f' tgt:{expected}')

n_items = len(_ds.data_dict)
p = counter / n_items
print(f'counter:{counter}/{n_items} = {p * 100:6.2f}')

In [None]:
# Summarise the loaded checkpoint: dict-valued entries are shown by their
# length, everything else by value.
for k, v in sorted(X.items()):
    print(k, len(v) if isinstance(v, dict) else v)

X.keys()

In [None]:
# Check how fine-tuned model0 behaves on the items of `ds`: feed each reading
# (converted to hiragana) and compare the output with the dataset's target.
#
# All state is initialised here so the cell works on a fresh kernel and does
# not inherit `counter` from the previous evaluation cell.
factor = 0      # evaluate the first len(ds) >> factor items; 0 means the whole dataset
counter = 0     # number of items reproduced exactly
resps = []      # (index, hiragana input, model output, target) for every mismatch

n_eval = len(ds) >> factor
for i in tqdm(range(n_eval)):
    _inp, _tch = ds[i]
    kana = ds.data_dict[i]['yomi']
    hira = jaconv.kata2hira(kana)
    out = eval_input_seq2seq(encoder=encoder0, decoder=decoder0, ds=ds, inp_wrd=hira, isPrint=False)

    # The last token of each sequence is dropped before comparison
    _tch_wrd = "".join(ds.target_ids2tkn(_tch)[:-1])
    _out_wrd = "".join(out[0][:-1])
    if _out_wrd == _tch_wrd:
        counter += 1
    else:
        resps.append((i, hira, _out_wrd, _tch_wrd))

# Guard against an empty evaluation range (e.g. a very large `factor`)
p = counter / n_eval if n_eval else 0.0
print(f'counter:{counter}/{n_eval}={p:6.3f}')

## 1 パラメータの一部を凍結させて，転移学習 `model1` GRU を訓練可能とし，注意を凍結

In [None]:
def freeze_enc_dec_param(encoder:torch.nn.Module=encoder,
                         decoder:torch.nn.Module=decoder,
                         attn_flg:bool=True,
                         gru_flg:bool=False,
                        ):
    """Freeze decoder parameters except the selected groups, for transfer learning.

    Every encoder parameter is left trainable. In the decoder only the groups
    selected by the flags keep `requires_grad=True`; all other decoder
    parameters are frozen. The modules are modified in place.

    Note: the default `encoder` / `decoder` are the module-level objects that
    exist when this function is defined.

    Args:
        encoder: encoder module; all of its parameters stay trainable.
        decoder: decoder module whose parameters are selectively frozen.
        attn_flg: keep `attn.*` and `attn_combine.*` trainable.
        gru_flg: keep the `gru.*_l0` weights and biases trainable.
            The two flags can be combined.

    Returns:
        (encoder, decoder, params_to_update), where params_to_update maps
        'decoder.<name>' / 'encoder.<name>' to each trainable parameter.
        The prefix keeps encoder and decoder parameters with the same name
        from overwriting each other.
    """
    # Names of the decoder parameters that remain trainable
    update_param_names = set()
    if attn_flg:
        update_param_names.update(
            ["attn.bias", "attn.weight", "attn_combine.bias", "attn_combine.weight"])
    if gru_flg:
        update_param_names.update(
            ['gru.weight_ih_l0', 'gru.weight_hh_l0', 'gru.bias_ih_l0', 'gru.bias_hh_l0'])

    params_to_update = {}

    # Disable gradients for every decoder parameter that is not selected
    for name, param in decoder.named_parameters():
        param.requires_grad = name in update_param_names
        if param.requires_grad:
            params_to_update[f'decoder.{name}'] = param

    # The encoder is always fully trainable
    for name, param in encoder.named_parameters():
        param.requires_grad = True
        params_to_update[f'encoder.{name}'] = param

    return encoder, decoder, params_to_update


# model1: keep the decoder GRU trainable and freeze the attention layers
# (the function leaves every encoder parameter trainable as well)
encoder1, decoder1, _params_to_update = freeze_enc_dec_param(
    encoder1, decoder1, 
    attn_flg=False,
    gru_flg=True)
# Show which parameters will still be updated
print(_params_to_update.keys())

### 1.1 再訓練の実施

In [None]:
from RAM import train_epochs

# Train model1 (encoder1, decoder1) on the speech-error data.
# The epoch count comes from configs, like the model0 and model2 runs, so the
# three conditions cannot drift apart when configs['epochs'] is changed.
losses = train_epochs( 
    epochs=configs['epochs'],
    lr=configs['lr'],
    train_dataset=terao_se_ds,
    val_dataset={'terao_sp_ds': terao_se_ds},
    encoder=encoder1, decoder=decoder1,
    encoder_optimizer=encoder1_optimizer, decoder_optimizer=decoder1_optimizer,
    source_vocab=ds.source_list, target_vocab=ds.target_list,
    source_ids=ds.source, target_ids=ds.target,
    criterion=configs['loss_func'],
    params=configs,
    device=device,
    max_length=ds.maxlen,
    #n_sample=0,
    teacher_forcing_ratio=configs['teacher_forcing_ratio'],
)

plt.plot(losses) 

# After training: list the items model1 fails to reproduce and report the
# percentage of exact matches.
_ds = terao_se_ds
inputs = [v['ひら'] for k, v in _ds.data_dict.items()]
counter = 0
for i, inp in enumerate(inputs):
    tgt = ds.target_ids2tkn(_ds.__getitem__(i)[-1])
    out = eval_input_seq2seq(encoder=encoder1, decoder=decoder1, ds=ds, inp_wrd=inp, isPrint=False)
    yesno = out[0] == tgt
    if yesno:
        color = 'blue'
        counter += 1
    else:
        color = 'red'
    if not yesno:
        # The last token of each sequence is dropped for display
        print(f'{i:3d}: {inp}->/{"".join(ph for ph in out[0][:-1])}/',
              f'{colored(yesno, color,attrs=["bold"])}',
              f' tgt:{"".join(ph for ph in tgt[:-1])}')

p = counter/_ds.data_dict.__len__()
        
print(f'counter:{counter}/{_ds.data_dict.__len__()} = {p * 100:6.2f}')    

In [None]:
# Decode with model1 and show the output tokens (last one dropped) together
# with exp() of the second return value
# (presumably log-probabilities - TODO confirm against RAM).
outwrd, l = eval_input_seq2seq(encoder=encoder1, decoder=decoder1, ds=ds, isPrint=False)
_tokens = " ".join(outwrd[:-1])
_scores = np.exp(np.array(l))
print(_tokens, _scores)

## 2 パラメータの一部を凍結させて，転移学習 `model2` GRU を凍結し，注意を訓練可能とする

In [None]:
# model2: freeze the decoder GRU and keep the attention layers trainable
# (attn_flg=True, gru_flg=False)
encoder2, decoder2, _params_to_update = freeze_enc_dec_param(
    encoder2, decoder2, 
    attn_flg=True,
    gru_flg=False)
# Show which parameters will still be updated
print(_params_to_update.keys())

### 2.1. 再訓練の実施

In [None]:
# model2: decoder GRU fixed, attention trainable.
# The freeze is applied again here, with the same flags as the previous cell.
encoder2, decoder2, _params_to_update = freeze_enc_dec_param(
    encoder2,
    decoder2,
    attn_flg=True,
    gru_flg=False,
)
print(_params_to_update.keys())

# Train model2 (encoder2, decoder2) on the speech-error data.
# The same dataset is passed both as training and as validation set.
losses = train_epochs(
    encoder=encoder2,
    decoder=decoder2,
    encoder_optimizer=encoder2_optimizer,
    decoder_optimizer=decoder2_optimizer,
    train_dataset=terao_se_ds,
    val_dataset={'terao_sp_ds': terao_se_ds},
    source_vocab=ds.source_list,
    target_vocab=ds.target_list,
    source_ids=ds.source,
    target_ids=ds.target,
    epochs=configs['epochs'],
    lr=configs['lr'],
    criterion=configs['loss_func'],
    teacher_forcing_ratio=configs['teacher_forcing_ratio'],
    max_length=ds.maxlen,
    params=configs,
    device=device,
    #n_sample=0,
)

plt.plot(losses)

# After training: list the items model2 fails to reproduce and report the
# percentage of exact matches.
_ds = terao_se_ds
inputs = [entry['ひら'] for entry in _ds.data_dict.values()]
counter = 0
for i, inp in enumerate(inputs):
    tgt = ds.target_ids2tkn(_ds[i][-1])
    out = eval_input_seq2seq(encoder=encoder2, decoder=decoder2, ds=ds, inp_wrd=inp, isPrint=False)
    yesno = out[0] == tgt
    if yesno:
        color = 'blue'
        counter += 1
        continue
    color = 'red'
    # The last token of each sequence is dropped for display
    produced = "".join(out[0][:-1])
    expected = "".join(tgt[:-1])
    print(f'{i:3d}: {inp}->/{produced}/',
          f'{colored(yesno, color,attrs=["bold"])}',
          f' tgt:{expected}')

n_items = len(_ds.data_dict)
p = counter / n_items

print(f'counter:{counter}/{n_items} = {p * 100:6.2f}')

In [None]:
# Decode with model2 and show the output tokens (last one dropped) together
# with exp() of the second return value
# (presumably log-probabilities - TODO confirm against RAM).
outwrd, l = eval_input_seq2seq(encoder=encoder2, decoder=decoder2, ds=ds, isPrint=False)
_tokens = " ".join(outwrd[:-1])
_scores = np.exp(np.array(l))
print(_tokens, _scores)


フルの微調整 (`model0`) の場合の出力例
```
  1: けが->/keka/ False  tgt:kage
  8: こと->/koko/ False  tgt:toko
 20: はまなこ->/chaNanako/ False  tgt:hanamako
 21: こっきょう->/koqko:/ False  tgt:kyoqko:
 30: のーとるだむ->/no:rutodamu/ False  tgt:no:tomudaru
 37: こうしょきょうふしょう->/ko:shoko:fusho:/ False  tgt:kyo:shoko:fusho:
 42: かんとくさんにん->/kaNtokukaNniN/ False  tgt:saNtokukaNniN
 46: ななねんめ->/nanameNne/ False  tgt:shichimeNne
 61: はいるあて->/karuhate/ False  tgt:airuhate
 64: あんぜんうんてん->/aNzeNaNteN/ False  tgt:uNzeNaNteN
 65: けいひん->/keikiN/ False  tgt:heikiN
 85: かさま->/kakama/ False  tgt:sakama
 90: かっぷ->/kyaqku/ False  tgt:paqku
103: こと->/koko/ False  tgt:toko
128: かってしったる->/kaqtekaqtaru/ False  tgt:shiqtekaqtaru
140: たどって->/tadaq/ False  tgt:todaq
counter:128/144 =  88.89
```

`gru_flg = True` の場合 (`model1`)，すなわち注意層 (`attn`, `attn_combine`) がフリーズされる場合の出力例

```
 12: あがつま->/agamama/ False  tgt:agamatsu
 15: とつぎさき->/tutogisaki/ False  tgt:tsutogisaki
 19: うらない->/urarai/ False  tgt:unarai
 20: はまなこ->/chanamako/ False  tgt:hanamako
 29: まほめっと->/mameNoqto/ False  tgt:mamehoqto
 31: のーとるだむ->/no:tomudaru/ False  tgt:no:rutodamu
 36: ほっとちょこ->/choqtochoko/ False  tgt:choqtohoko
 38: さようはんさよう->/zayo:saNsayo:/ False  tgt:hayo:saNsayo:
 41: かんこうきょうかい->/kaNko:ko:kai/ False  tgt:kaNkyo:ko:kai
 42: かんとくさんにん->/kaNtokukaNniN/ False  tgt:saNtokukaNniN
 45: さいもんとがーふぁんくる->/gaimoNtosa:furoNu/ False  tgt:gaimoNtosa:faNkuru
 59: あけわたしました->/aakeatashimashita/ False  tgt:wakeatashimashita
 60: たてのかいてん->/tatenotaiteN/ False  tgt:katenotaiteN
 62: びじん->/jijiN/ False  tgt:jibiN
 63: ほっとちょこれーと->/choqtochokore:to/ False  tgt:choqtohokore:to
 64: あんぜんうんてん->/aNzeNaNteN/ False  tgt:uNzeNaNteN
 77: みとめてもらう->/mitomo/ False  tgt:mitemo
 80: くうこう->/ko:ko:/ False  tgt:ko:ku:
 85: かさま->/kakama/ False  tgt:sakama
 86: おおわらわ->/o:wara/ False  tgt:o:rawa
 88: Eかっぷ->/mypaqku/ False  tgt:<UNK>paqku
105: ばななわにえん->/bananabanieN/ False  tgt:wananabanieN
118: うらわ->/uwaka/ False  tgt:uwara
119: はをかって->/aaohaqte/ False  tgt:kaohaqte
126: せんきょけっか->/seNkyoseqka/ False  tgt:keNkyoseqka
130: さいたはな->/kaitasana/ False  tgt:haitasana
132: うけとめて->/ututomere/ False  tgt:utoke
140: たどって->/tadaq/ False  tgt:todaq
141: たのしませて->/tanoe:ashi/ False  tgt:tanomashi
counter:115/144 =  79.86
```
