<a href="https://colab.research.google.com/github/ShinAsakawa/ShinAsakawa.github.io/blob/master/2025notebooks/2025_0713CDP%2Bja.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. 準備作業

<img src="https://raw.githubusercontent.com/project-ccap/project-ccap.github.io/refs/heads/master/2025figs/1998Zorzi_CDP_fig1.svg" style="width:49%;"><br/>
<p>Zorzi+(1998) Fig.1 Architecture of the model. Arrows indicate full connectivity between layers. Each box stands for a group of letters (26) or phonemes (44).</p>


<img src="https://raw.githubusercontent.com/project-ccap/project-ccap.github.io/refs/heads/master/2025figs/1998Zorzi_CDP_fig8.svg" style="width:49%;"><br/>
<p>Zorzi+(1998) Fig.8. Architecture of the model with the hidden layer pathway. In both the direct pathway and the mediated pathway the layers are fully connected (arrows).</p>

<img src="https://raw.githubusercontent.com/project-ccap/project-ccap.github.io/refs/heads/master/2025figs/1998Zorzi_fig10.svg" width="49%"><br/>
<p style="text-align:center">
Figure 10. Lexical and sublexical procedures in reading aloud, and their interaction in the phonological decision system, where the final phonological code is computed for articulation.
</p>


## 0.1 必要なライブラリの輸入

In [None]:
%config InlineBackend.figure_format = 'retina'
import torch
#device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print(f'device:{device}')

# 必要なライブラリの輸入
from collections import OrderedDict
import sys
import os
import numpy as np
import operator
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

HOME = os.environ['HOME']

from IPython import get_ipython
isColab =  'google.colab' in str(get_ipython())
print(f'isColab:{isColab}')

try:
    import japanize_matplotlib
except ImportError:
    !pip install japanize_matplotlib
    import japanize_matplotlib

try:
    import ipynbname
except ImportError:
    !pip install ipynbname
    import ipynbname

FILEPATH = str(ipynbname.path()).split('/')[-1]
print(f'FILEPATH:{FILEPATH}')

try:
    import CDP_ja
except ImportError:
    !git clone https://github.com/ShinAsakawa/CDP_ja.git
    import CDP_ja

In [None]:
#!rm -rf CDP_ja

## 0.2 NTT 日本語語彙特性 単語頻度データ psylex71.txt のダウンロード

## 0.3 モーラ tokenizer の定義 (モーラ分かち書き)

In [None]:
# Instantiate the four tokenizers provided by CDP_ja. Later cells use
# gakushu_tokenizer on the input side and mora_tokenizer / kunrei_tokenizer
# on the output side.
from CDP_ja import mora_Tokenizer, kunrei_Tokenizer, gakushu_Tokenizer, joyo_Tokenizer
mora_tokenizer = mora_Tokenizer()
kunrei_tokenizer = kunrei_Tokenizer()
gakushu_tokenizer = gakushu_Tokenizer()
joyo_tokenizer = joyo_Tokenizer()

# Disabled example: encode / decode round trip with kunrei_tokenizer.
# word = 'アカサタナ'
# ids = kunrei_tokenizer(word)
# print(word, ids, kunrei_tokenizer.decode(ids))
# print(kunrei_tokenizer.wakachi(word))

# # Disabled check of gakushu_tokenizer defined above
# print(gakushu_tokenizer('学校'))
# print(gakushu_tokenizer.decode(gakushu_tokenizer('学校')))
# print(len(gakushu_tokenizer.tokens), len(mora_tokenizer.tokens),)

# Joyo kanji (disabled)
# from RAM.char_ja import chars_joyo as chars_joyo
# joyo_chars = "".join([ch for ch in chars_joyo().char_list])
# print(joyo_chars)


# 1. NTT 日本語語彙特性 単語頻度データ psylex71 データセットの定義

In [None]:
from CDP_ja import Psylex71_Dataset

# Build one Psylex71 dataset per maximum input length, keyed by that maximum.
psylex71_dss = {}
inplen_min = 2
for inplen_max in [2, 3, 4, 5]:
    psylex71_dss[inplen_max] = Psylex71_Dataset(
        inplen_min=inplen_min,
        inplen_max=inplen_max,
        input_tokenizer=gakushu_tokenizer,
        output_tokenizer=mora_tokenizer,
        device=device,
        # Only the first dataset is built with display=True. The flag is
        # computed inline so that no global named `display` is created, which
        # would shadow IPython's display() for the rest of the notebook.
        display=(inplen_max == 2))

    print(f'psylex71 最短文字長:{inplen_min}, 最長文字長:{inplen_max}',
          f'データセットサイズ (単語数):{len(psylex71_dss[inplen_max]):7,d} 語')

# 3. モデルの定義
## 3.1 TLA モデルの定義

In [None]:
# 全モデル共通使用するライブラリの輸入
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

from CDP_ja import vanilla_TLA
from CDP_ja import Seq2Seq_wAtt
from CDP_ja import Seq2Seq_woAtt

## Transformer model の定義

In [None]:
# from RAM import Transformer
# transformer = Transformer(src_vocab_size=len(gakushu_tokenizer.tokens),
#                           tgt_vocab_size=len(mora_tokenizer.tokens),
#                           model_dim=256,
#                           num_heads=4,
#                           num_layers=1,
#                           max_seq_length=psylex71_dss[2].maxlen_out,
#                           #max_seq_length=psylex71_ds.maxlen_out,
#                           dropout=0.,
#                           ff_dim=32,
#                           device=device)
# #).to(device)
# transformer.eval();

# 4. 訓練 (train) データセット，検証 (valid) データセット，検査 (test) データセットへ分割

## 4.1 データセットの選択

In [None]:
# The two datasets below share the input tokenizer and differ only in the
# output tokenizer (mora vs. kunrei).
# NOTE(review): unlike the psylex71_dss datasets built earlier, no device=,
# inplen_min= or inplen_max= is passed here, so the Psylex71_Dataset
# defaults apply -- confirm that this is intended.
psylex71_ds_mora   = Psylex71_Dataset(input_tokenizer=gakushu_tokenizer, output_tokenizer=mora_tokenizer)
psylex71_ds_kunrei = Psylex71_Dataset(input_tokenizer=gakushu_tokenizer, output_tokenizer=kunrei_tokenizer)

# # データセットのチェック
# for _ds in [psylex71_ds_mora, psylex71_ds_kunrei]:
#     for N in np.random.permutation(_ds.__len__())[:5]:
#     #for N in range(3):
#         inp, tgt = _ds.__getitem__(N)
#         print(f'_ds.ids2inp(inp):{_ds.ids2inp(inp)}',
#               f'{inp.cpu().numpy()}',
#               f'_ds.target_ids2target(tgt):{_ds.target_ids2target(tgt)}',
#               f'{tgt.cpu().numpy()}')

## 4.2 データセットの分割,訓練,検査,検証データセット

In [None]:
# Split the selected dataset into a training subset (7%), a validation
# subset (3%) and an unused remainder.
seed=42

_ds = psylex71_ds_mora
#_ds = psylex71_ds_kunrei

n_total = len(_ds)
train_size = int(n_total * 0.07)
valid_size = int(n_total * 0.03)
# random_split needs lengths that add up to the dataset size, so everything
# that is neither train nor valid goes into the remainder.
resid_size = n_total - train_size - valid_size

# The third subset is bound to resid_ds; binding it to resid_size would
# overwrite the integer size with a Subset object.
train_ds, valid_ds, resid_ds = torch.utils.data.random_split(
    dataset=_ds,
    lengths=(train_size, valid_size, resid_size),
    generator=torch.Generator().manual_seed(seed))

print(f'train_size:{train_size}')
print(f'valid_size:{valid_size}')

## 4.3 バッチサイズの定義とデータローダの設定

In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 1024

def _collate_fn(batch):
    """Turn a list of (input, target) pairs into two parallel lists.

    The items are left unpadded; padding is done by the consumer of the
    batch (the epoch functions call pad_sequence on each list).
    """
    inps, tgts = zip(*batch)
    return list(inps), list(tgts)

# Each loader is built exactly once, with the custom collate function.
train_dl = torch.utils.data.DataLoader(
    dataset=train_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=_collate_fn)

# Validation only aggregates loss and accuracy over the whole subset, so the
# order of the items is irrelevant and shuffling is switched off.
valid_dl = torch.utils.data.DataLoader(
    dataset=valid_ds,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    collate_fn=_collate_fn)

print(f'train_ds.__len__():{train_ds.__len__()}')
print(f'valid_ds.__len__():{valid_ds.__len__()}')

# # 以下，検証
# _ds = train_ds
# for N in range(2):
#     inp, tgt = _ds.__getitem__(N)
#     print(f'_ds.dataset.ids2inp(inp):{_ds.dataset.ids2inp(inp)}',
#           f'{inp.cpu().numpy()}',
#           f'_ds.dataset.target_ids2target(tgt):{_ds.dataset.target_ids2target(tgt)}',
#           f'{tgt.cpu().numpy()}')

# psylex71_ds.maxlen_out

## 4.4 定義したモデルで動作チェック

In [None]:
# Smoke test: run the (untrained) model on N randomly chosen items.
N = 5
input_tokenizer=gakushu_tokenizer
output_tokenizer=mora_tokenizer
ds=train_ds
n_hid=1024

tla_vanilla = vanilla_TLA(inp_vocab_size=len(input_tokenizer.tokens),
                          out_vocab_size=len(output_tokenizer.tokens),
                          inp_len=2,
                          out_len=ds.dataset.maxlen_out,
                          device=device,
                          n_hid=n_hid).to(device)

# Model under test.
tla = tla_vanilla

# Items are taken from the dataset restricted to two input characters
# (presumably because the model is built with inp_len=2 -- TODO confirm).
_ds = psylex71_dss[2]

# Draw the indices from the dataset that is actually indexed below. Drawing
# them from train_ds, which wraps a different dataset, can yield indices
# that are out of range here or cover only part of this dataset.
ids = np.random.permutation(len(_ds))[:N]

for idx in ids:
    # The dataset returns the input signal inp and the teacher signal tch.
    inp, tch = _ds.__getitem__(idx)
    print(f'idx:{idx}:', f'inp:{inp}', f'tch:{tch}')

    # Both signals are token ids; convert them back for human readers.
    print(f'_ds.ids2inp({inp}):{_ds.ids2inp(inp)}')
    print(f'_ds.taregt_ids2target({tch}):{_ds.target_ids2target(tch)}')
    inp = pad_sequence(inp.unsqueeze(0), batch_first=True).to(device)
    tch = pad_sequence(tch.unsqueeze(0), batch_first=True).to(device)

    # One forward pass is enough, and no gradients are needed for a check.
    with torch.no_grad():
        outs = tla(inp, tch)

    print('教師:', _ds.target_ids2target([tok.cpu().numpy() for tok in tch.squeeze(0)]), end=": ")
    print('教師 ids:', [int(_tch.cpu().numpy()) for _tch in tch.squeeze(0)])
    print('出力 ids:', [int(_out.argmax().cpu().numpy()) for _out in outs[0]], end="\n===\n")

# 5. 学習

In [None]:
def fit_an_epoch(model:torch.nn.Module=None,
                 optimizer:torch.optim.Optimizer=None,
                 loss_f:torch.nn.Module=None,
                 _dl:torch.utils.data.DataLoader=None):
    """Train `model` for one pass over `_dl`.

    Args:
        model: called as ``model(inps, tchs)``; its output is indexed per
            item, and each item's output is argmax-ed over dim 1.
        optimizer: optimizer that updates the parameters of `model`.
        loss_f: loss called as ``loss_f(output_item, teacher_item)``.
        _dl: iterable of ``(inputs, teachers)`` batches, each a list of 1-D
            id tensors (the format produced by the notebook's collate
            function).

    Returns:
        dict with 'sum_loss' (per-item losses summed over all batches),
        'count' (items whose whole output sequence matches the teacher),
        'N' (number of items seen) and 'P' (count / N, or 0.0 when the
        loader yields nothing).
    """
    model.train()  # switch to training mode

    # Run on the device the model lives on instead of relying on a
    # notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    sum_loss=0
    count=0
    N = 0

    for inps, tchs in _dl:
        # pad_sequence fills with 0 by default, so the padded positions take
        # part in both the accuracy and the loss below unless loss_f ignores
        # index 0 -- NOTE(review): confirm against the tokenizer's pad id.
        inps = pad_sequence(inps, batch_first=True).to(run_device)
        tchs = pad_sequence(tchs, batch_first=True).to(run_device)
        outs = model(inps, tchs)

        # An item counts as correct only when every position matches.
        out_ids = [out.argmax(dim=1) for out in outs]
        for tch, out in zip(tchs, out_ids):
            count += 1 if bool((tch == out).all()) else 0

        # Sum the per-item losses of the batch and take one optimizer step.
        loss = 0.
        optimizer.zero_grad()
        for j in range(len(tchs)):
            loss += loss_f(outs[j],tchs[j])
        loss.backward()  # back-propagate to obtain the gradients
        optimizer.step() # update the parameters
        sum_loss += loss.item()

        N += len(tchs)

    # An empty loader must not end in a ZeroDivisionError.
    p_ = count / N if N > 0 else 0.0
    return {'sum_loss':sum_loss, 'count':count, 'N':N, 'P':p_}

In [None]:
def eval_an_epoch(model:torch.nn.Module=None,
                  loss_f:torch.nn.Module=None,
                  _dl:torch.utils.data.DataLoader=None):
    """Evaluate `model` on one pass over `_dl` without updating it.

    Args:
        model: called as ``model(inps, tchs)``; its output is indexed per
            item, and each item's output is argmax-ed over dim 1.
        loss_f: loss called as ``loss_f(output_item, teacher_item)``.
        _dl: iterable of ``(inputs, teachers)`` batches, each a list of 1-D
            id tensors (the format produced by the notebook's collate
            function).

    Returns:
        dict with 'sum_loss' (per-item losses summed over all batches),
        'count' (items whose whole output sequence matches the teacher),
        'N' (number of items seen) and 'P' (count / N, or 0.0 when the
        loader yields nothing).
    """
    model.eval()  # switch to evaluation mode

    # Run on the device the model lives on instead of relying on a
    # notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    sum_loss=0
    count=0
    N = 0

    # Nothing is back-propagated here, so gradient tracking is switched off
    # to avoid building an autograd graph for every batch.
    with torch.no_grad():
        for inps, tchs in _dl:
            inps = pad_sequence(inps, batch_first=True).to(run_device)
            tchs = pad_sequence(tchs, batch_first=True).to(run_device)
            outs = model(inps, tchs)

            # An item counts as correct only when every position matches.
            out_ids = [out.argmax(dim=1) for out in outs]
            for tch, out in zip(tchs, out_ids):
                count += 1 if bool((tch == out).all()) else 0

            # Loss only; no parameter update.
            loss = 0.
            for j in range(len(tchs)):
                loss += loss_f(outs[j],tchs[j])
            sum_loss += loss.item()

            N += len(tchs)

    # An empty loader must not end in a ZeroDivisionError.
    p_ = count / N if N > 0 else 0.0
    return {'sum_loss':sum_loss, 'count':count, 'N':N, 'P':p_}

## 5.1 訓練に用いるモデルを再定義

In [None]:
# Re-create the three models that are trained and compared below, all with
# the same hidden size.
ds = train_ds
n_layers=1
bidirectional=False
n_hid=1024  # the only value in effect; the earlier 512 / 128 were overwritten

input_tokenizer=gakushu_tokenizer
output_tokenizer=mora_tokenizer

tla_vanilla = vanilla_TLA(inp_vocab_size=len(input_tokenizer.tokens),
                          out_vocab_size=len(output_tokenizer.tokens),
                          inp_len=2,
                          out_len=ds.dataset.maxlen_out,
                          device=device,
                          n_hid=n_hid).to(device)
# .eval() returns the module itself, so this prints the architecture; it
# also leaves the model in evaluation mode.
print(tla_vanilla.eval())

# Sequence-to-sequence model with attention.
tla_seq2seq = Seq2Seq_wAtt(enc_vocab_size=len(input_tokenizer.tokens),
                           dec_vocab_size=len(output_tokenizer.tokens),
                           n_layers=n_layers,
                           bidirectional=bidirectional,
                           n_hid=n_hid).to(device)
print(tla_seq2seq.eval())

# Sequence-to-sequence model without attention.
tla_seq2seq0 = Seq2Seq_woAtt(enc_vocab_size=len(input_tokenizer.tokens),
                             dec_vocab_size=len(output_tokenizer.tokens),
                             n_layers=n_layers,
                             bidirectional=bidirectional,
                             n_hid=n_hid).to(device)
print(tla_seq2seq0.eval())

## 5.2 実際の訓練

In [None]:
model0 = tla_vanilla
model1 = tla_seq2seq
model2 = tla_seq2seq0

# NOTE(review): ignore_index=-1 does not match the value 0 that pad_sequence
# pads with by default, so padded positions contribute to the loss --
# confirm against the output tokenizer's pad id.
loss_f = torch.nn.CrossEntropyLoss(ignore_index=-1)

# Training schedule as (learning rate, number of epochs) stages.
iter_params = [(1e-3, 10)]
#iter_params = [(1e-3, 20), (1e-4, 10), (1e-5, 5)]

# One history dict per model. The index follows the order of `runs` below:
# results[0] -> model2, results[1] -> model1, results[2] -> model0.
results = [{}, {}, {}]

# Progress is printed every `interval` epochs.
interval = 1

for (lr, epochs) in iter_params: # one stage per (learning rate, epochs) pair

    # Fresh optimizers for every stage, using the learning rate of the stage.
    optimizer0 = torch.optim.Adam(model0.parameters(), lr=lr)
    optimizer1 = torch.optim.Adam(model1.parameters(), lr=lr)
    optimizer2 = torch.optim.Adam(model2.parameters(), lr=lr)
    runs = [(model2,optimizer2), (model1,optimizer1), (model0,optimizer0)]

    print(f'lr:{lr}, epochs:{epochs}')
    for epoch in range(epochs):
        print(f"エポック:{epoch+1:3d}")

        # The loop index is named run_idx so that it does not clobber the
        # notebook-level variable N.
        for run_idx, (model, optimizer) in enumerate(runs):

            # One epoch of training.
            out = fit_an_epoch(model=model, _dl=train_dl, loss_f=loss_f, optimizer=optimizer)
            if (epoch % interval) == 0:
                print(f"学習損失値={out['sum_loss']:10.3f}",
                      f"正解率={out['P']:5.3f}",
                      f"({out['count']:5d}/{out['N']:5d})",
                      end="\t")

            results[run_idx].setdefault('train_loss', []).append(out['sum_loss'])
            results[run_idx].setdefault('train_P', []).append(out['P'])

        if (epoch % interval) == 0:
            print()

        for run_idx, (model, _) in enumerate(runs):

            # One epoch of validation.
            out = eval_an_epoch(model=model, _dl=valid_dl, loss_f=loss_f)
            if (epoch % interval) == 0:
                print(f"検証損失値={out['sum_loss']:10.3f}",
                      f"正解率={out['P']:5.3f}",
                      f"({out['count']:5d}/{out['N']:5d})",
                      end="\t")

            results[run_idx].setdefault('valid_loss', []).append(out['sum_loss'])
            results[run_idx].setdefault('valid_P', []).append(out['P'])

        if (epoch % interval) == 0:
            print()


In [None]:
# Loss curves of every trained model, with a legend so the lines can be told
# apart. results[i] follows the order used by the training loop:
# 0 = Seq2Seq_woAtt, 1 = Seq2Seq_wAtt, 2 = vanilla_TLA.
model_names = ['Seq2Seq_woAtt', 'Seq2Seq_wAtt', 'vanilla_TLA']

fig, ax = plt.subplots()
for model_name, history in zip(model_names, results):
    ax.plot(history['train_loss'], label=f'{model_name} train')
    ax.plot(history['valid_loss'], linestyle='--', label=f'{model_name} valid')
    print(history['train_P'])
    print(history['valid_P'])

ax.set_xlabel('epoch')
ax.set_ylabel('summed loss')
ax.set_title('Training and validation loss')
ax.legend();



In [None]:
# Dump the recorded history of the first entry in `results`. The training
# loop enumerates the models as (model2, model1, model0), so index 0 holds
# the history of model2.
#train_ds.dataset.output_tokenizer
print(results[0])