<a href="https://colab.research.google.com/github/ShinAsakawa/ShinAsakawa.github.io/blob/master/2025notebooks/2025_0626psylex71_CDP%2Bja.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="https://raw.githubusercontent.com/project-ccap/project-ccap.github.io/refs/heads/master/2025figs/1998Zorzi_CDP_fig1.svg">
Zorzi+(1998) Fig.1 Architecture of the model. Each arrow indicates full connectivity between layers. Each box stands for a group of letters (26) or phonemes (44).<br/>


<img src="https://raw.githubusercontent.com/project-ccap/project-ccap.github.io/refs/heads/master/2025figs/1998Zorzi_CDP_fig8.svg">
<p>Zorzi+(1998) Fig.8. Architecture of the model with the hidden layer pathway. In both the direct pathway and the mediated pathway the layers are fully connected (arrows).</p>



In [None]:
%config InlineBackend.figure_format = 'retina'
import torch
# Select the compute device in order of preference: first CUDA GPU, then Apple MPS, then CPU.
# (The commented line below is the CUDA/CPU-only variant.)
#device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print(f'device:{device}')

# Import the required libraries
from collections import OrderedDict
import operator
import os
import sys
# import time
# import datetime

import matplotlib.pyplot as plt
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from tqdm.notebook import tqdm

# Home directory of the current user; raises KeyError if the HOME variable is not set.
HOME = os.environ['HOME']

from IPython import get_ipython
# True when the string form of the active IPython shell mentions google.colab.
isColab =  'google.colab' in str(get_ipython())

# Install ipynbname on demand (notebook shell escape) if it is not importable.
try:
    import ipynbname
except ImportError:
    !pip install ipynbname
    import ipynbname

# Keep only the last '/'-separated component of the notebook path, i.e. the file name.
FILEPATH = str(ipynbname.path()).split('/')[-1]
print(f'FILEPATH:{FILEPATH}')

# Install japanize_matplotlib on demand if it is not importable.
# NOTE(review): presumably imported for its side effect of enabling Japanese fonts in matplotlib - confirm.
try:
    import japanize_matplotlib
except ImportError:
    !pip install japanize_matplotlib
    import japanize_matplotlib

# モーラ分かち書きの定義    

In [None]:
# Mora segmentation of katakana strings
# source https://qiita.com/shimajiroxyz/items/a133d990df2bc3affc12
import re

# One regular expression per mora shape. The alternatives are tried in this
# order, so the two-character morae win over the single-character fallback.
c1 = '[ウクスツヌフムユルグズヅブプヴ][ァィェォ]'  # selected u-row kana + small ァ/ィ/ェ/ォ
c2 = '[イキシチニヒミリギジヂビピ][ャュェョ]'  # i-row kana + small ャ/ュ/ェ/ョ
c3 = '[テデ][ィュ]'  # テ/デ + small ィ/ュ
c4 = '[ァ-ヴー]'  # any single katakana character (long-vowel mark included)

cond = '(' + '|'.join((c1, c2, c3, c4)) + ')'
re_mora = re.compile(cond)

def moraWakachi(kana_text):
    """Split a katakana string into a list of morae.

    The historical kana ヱ and ヰ are first replaced by エ and イ. Characters
    that match none of the mora patterns are silently dropped.
    """
    modernised = kana_text.translate(str.maketrans({'ヱ': 'エ', 'ヰ': 'イ'}))
    return re_mora.findall(modernised)

# text = 'シンシュンシャンソンショー'
# print(text)
# print(moraWakachi(text))
# print('')

# text = 'トーキョートッキョキョカキョク'
# print(text)
# print(moraWakachi(text))
# print('')

# text = 'アウトバーン'
# print(text)
# print(moraWakachi(text))
# print('')

# text = 'ガッキュウホウカイ'
# print(text)
# print(moraWakachi(text))

# 文字の定義，(学習文字，かな，カナ，数字，記号など)

In [None]:
# Grapheme definitions. Katakana graphemes are also used for the phonological (reading) representation.

seed = 42  # random seed (passed to the generator of random_split further below)
# Special tokens; their list positions become the first target ids (<PAD>=0, <EOW>=1, <SOW>=2, <UNK>=3).
special_tokens = ['<PAD>', '<EOW>', '<SOW>', '<UNK>']
# Full-width upper-case and lower-case Latin letters.
alphabet_upper_chars='ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ'
alphabet_lower_chars='ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ'
# Full-width digits.
num_chars='０１２３４５６７８９'
# Hiragana characters.
hira_chars='ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをん'
# Katakana characters (the long-vowel mark 'ー' is not included).
kata_chars='ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ'
#kata_chars=kata_chars+'一'  # add the long-vowel sign to the katakana set
#phon_list = list(kata_chars+'一')

# # 句点コード
# from RAM.char_ja import kuten as kuten
# kuten_chars=kuten().chars

# # 常用漢字
# from RAM.char_ja import chars_joyo as chars_joyo
# joyo_chars = "".join([ch for ch in chars_joyo().char_list])

# 学習漢字 学年別
_gakushu_list = ['一右雨円王音下火花貝学気休玉金九空月犬見五口校左三山四子糸字耳七車手十出女小上森人水正生青石赤先千川早草足村大男竹中虫町天田土二日入年白八百文本名木目夕立力林六',
'引羽雲園遠黄何夏家科歌画会回海絵外角楽活間丸岩顔帰汽記弓牛魚京強教近兄形計元原言古戸午後語交光公工広考行高合国黒今才細作算姉市思止紙寺時自室社弱首秋週春書少場色食心新親図数星晴声西切雪線船前組走多太体台谷知地池茶昼朝長鳥直通弟店点電冬刀東当答頭同道読内南肉馬買売麦半番父風分聞米歩母方北妹毎万明鳴毛門夜野矢友曜用来理里話',
'悪安暗委意医育員飲院運泳駅横屋温化荷界開階寒感漢館岸期起客宮急球究級去橋業局曲銀区苦具君係軽決血研県庫湖向幸港号根祭坂皿仕使始指死詩歯事持次式実写者主取守酒受州拾終習集住重宿所暑助勝商昭消章乗植深申真神身進世整昔全想相送息速族他打対待代第題炭短談着柱注丁帳調追定庭笛鉄転登都度島投湯等豆動童農波配倍箱畑発反板悲皮美鼻筆氷表病秒品夫負部服福物平返勉放味命面問役薬油有由遊予様洋羊葉陽落流旅両緑礼列練路和',
'愛案以位囲胃衣印栄英塩央億加果課貨芽改械害街各覚完官管観関願喜器希旗機季紀議救求泣給挙漁競共協鏡極訓軍郡型径景芸欠結健建験固候功好康航告差最菜材昨刷察札殺参散産残司史士氏試児治辞失借種周祝順初唱松焼照省笑象賞信臣成清静席積折節説戦浅選然倉巣争側束続卒孫帯隊達単置仲貯兆腸低停底的典伝徒努灯働堂得特毒熱念敗梅博飯費飛必標票不付府副粉兵別変辺便包法望牧末満未脈民無約勇要養浴利陸料良量輪類令例冷歴連労老録',
'圧易移因営永衛液益演往応恩仮価可河過賀解快格確額刊幹慣眼基寄規技義逆久旧居許境興均禁句群経潔件券検険減現限個故護効厚構耕講鉱混査再妻採災際在罪財桜雑賛酸師志支枝資飼似示識質舎謝授修術述準序承招証常情条状織職制勢性政精製税績責接設絶舌銭祖素総像増造則測属損態貸退団断築張提程敵適統導銅徳独任燃能破判版犯比肥非備俵評貧婦富布武復複仏編弁保墓報豊暴貿防務夢迷綿輸余預容率略留領',
'異遺域宇映延沿我灰拡閣革割株巻干看簡危揮机貴疑吸供胸郷勤筋敬系警劇激穴憲権絹厳源呼己誤后孝皇紅鋼降刻穀骨困砂座済裁策冊蚕姿私至視詞誌磁射捨尺若樹収宗就衆従縦縮熟純処署諸除傷将障城蒸針仁垂推寸盛聖誠宣専泉洗染善創奏層操窓装臓蔵存尊宅担探誕暖段値宙忠著庁潮頂賃痛展党糖討届難乳認納脳派俳拝背肺班晩否批秘腹奮並閉陛片補暮宝訪亡忘棒枚幕密盟模訳優郵幼欲翌乱卵覧裏律臨朗論']

# Flatten the per-grade kanji strings into one list of characters / one string.
_l = [kanji for grade in _gakushu_list for kanji in grade]
gakushu_chars = "".join(_l)

# Input grapheme inventory: hiragana, katakana, digits and the school-grade kanji
# (digits are included as input characters in this configuration).
grph_list = []
for x in (hira_chars, kata_chars, num_chars, gakushu_chars):
    grph_list.extend(x)

print(f'len(grph_list):{len(grph_list)}')
print(f'全書記素 grph_list:{"".join(grph_list)}')

# Number of input-layer units
print(f'入力層の素子数 len(grph_list) + len(special_tokens)={len(grph_list) + len(special_tokens)}')

# NTT 日本語の語彙特性 単語頻度データの読み込み

In [None]:
# Load the NTT lexical-properties word-frequency data (psylex71) and count mora frequencies.
ntt_base = os.path.join(HOME, 'study/2017_2009AmanoKondo_NTTKanjiData')
psy71_fname = os.path.join(ntt_base, 'psylex71utf8_.txt')  # data file name
# Read with an explicit encoding and close the file deterministically.
with open(psy71_fname, 'r', encoding='utf-8') as _f:
    # Split each line on single spaces and keep the first six fields only
    # (the remaining fields are the per-year frequencies).
    psylex71raw = [lin.strip().split(' ')[:6] for lin in _f]
print(f'len(psylex71raw):{len(psylex71raw)}')

# Field positions inside one record. Defined here because the loop below needs
# them; index 6 is the mora list appended to each record in a later cell.
psylex_ids = {'_idx':0, '_idx2':1, '_wrd':2, '_yomi':3, '_pos':4, '_frq':5, '_mora':6}

# Characters allowed in a reading: katakana plus the long-vowel mark.
valid_chars = kata_chars + 'ー'
_valid_yomi_chars = set(valid_chars)
mora_dict = OrderedDict()  # mora -> number of occurrences, in order of first appearance

for x in tqdm(psylex71raw[1:]):   # the first line is skipped, as in the original loop
    if len(x) <= psylex_ids['_yomi']:
        continue                  # malformed / short record: no reading field
    _yomi = x[psylex_ids['_yomi']]
    # Count morae only for readings made up entirely of valid characters.
    if all(ch in _valid_yomi_chars for ch in _yomi):
        for m in moraWakachi(_yomi):
            mora_dict[m] = mora_dict.get(m, 0) + 1

print(f'len(mora_dict):{len(mora_dict)}')
mora_list = sorted(mora_dict.keys())

is_graph = False
print(len(mora_dict), mora_dict)
if is_graph:
    # Bar chart of the relative frequency of the most frequent morae.
    N_mora = sum(mora_dict.values())
    mora_count_sorted = sorted(mora_dict.items(), key=operator.itemgetter(1), reverse=True)
    figsize=(24,4)
    topN = min(100, len(mora_count_sorted))  # never ask for more bars than there are morae
    _top = mora_count_sorted[:topN]
    plt.figure(figsize=figsize)
    plt.bar(range(topN), [cnt/N_mora for _, cnt in _top])
    plt.xticks(ticks=range(topN), labels=[m for m, _ in _top])

    plt.title(f'モーラ頻度 (上位:{topN} 語)')
    plt.ylabel('相対頻度')
    plt.show()

In [None]:
maxlen_grph = 2        # only words written with exactly this many graphemes are kept
valid_chars=grph_list  # the grapheme list is the set of characters allowed in a written form
ng_yomi_words = []     # (word, reading) pairs whose reading contains a non-katakana character
dups_idx = []          # records whose ID had already been registered
_psylex71_ = []        # accepted raw records, each extended with its mora list

# One psylex71 record: 0 common ID, 1 own ID, 2 written form, 3 reading,
# 4 part of speech, 5 frequency. Index 6 is the mora list appended below.
psylex_ids = {'_idx':0, '_idx2':1, '_wrd':2, '_yomi':3, '_pos':4, '_frq':5, '_mora':6}
print(f'psylex_ids{psylex_ids}')

# Sets give O(1) membership tests inside the loop.
_n_fields = psylex_ids['_frq'] + 1
_valid_grph = set(valid_chars)
_kata = set(kata_chars)

Psylex71 = OrderedDict()
for lin in psylex71raw:
    if len(lin) < _n_fields:
        continue  # short / malformed record: indexing it would raise IndexError
    wrd = lin[psylex_ids['_wrd']]
    idx = lin[psylex_ids['_idx']]
    yomi = lin[psylex_ids['_yomi']]
    pos = lin[psylex_ids['_pos']]
    frq = lin[psylex_ids['_frq']]

    # Only words of exactly maxlen_grph characters are processed.
    if len(wrd) != maxlen_grph:
        continue

    # Readings containing anything other than katakana are set aside.
    if not all(p in _kata for p in yomi):
        ng_yomi_words.append((wrd,yomi))
        continue

    # The written form must consist of valid graphemes only.
    if not all(ch in _valid_grph for ch in wrd):
        continue

    _mora = moraWakachi(yomi)  # split the reading into morae
    if idx in Psylex71:
        # ID already registered: remember the clash. The entry is overwritten
        # below while _psylex71_ keeps both records.
        dups_idx.append((idx, lin, (Psylex71[idx]['単語'], Psylex71[idx]['ヨミ'], _mora)))

    Psylex71[idx] = {'単語': wrd, 'モーラ':_mora, 'ヨミ': yomi, '品詞': pos,'頻度': frq}
    _psylex71_.append(lin + [_mora])


# Longest reading, measured in morae (0 when nothing was accepted).
maxlen_phon = max((len(a[psylex_ids['_mora']]) for a in _psylex71_), default=0)

# Report the results
print(f'読み込んだ psylex71.txt の単語数 len(psylex71raw):{len(psylex71raw)}')
print(f'Psylex71 の総単語数 len(_psylex71_):{len(_psylex71_)}')
print(f'作成したデータベース辞書の項目数 len(Psylex71):{len(Psylex71)}')
print(f'ヨミの最長文字数 maxlen_phon:{maxlen_phon}')
print(f'len(mora_list):{len(mora_list)}')
print(f'Psylex71 におけるカタカナ以外のヨミのある単語数 len(ng_yomi_words):{len(ng_yomi_words)}')
print(f'Psylex71 における ID 番号の重複数 len(dups_idx):{len(dups_idx)}')

# `Psylex71_Dataset` (モデルに Psylex71 を学習させるためのクラス) の作成

In [None]:
import torch
class Psylex71_Dataset(torch.utils.data.Dataset):
    '''PyTorch dataset that serves Psylex71 words to a neural network model.

    Each item is a pair of id tensors: the graphemes of the written form
    (indices into ``grph_list``) and the morae of the reading (indices into
    ``special_tokens + phon_list``), the latter wrapped in <SOW>/<EOW> and
    right-padded with <PAD> up to ``maxlen_phon``.
    '''

    def __init__(self,
                 dic=Psylex71,
                 grph_list=grph_list,
                 phon_list=mora_list,
                 special_tokens=special_tokens,
                 maxlen_phon=maxlen_phon +2, # +2 for the two special tokens <SOW> and <EOW>
                ):
        '''
        Args:
            dic: mapping whose values are dicts with the keys '単語' (written form)
                and 'モーラ' (list of morae).
            grph_list: input grapheme inventory.
            phon_list: output mora inventory.
            special_tokens: tokens placed in front of ``phon_list`` in the target vocabulary.
            maxlen_phon: target sequence length including <SOW> and <EOW>.
        '''
        super().__init__()
        self.dic = dic
        self.special_tokens = special_tokens
        self.maxlen_phon = maxlen_phon
        self.grph_list = grph_list
        self.phon_list = phon_list
        self.input_cands = grph_list
        # Build the target vocabulary from the phon_list argument, not from the global mora_list.
        self.target_cands = list(special_tokens) + list(phon_list)
        self.inputs = [v['単語'] for v in dic.values()]
        self.targets = [v['モーラ'] for v in dic.values()]

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        '''Return ``(input_ids, target_ids)`` as LongTensors for item ``idx``.

        Raises:
            ValueError: if a grapheme or mora is missing from its vocabulary.
        '''
        inp, tgt = self.inputs[idx], self.targets[idx]

        # No special tokens are added on the input side.
        inp_ids = [self.input_cands.index(x) for x in inp]

        # The target (teacher) sequence is wrapped in <SOW> ... <EOW>.
        tgt_ids = [self.target_cands.index('<SOW>')]
        tgt_ids += [self.target_cands.index(x) for x in tgt]
        tgt_ids.append(self.target_cands.index('<EOW>'))

        # Right-pad up to the fixed target length.
        n_pad = self.maxlen_phon - len(tgt_ids)
        if n_pad > 0:
            tgt_ids += [self.target_cands.index('<PAD>')] * n_pad

        return (torch.tensor(inp_ids, dtype=torch.long),
                torch.tensor(tgt_ids, dtype=torch.long))

    def getitem(self, idx):
        '''Return the raw ``(written form, list of morae)`` pair for item ``idx``.'''
        wrd = self.inputs[idx]
        phn = self.targets[idx]
        return wrd, phn

    def ids2argmax(self, ids):
        '''Return the argmax of every tensor in ``ids`` as an int32 numpy array.'''
        # int() works for tensors on any device, unlike .numpy().
        return np.array([int(torch.argmax(idx)) for idx in ids], dtype=np.int32)

    def ids2tgt(self, ids):
        '''Map target ids to tokens (no truncation at <EOW>).'''
        # target_cands already starts with the special tokens, so an id indexes
        # it directly; subtracting len(special_tokens) decoded the wrong token.
        return [self.target_cands[int(idx)] for idx in ids]

    def ids2inp(self, ids):
        '''Map input ids to graphemes.'''
        return [self.input_cands[int(idx)] for idx in ids]

    def target_ids2target(self, ids:list):
        '''Map target ids to tokens, stopping right after the first <EOW>.'''
        eow_id = self.target_cands.index('<EOW>')
        ret = []
        for idx in ids:
            if idx == eow_id:
                return ret+['<EOW>']
            ret.append(self.target_cands[idx])
        return ret


psylex71_ds = Psylex71_Dataset()

# Preview the first few items: decoded graphemes / morae next to their ids.
_ds = psylex71_ds
for N in range(min(15, len(_ds))):
    inp, tgt = _ds[N]
    print(f'_ds.ids2inp(inp):{_ds.ids2inp(inp)}',
          f'{inp.numpy()}',
          f'_ds.target_ids2target(tgt):{_ds.target_ids2target(tgt)}',
          f'{tgt.numpy()}')


# NOTE(review): only 5% of the data is used for training; an earlier 0.7 was
# immediately overwritten and never took effect - confirm the intended ratio.
train_size = int(len(_ds) * 0.05)
valid_size = len(_ds) - train_size
train_ds, valid_ds = torch.utils.data.random_split(dataset=_ds, lengths=(train_size, valid_size), generator=torch.Generator().manual_seed(seed))

batch_size = 64
#batch_size = 1024
# train_dl / valid_dl are created further down in this cell, together with _collate_fn.

def _collate_fn(batch):
    inps, tgts = list(zip(*batch))
    inps = list(inps)
    tgts = list(tgts)
    return inps, tgts

# batch_size = 4
# DataLoaders that return each batch as two Python lists (see _collate_fn).
train_dl = torch.utils.data.DataLoader(
    dataset=train_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=_collate_fn)

valid_dl = torch.utils.data.DataLoader(
    dataset=valid_ds,
    batch_size=batch_size,
    shuffle=False,      # no reason to shuffle validation data
    num_workers=0,
    collate_fn=_collate_fn)

print(f'train_ds.__len__():{train_ds.__len__()}')

# Preview the first few training items (train_ds wraps the full dataset as .dataset).
_ds = train_ds
for N in range(min(15, len(_ds))):
    inp, tgt = _ds[N]
    print(f'_ds.dataset.ids2inp(inp):{_ds.dataset.ids2inp(inp)}',
          f'{inp.numpy()}',
          f'_ds.dataset.target_ids2target(tgt):{_ds.dataset.target_ids2target(tgt)}',
          f'{tgt.numpy()}')


# TLA モデルの定義

In [None]:
class TLA(torch.nn.Module):
    """TLA model: one embedding per input position, a shared hidden layer and
    one output layer per output position.

    ``forward`` maps a (batch, inp_len) tensor of grapheme ids to a
    (batch, out_len, out_size) tensor of sigmoid activations.
    """

    def __init__(self,
                 inp_size= (len(grph_list)+len(special_tokens)),
                 inp_len=maxlen_grph,
                 out_size=len(mora_list)+len(special_tokens),
                 # +2 because <SOW> and <EOW> are added around every target sequence
                 out_len=maxlen_phon+2,
                 hid_size=128,
                 device=device,
                ):
        super().__init__()
        self.inp_size=inp_size
        self.inp_len=inp_len
        self.out_size=out_size
        self.out_len=out_len
        self.hid_size=hid_size

        # ModuleList (not a plain list) so that the sub-layers are registered:
        # otherwise parameters(), state_dict() and .to() do not see them and
        # an optimizer built from model.parameters() never trains them.
        # NOTE(review): padding_idx=0 keeps the embedding of id 0 at zero; confirm
        # that input id 0 is not a real grapheme in the dataset's input vocabulary.
        self.emb_layers = torch.nn.ModuleList(
            [torch.nn.Embedding(num_embeddings=inp_size, embedding_dim=hid_size, padding_idx=0)
             for _ in range(inp_len)])

        self.hid_layer = torch.nn.Linear(in_features=hid_size * inp_len, out_features=hid_size)

        self.out_layers = torch.nn.ModuleList(
            [torch.nn.Linear(in_features=hid_size, out_features=out_size)
             for _ in range(out_len)])

        self.to(device)

    def forward(self, inp):
        X = inp
        n_grph = X.size(1)

        # Embed each input position with its own table and concatenate the results.
        embs = [self.emb_layers[i](X[:, i]) for i in range(n_grph)]
        X = torch.cat(embs, dim=1)
        X = self.hid_layer(X)         # project to the hidden dimension

        # One sigmoid-activated read-out per output position.
        # NOTE(review): torch.nn.CrossEntropyLoss expects unnormalised logits;
        # confirm that feeding it these sigmoid outputs is intended.
        outputs = [torch.sigmoid(layer(X)) for layer in self.out_layers]

        # Stack along a new position axis -> (batch, out_len, out_size). This keeps
        # the result on the model's device and avoids the reshape that mixed up
        # batch and position entries.
        return torch.stack(outputs, dim=1)

tla = TLA(device=device)
tla.eval()

In [None]:
class vanilla_TLA(torch.nn.Module):
    """Plain feed-forward variant: one-hot input -> tanh hidden layer -> sigmoid output.

    ``forward`` maps a (batch, inp_len) tensor of ids to a
    (batch, out_len, out_size) tensor of sigmoid activations.
    """

    def __init__(self,
                 inp_size= (len(grph_list)+len(special_tokens)),
                 inp_len=maxlen_grph,
                 out_size=len(mora_list)+len(special_tokens),
                 out_len=maxlen_phon+2,
                 hid_size=128,
                 device=device,
                ):
        super().__init__()
        self.inp_size, self.inp_len = inp_size, inp_len
        self.out_size, self.out_len = out_size, out_len
        self.hid_size = hid_size

        # Input side: all one-hot positions, flattened, feed one linear layer.
        emb = torch.nn.Linear(in_features=inp_size * inp_len, out_features=hid_size)
        self.emb_layer = emb.to(device)
        self.sigmoid = torch.nn.Sigmoid()
        self.tanh = torch.nn.Tanh()
        self.relu = torch.nn.ReLU()
        # Output side: one linear layer produces every position at once.
        out = torch.nn.Linear(in_features=hid_size, out_features=out_size * out_len)
        self.out_layer = out.to(device)

    def forward(self, inp):
        n_items = inp.size(0)
        onehot = torch.nn.functional.one_hot(inp, num_classes=self.inp_size)
        flat = onehot.reshape(n_items, -1).float()
        hidden = self.tanh(self.emb_layer(flat))
        activation = self.sigmoid(self.out_layer(hidden))
        return activation.reshape(activation.size(0), self.out_len, self.out_size)

vanilla_tla = vanilla_TLA(device=device)
vanilla_tla.eval()

# 定義したモデルの試用

In [None]:
# Fetch one item by integer index and run both (untrained) models on it.
_ds = train_ds
idx = 0

# The dataset returns the input signal inp and the teacher signal tch.
inp, tch = _ds[idx]
print(f'idx:{idx}:', f'inp:{inp}', f'tch:{tch}')

# Both signals are token ids; decode them into readable tokens for display.
print(f'_ds.dataset.ids2inp({inp}):{_ds.dataset.ids2inp(inp)}')
print(f'_ds.dataset.target_ids2target({tch}):{_ds.dataset.target_ids2target(tch)}')

# Add a batch axis of size 1 and move the input to the model's device.
inp = inp.unsqueeze(0).to(device)

outs = tla(inp)
# Move the predicted ids to the CPU before converting: .numpy() fails on GPU/MPS tensors.
_out_ids = [int(_out.argmax().cpu()) for _out in outs[0]]
print('出力:', _ds.dataset.target_ids2target(_out_ids), end=": ")
print('出力 ids:', _out_ids)

tch = tch.cpu()
_tch_ids = [int(_tch) for _tch in tch]
print('教師:', train_ds.dataset.target_ids2target(_tch_ids), end=": ")
print('教師 ids:', _tch_ids)

vanilla_tla.eval()
outs = vanilla_tla(inp)

_out_ids = [int(_out.argmax().cpu()) for _out in outs[0]]
print('出力:', train_ds.dataset.target_ids2target(_out_ids), end=": ")
print('出力 ids:', _out_ids)

In [None]:
%%time
# One optimisation step on a single mini-batch, using the vanilla model.
tla = vanilla_tla
tla.train()

loss_f = torch.nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(tla.parameters(), lr=1e-3)

# The loader yields two lists of tensors; turn each into one batch tensor on the device.
inps, tchs = next(iter(train_dl))
inps, tchs = (pad_sequence(_seqs, batch_first=True).to(device) for _seqs in (inps, tchs))
outs = tla(inps)

# Accumulate the per-item losses, then back-propagate their sum.
optimizer.zero_grad()
losses = 0.
for _out, _tch in tqdm(zip(outs, tchs)):
    losses = losses + loss_f(_out, _tch)
losses.backward()
optimizer.step()

In [None]:
# Mini-batch version

tla = vanilla_tla
loss_f = torch.nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(tla.parameters(), lr=1e-2)
epochs = 10

for epoch in range(epochs):

    tla.train()
    sum_loss = 0.
    count  = 0      # items whose whole output sequence is correct
    n_items = 0     # items seen in this epoch

    _dl = train_dl
    for inps, tchs in _dl:
        inps = pad_sequence(inps, batch_first=True).to(device)
        tchs = pad_sequence(tchs, batch_first=True).to(device)
        optimizer.zero_grad()
        outs = tla(inps)

        losses = 0.
        for out, tch in zip(outs, tchs):
            loss = loss_f(out, tch)
            losses = losses + loss
            sum_loss += loss.item()
            # An item is correct only if every position matches the teacher.
            count += int(bool((out.argmax(dim=1) == tch).all()))
        n_items += len(tchs)
        losses.backward()
        optimizer.step()

    # Proportion correct is per item; dividing by the number of batches gave values above 1.
    p_correct = count / max(n_items, 1)
    print(f'epoch:{epoch+1:03d}', end=" ")
    print(f'p_correct:{p_correct:7.3f}', end=": ")
    # Summed per-item loss averaged over the number of batches.
    print(f'sum_loss:{sum_loss/_dl.__len__():.3f}')

In [None]:
# Online version: one parameter update per training item.
loss_f = torch.nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(tla.parameters(), lr=0.01)

tla.train()
epochs = 3
for epoch in range(epochs):
    epoch_loss = 0.
    for inp, tch in tqdm(train_ds):
        inp = inp.unsqueeze(0).to(device)   # add a batch axis of size 1
        optimizer.zero_grad()
        out = tla(inp)[0]                   # output for the single item
        # Compute the loss where the output lives instead of copying it to the CPU.
        loss = loss_f(out, tch.to(out.device))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'epoch_loss:{epoch_loss:.3f}')

In [None]:
# Compare the model output with the teacher signal for one random training item.
tla.eval()

# Pick a random integer index and fetch the corresponding item.
idx = np.random.choice(train_ds.__len__())

# The dataset returns the input signal inp and the teacher signal tch.
inp, tch = train_ds.__getitem__(idx)
print(f'idx:{idx}:', f'inp:{inp}', f'tch:{tch}')

# Decode the token ids for display. target_ids2target indexes the target
# vocabulary directly (ids2tgt shifted every id by len(special_tokens)).
_decoder = train_ds.dataset
print(f'train_ds.dataset.ids2inp({inp}):{_decoder.ids2inp(inp)}')
print(f'train_ds.dataset.target_ids2target({tch}):{_decoder.target_ids2target(tch)}')

inp = inp.unsqueeze(0).to(device)   # add a batch axis of size 1

outs = tla(inp)
outs = [out.cpu() for out in outs]
_out_ids = [int(_out.argmax()) for _out in outs[0]]
print('出力:', _decoder.target_ids2target(_out_ids), end=": ")
print('出力 ids:', _out_ids)

tch = tch.cpu()
_tch_ids = [int(_tch) for _tch in tch]
print('教師:', _decoder.target_ids2target(_tch_ids), end=": ")
print('教師 ids:', _tch_ids)

In [None]:
# Count the fully correct items in one training batch.
tla.eval()
inps, tchs = next(iter(train_dl))
inps = pad_sequence(inps, batch_first=True).to(device)
tchs = pad_sequence(tchs, batch_first=True)
outs = tla(inps)

# Bring the predicted ids to the CPU: tchs stays on the CPU, and comparing
# tensors that live on different devices raises an error.
out_ids = [out.argmax(dim=1).cpu() for out in outs]
count  = 0
for tch, out in zip(tchs, out_ids):
    # Correct only if every position of the sequence matches.
    count += int(bool((tch == out).all()))
print(f'count:{count}')
len(outs)

In [None]:
#loss_f(outs[0], tchs[0]) #.size()
#tchs[0].size()
# Display the first teacher sequence of the current batch.
tchs[0]

In [None]:
# Per-item loss for the current batch (`outs` and `tchs`).
# The previous version copied from an undefined name `o`, took size(1) of a
# 1-D `tch`, and used reshape where a transpose was needed.
# NOTE(review): assumes outs is a single (batch, out_len, n_classes) tensor and
# tchs the matching (batch, out_len) id tensor - confirm.
O = outs.detach().cpu()
_T = tchs.cpu()
for j in range(O.size(0)):
    print(j, loss_f(O[j], _T[j]))

In [None]:
# Inspect one batch: decode the (input, target) pairs and print container sizes.
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

inp, tch = next(iter(train_dl))
inp = pad_sequence(inp, batch_first=True).to(device)
tch = pad_sequence(tch, batch_first=True)
tla.eval()
out = tla(inp)

# Decoded pairs for every item of the batch. target_ids2target indexes the
# target vocabulary directly (ids2tgt shifted every id by len(special_tokens)).
_decoder = train_ds.dataset
C = [(_decoder.ids2inp(_inp.cpu().tolist()), _decoder.target_ids2target(_tch.cpu().tolist()))
     for (_inp, _tch) in zip(inp, tch)]

print('out', len(out), type(out[0]), out[0].size())
print('inp', len(inp), type(inp[0]), inp[0].size())
print('tch', len(tch), type(tch[0]), tch[0].size())

In [None]:
# Print the sizes of the model output and the teacher tensor, per batch and per item.
inp, tch = next(iter(train_dl))

inp = pad_sequence(inp, batch_first=True).to(device)
tch = pad_sequence(tch, batch_first=True)

tla.eval()
out = tla(inp)
for _label, _seq in (('out', out), ('tch', tch)):
    print(_label, len(_seq), type(_seq[0]), _seq[0].size())

for j in range(tch.size(0)):
    for _label, _seq in (('out[j]', out), ('tch[j]', tch)):
        _item = _seq[j]
        print(_label, len(_item), type(_item), _item.size())

In [None]:
# Batch size and types of the tensors left over from the previous cell.
print(tch.shape[0], type(tch))
print(out[0].shape, type(out[0]))
print(tch[0])
out[0].shape

In [None]:
# Print one raw batch, its tensor form, and the output/target sizes item by item.
# (A leading train_ds.__getitem__(0) call was dropped: its result was overwritten at once.)
inp, tch = next(iter(train_dl))
print(inp, tch)
inp = pad_sequence(inp, batch_first=True).to(device)
tch = pad_sequence(tch, batch_first=True)
print(inp, tch)
tla.eval()
out = tla(inp)
for j in range(len(out)):
    print(out[j].size(), tch[j], tch[j].size())


In [None]:
# Minimal CrossEntropyLoss demonstration with class-index targets.
loss_f = torch.nn.CrossEntropyLoss()

# Case 1: a single batch of 3 items with 5 classes each.
out = torch.randn(3, 5, requires_grad=True)
tgt = torch.empty(3, dtype=torch.long).random_(5)
loss = loss_f(out, tgt)
loss.backward()

print(f'type(out):{type(out)}', f'out:{out}')
print(f'type(tgt):{type(tgt)}', f'tgt:{tgt}')
print(f'type(loss):{type(loss)}', f'loss:{loss}')

# Case 2: a 3-D output; the loss is accumulated slice by slice along the first axis.
out = torch.randn(2, 3, 5, requires_grad=True)
tgt = torch.empty(2, 3, dtype=torch.long).random_(5)
loss = 0.
for j in range(out.size(0)):
    loss = loss + loss_f(out[j], tgt[j])
    print(f'type(out[j,:,:]):{type(out[j,:,:])}', f'out[j,:,:]:{out[j,:,:]}')
    print(f'type(tgt[j,:]):{type(tgt[j,:])}', f'tgt[j,:]:{tgt[j,:]}')
    print(f'type(loss):{type(loss)}', f'loss:{loss}')

# Variant kept for reference: targets given as class probabilities.
# input = torch.randn(3, 5, requires_grad=True)
# target = torch.randn(3, 5).softmax(dim=1)
# output = loss_f(input, target)
# output.backward()

In [None]:
# Show the input dimensions and the hidden projection of the current model.
print(tla.inp_size, tla.inp_len)
# TLA names its hidden projection `hid_layer`; vanilla_TLA has no such attribute
# and names its input-to-hidden layer `emb_layer`.
_hidden = tla.hid_layer if hasattr(tla, 'hid_layer') else tla.emb_layer
_hidden

In [None]:
from torch.nn.utils.rnn import pad_sequence

# Run the model on the first training batch and report the container types and sizes.
tla.eval()
optimizer = torch.optim.Adam(tla.parameters(), lr=0.01)
idx = np.random.choice(train_ds.__len__())

# The dataset returns the input signal inp and the teacher signal tch.
inp, tch = train_ds.__getitem__(idx)
print(f'idx:{idx}:', f'inp:{inp}', f'tch:{tch}')
# No trailing comma here: with one, `criterion` becomes a 1-tuple instead of a loss module.
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

for inp, tgt in train_dl:
    inp_ids = pad_sequence(inp, batch_first=True).to(device)

    outs = tla(inp_ids)
    print(f'inp_ids.size():{inp_ids.size()}',
          f'type(inp_ids):{type(inp_ids)}',
          f'len(outs):{len(outs)}',
          f'type(outs):{type(outs)}')
    break   # only the first batch is inspected (previously sys.exit())


In [None]:
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

optimizer = optim.Adam(tla.parameters(), lr=0.01)
# No trailing comma here: with one, `criterion` becomes a 1-tuple and cannot be called.
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

EPOCH_NUM = 30
all_losses = []
tla.train()
for epoch in range(1, EPOCH_NUM+1):

    epoch_loss = 0 # loss accumulated over the epoch
    for inp, tgt in train_dl:
        optimizer.zero_grad()  # reset the gradients

        # Convert the lists of sequences into batch tensors.
        inp_ids = pad_sequence(inp, batch_first=True).to(device)
        tgt_ids = pad_sequence(tgt, batch_first=True).to(device)

        out = tla(inp_ids)

        # Start every batch from zero; previously `loss` carried over whatever
        # value an earlier batch (or an earlier cell) had left behind.
        loss = 0.
        for j in range(out.size(1)):
            # One loss term per output position, computed over the whole batch.
            loss = loss + criterion(out[:, j, :], tgt_ids[:, j])

        epoch_loss += loss.item()
        loss.backward()   # back-propagation
        optimizer.step()  # parameter update

    # Report the loss of this epoch.
    print("Epoch %d: %.2f" % (epoch, epoch_loss))
    all_losses.append(epoch_loss)


In [None]:
# import time
import torch.optim as optim

def fit_tla(
    model:torch.nn.modules.module.Module=tla,
    epochs:int=10,
    ds:torch.utils.data.Dataset=train_ds,
    #batch_size=batch_size,
    collate_fn=_collate_fn,
    dataloader:torch.utils.data.dataloader.DataLoader=train_dl,
    optimizer:torch.optim.Optimizer=None,
    criterion:torch.nn.Module=torch.nn.CrossEntropyLoss(ignore_index=-1),
    interval:int=None,
    isPrint:bool=False,
    losses:list=None,
    isDraw:bool=True,):
    """Train ``model`` on the batches of ``dataloader``.

    Args:
        model: network mapping a batch of input ids to per-position class scores.
        epochs: number of passes over ``dataloader``.
        ds: dataset behind ``dataloader``; used for the default ``interval`` and,
            when ``isPrint`` is set, for decoding a sample prediction.
        collate_fn: not used by this function; kept for call compatibility.
        dataloader: yields ``(inputs, targets)`` batches as sequences of id tensors.
        optimizer: optimiser to use; Adam with lr=0.001 when None.
        criterion: loss applied item by item to (output, target).
        interval: print progress every ``interval`` batches; about a quarter of
            an epoch (at least 1) when None.
        isPrint: also print the decoded prediction of the first item of each
            reported batch.
        losses: list to which the per-batch mean losses are appended; a new
            list when None.
        isDraw: plot the loss curve at the end.

    Returns:
        dict with the keys 'Training time', 'losses', 'optimizer' and 'time'.
    """
    # Imported locally so that the function does not depend on earlier cells.
    import datetime
    import time
    from torch.nn.utils.rnn import pad_sequence

    start_time = time.time()   # remember the start time

    if losses is None:
        losses = []

    model.train()

    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    batch_size = dataloader.batch_size or 1
    if interval is None:
        # max(1, ...) avoids a modulo by zero on small datasets.
        interval = max(1, (len(ds) // batch_size) >> 2)

    model_device = next(model.parameters()).device
    decoder = getattr(ds, 'dataset', ds)   # a Subset keeps the full dataset in .dataset

    for epoch in range(epochs):
        for i, (inp, tch) in enumerate(dataloader, start=1):
            # The loader yields sequences of tensors; build batch tensors from them.
            inp = pad_sequence(inp, batch_first=True).to(model_device)
            outs = model(inp)
            tch = pad_sequence(tch, batch_first=True).to(outs.device)

            # Sum of the per-item losses.
            loss = sum(criterion(out, t) for out, t in zip(outs, tch))
            losses.append(loss.item()/len(inp))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i % interval) == 0:
                print(f'epoch:{epoch+1:2d}',
                      f'batch:{i:2d}',
                      f'loss:{loss.item()/len(inp):.5f}')
                if isPrint and hasattr(decoder, 'target_ids2target'):
                    pred_ids = outs[0].argmax(dim=1).tolist()
                    print(f'出力:{"".join(decoder.target_ids2target(pred_ids))}')

    end_time = time.time()
    total_time = end_time - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))

    if isDraw:
        plt.plot(losses)
        plt.title(f'epochs:{epochs}, batch_size:{batch_size}, time elapsed:{total_time_str}')
        plt.show()

    return {'Training time':total_time_str,
            'losses': losses,
            'optimizer': optimizer,
            'time': total_time
           }

fit_tla(epochs=1, model=tla, ds=train_ds,interval=1)


In [None]:
#len(kata_list)+len(special_tokens)
#print(train_dl.batch_size)
help(torch.nn.CrossEntropyLoss)

In [None]:
# Inspect the first validation batch only: print the size of its first target
# and element [0][0] of the one-hot encoding of the target batch.
# NOTE(review): assumes `out` is a single integer tensor -- confirm against the collate function.
for inp, out in valid_dl:
    print(out[0].size())
    n_classes = len(kata_list) + len(special_tokens)
    print(torch.nn.functional.one_hot(out, num_classes=n_classes)[0][0])
    # Stop after one batch. `break` replaces the former sys.exit(), which
    # raised SystemExit just to leave the loop.
    break

In [None]:
help(torch.nn.functional.one_hot)

In [None]:
# Convert the Psylex71 dict built in an earlier cell into a pandas DataFrame
# (one row per dict key). The Excel export below is currently disabled.
import pandas as pd

NTT71 = pd.DataFrame.from_dict(data=Psylex71, orient='index')

# Column labels: word, reading, part of speech, frequency.
NTT71.columns=['単語','ヨミ','品詞','頻度']
#NTT71.to_excel('NTT_Psylex71_学習漢字2文字語.xlsx')
NTT71

In [None]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

class Seq2Seq_wAtt(nn.Module):
    """Encoder-decoder LSTM model with dot-product attention.

    Reference: Bahdanau, Cho, & Bengio (2015) NEURAL MACHINE TRANSLATION BY
    JOINTLY LEARNING TO ALIGN AND TRANSLATE, arXiv:1409.0473

    Args:
        enc_vocab_size: number of encoder (source) token ids.
        dec_vocab_size: number of decoder (target) token ids.
        n_hid: embedding size and LSTM hidden size.
        n_layers: number of stacked LSTM layers in encoder and decoder.
        bidirectional: build both LSTMs as bidirectional.
    """
    def __init__(self,
                 enc_vocab_size:int,
                 dec_vocab_size:int,
                 n_hid:int,
                 n_layers:int=2,
                 bidirectional:bool=False):
        super().__init__()

        # Encoder and decoder LSTMs share the same hyper-parameters.
        lstm_kwargs = dict(input_size=n_hid,
                           hidden_size=n_hid,
                           num_layers=n_layers,
                           batch_first=True,
                           bidirectional=bidirectional)

        # Token id -> vector lookups; id 0 is the padding token.
        self.encoder_emb = nn.Embedding(enc_vocab_size, n_hid, padding_idx=0)
        self.decoder_emb = nn.Embedding(dec_vocab_size, n_hid, padding_idx=0)

        self.encoder = nn.LSTM(**lstm_kwargs)
        self.decoder = nn.LSTM(**lstm_kwargs)

        # Merges [context vector ; decoder output] back down to n_hid units.
        n_directions = 2 if bidirectional else 1
        self.combine_layer = nn.Linear(2 * n_directions * n_hid, n_hid)

        # Final projection onto the decoder vocabulary.
        self.out_layer = nn.Linear(n_hid, dec_vocab_size)

    def forward(self, enc_inp, dec_inp):
        """Return logits of shape (batch, dec_len, dec_vocab_size).

        Both inputs are batch-first tensors of token ids.
        """
        # enc_out: (batch, src_len, n_directions * n_hid)
        enc_out, enc_state = self.encoder(self.encoder_emb(enc_inp))

        # The decoder starts from the encoder's final (h, c) state.
        dec_out, _ = self.decoder(self.decoder_emb(dec_inp), enc_state)

        # Dot-product similarity between every decoder step and every encoder
        # step; scores has shape (batch, dec_len, src_len).
        scores = torch.bmm(dec_out, enc_out.transpose(1, 2))

        # Attention weights: softmax over the source positions.
        # Padded source positions are not masked out here.
        alpha = F.softmax(scores, dim=-1)

        # Context vector: attention-weighted sum of the encoder outputs.
        context = torch.bmm(alpha, enc_out)

        # Concatenate context and decoder output, then mix them.
        merged = self.combine_layer(torch.cat((context, dec_out), dim=2))

        return self.out_layer(merged)


class Seq2Seq_(nn.Module):
    """Encoder-decoder LSTM model (the variant labelled "without attention").

    Reference: Bahdanau, Cho, & Bengio (2015) NEURAL MACHINE TRANSLATION BY
    JOINTLY LEARNING TO ALIGN AND TRANSLATE, arXiv:1409.0473

    NOTE(review): despite the label, forward() below still computes the same
    dot-product attention as Seq2Seq_wAtt -- confirm whether that is intended.

    Args:
        enc_vocab_size: number of encoder (source) token ids.
        dec_vocab_size: number of decoder (target) token ids.
        n_hid: embedding size and LSTM hidden size.
        n_layers: number of stacked LSTM layers in encoder and decoder.
        bidirectional: build both LSTMs as bidirectional.
    """
    def __init__(self,
                 enc_vocab_size:int,
                 dec_vocab_size:int,
                 n_hid:int,
                 n_layers:int=2,
                 bidirectional:bool=False):
        super().__init__()

        def _embedding(vocab_size):
            # Token id -> n_hid-dimensional vector; id 0 is padding.
            return nn.Embedding(num_embeddings=vocab_size,
                                embedding_dim=n_hid,
                                padding_idx=0)

        def _lstm():
            return nn.LSTM(input_size=n_hid,
                           hidden_size=n_hid,
                           num_layers=n_layers,
                           batch_first=True,
                           bidirectional=bidirectional)

        self.encoder_emb = _embedding(enc_vocab_size)
        self.decoder_emb = _embedding(dec_vocab_size)
        self.encoder = _lstm()
        self.decoder = _lstm()

        # Width of one LSTM output vector (doubled when bidirectional).
        width = n_hid * (2 if bidirectional else 1)

        # Layer that merges the context vector with the decoder output.
        self.combine_layer = nn.Linear(2 * width, n_hid)

        # Final output layer over the decoder vocabulary.
        self.out_layer = nn.Linear(n_hid, dec_vocab_size)

    def forward(self, enc_inp, dec_inp):
        """Return logits of shape (batch, dec_len, dec_vocab_size)."""
        enc_out, (hnx, cnx) = self.encoder(self.encoder_emb(enc_inp))

        # The original code carried a commented-out `.zero_()` on this embedding.
        dec_emb = self.decoder_emb(dec_inp)
        dec_out, _ = self.decoder(dec_emb, (hnx, cnx))

        # sim: (batch, dec_len, src_len) -- inner product of each decoder
        # output with each encoder output.
        sim = torch.matmul(dec_out, enc_out.transpose(-2, -1))

        # Attention weights over source positions.
        alpha = torch.softmax(sim, dim=2)

        # Context vector c_t = attention-weighted encoder outputs.
        c_t = torch.matmul(alpha, enc_out)

        fused = self.combine_layer(torch.cat([c_t, dec_out], dim=-1))
        return self.out_layer(fused)


# Sanity check below: build a small model instance and print its structure.
ds = train_ds
n_layers=1
bidirectional=False
n_hid = 128
batch_size = 256
# Vocabulary sizes are taken from the grph_list / phon_list of the dataset wrapped by `ds`.
cdp = Seq2Seq_(enc_vocab_size=len(ds.dataset.grph_list),
#cdp = Seq2Seq_wAtt(enc_vocab_size=len(ds.dataset.grph_list),
                   dec_vocab_size=len(ds.dataset.phon_list),
                   n_layers=n_layers,
                   bidirectional=bidirectional,
                   n_hid=n_hid).to(device)
# .eval() switches the module to evaluation mode and returns it, so this prints the model.
print(cdp.eval())

In [None]:
# #print(dir(cdp.decoder))
# print(torch.tensor([3,3]).zero_())
print(dir(cdp.decoder))
cdp.decoder.eval()

In [None]:
def fit_seq2seq(
    model:torch.nn.modules.module.Module=cdp,
    epochs:int=10,
    ds:Dataset=psylex71_ds,
    batch_size=batch_size,
    collate_fn=_collate_fn,
    #dataloader:torch.utils.data.dataloader.DataLoader=dl_o2p,
    optimizer:torch.optim=None,
    criterion:torch.nn.modules.loss=nn.CrossEntropyLoss(ignore_index=-1),
    interval:int=None,
    isPrint:bool=False,
    losses:list=None,
    isDraw:bool=True,):
    """Train a seq2seq model and return timing and loss information.

    Args:
        model: model called as ``model(enc_inp, dec_inp)``.
        epochs: number of passes over ``ds``.
        ds: dataset yielding (input ids, target ids) pairs.
        batch_size: mini-batch size for the internal DataLoader.
        collate_fn: collate function handed to the DataLoader.
        optimizer: optimizer to use; Adam(lr=0.0001) when None.
        criterion: loss applied per sequence; target padding is -1.
        interval: print progress every ``interval`` batches; defaults to a
            quarter of the batches per epoch (at least 1).
        isPrint: currently unused.
        losses: list to append per-batch losses to; a new list when None.
        isDraw: plot the loss curve when True.

    Returns:
        dict with keys 'Training time' (str), 'losses' (list),
        'optimizer' and 'time' (seconds, float).

    NOTE(review): ``dec_inp`` and ``tch`` are built from the same target
    sequence without a one-step shift, so the decoder is shown token t while
    being trained to predict token t -- confirm this is intended.
    """
    # Local imports so the function does not depend on notebook cell order.
    import datetime
    import time

    start_time = time.time()   # remember the start time

    dataloader = DataLoader(
        dataset=ds,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        collate_fn=collate_fn)

    if losses is None:
        losses = []

    model.train()

    if optimizer is None:
        #optimizer = optim.Adam(model.parameters(), lr=0.001)
        optimizer = optim.Adam(model.parameters(), lr=0.0001)

    if interval is None:
        interval = (len(ds) // batch_size) >> 2
    # With fewer than four batches per epoch the default above is 0, which
    # would make `i % interval` raise ZeroDivisionError.
    interval = max(1, interval)

    for epoch in range(epochs):
        for i, (_inp, _tch) in enumerate(dataloader, start=1):
            enc_inp = pad_sequence(_inp, batch_first=True).to(device)
            dec_inp = pad_sequence(_tch, batch_first=True).to(device)
            # Targets are padded with -1 so that criterion can ignore padding.
            tch = pad_sequence(_tch, batch_first=True, padding_value=-1.0).to(device)
            out = model(enc_inp, dec_inp)

            # Sum of the per-sequence losses over the batch.
            loss = criterion(out[0], tch[0])
            for h in range(1, len(tch)):
                loss += criterion(out[h], tch[h])

            # Normalise by the actual batch length (the last batch may be short).
            mean_loss = loss.item() / len(_inp)
            losses.append(mean_loss)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i % interval) == 0:
                print(f'epoch:{epoch+1:2d}',
                      f'batch:{i:03d}',
                      f'loss:{mean_loss:.5f}')

    end_time = time.time()
    total_time = end_time - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))

    if isDraw:
        plt.plot(losses)
        # n_hid and n_layers are read from module-level variables.
        plt.title(f'epochs:{epochs}, batch_size:{batch_size}, n_hid:{n_hid}, n_layers:{n_layers}, time collapsed:{total_time_str}')
        plt.show()

    return {'Training time':total_time_str,
            'losses': losses,
            'optimizer': optimizer,
            'time': total_time
           }

# _ = fit_seq2seq(epochs=2, model=cdp, ds=train_ds)

In [None]:
def eval_seq2seq(
    model:torch.nn.modules.module.Module=cdp,
    ds:Dataset=train_ds,
    isPrint:bool=False,
    errors:list=None):
    """Evaluate a seq2seq model item by item and collect the errors.

    Args:
        model: model called as ``model(enc_inp, dec_inp)``.
        ds: dataset (or subset wrapper exposing ``.dataset``) of (x, y) id tensors.
        isPrint: print each wrongly predicted item when True.
        errors: list to append error tuples to; a new list when None.

    Returns:
        dict with 'エラー' (list of ``(N, word, decoded output, y_hat)``) and
        '正解率' (accuracy in percent over the items evaluated in this call).

    NOTE(review): the full target ``y`` is fed as decoder input and the output
    is compared with the same positions of ``y``, so the model sees the answer
    at each step -- confirm this is the intended evaluation.
    """
    model.eval()
    if errors is None:
        errors = []

    # When `ds` wraps another dataset, helper methods live on the wrapped one.
    base_ds = getattr(ds, 'dataset', ds)
    # torch.utils.data.Subset maps position N to base index ds.indices[N].
    indices = getattr(ds, 'indices', None)

    n_items = len(ds)
    n_errors = 0

    with torch.no_grad():   # inference only
        for N in tqdm(range(n_items)):
            x, y = ds[N]
            enc_inp, dec_inp = x.unsqueeze(0).to(device), y.unsqueeze(0).to(device)
            # Drop the first and last token before comparing.
            grand_truth = y.detach().numpy()[1:-1]
            y_hat = model(enc_inp, dec_inp).to('cpu')
            y_hat = np.argmax(y_hat.squeeze(0).detach().numpy(), axis=1)[1:-1]

            # Correct only if every position matches (and lengths agree).
            if np.array_equal(y_hat, grand_truth):
                continue

            n_errors += 1
            base_idx = N if indices is None else indices[N]
            wrd = base_ds.getitem(base_idx)
            _out = base_ds.target_ids2target(y_hat)
            errors.append((N, wrd, _out, y_hat))
            if isPrint:
                color = 'red'
                print(colored(f'{N:05d}', color),
                      colored(wrd[0], color='grey'), # , attrs=["bold"]),
                      colored(y_hat,color,attrs=["bold"]),
                      colored(_out, color, attrs=["bold"]),
                      f'<-{base_ds.target_ids2target(grand_truth)}')

    # Error rate over the number of items (not the last index); an empty
    # dataset yields 0 errors.
    cr = n_errors / max(n_items, 1)
    return {'エラー':errors,
            '正解率': (1.-cr) * 100}

#_ = eval_seq2seq(model=cdp, ds=train_ds)

In [None]:
%%time
train_epochs, eval_epochs = [],[]
for _ in range(5):
    train_epochs.append(fit_seq2seq(epochs=30, model=cdp, ds=train_ds, isDraw=False))
    eval_epochs.append(eval_seq2seq(model=cdp, ds=valid_ds))

In [None]:
eval_epochs[-1]

In [None]:
# Forward-pass smoke test on the first dataset item.
cdp.eval()
inp, tch = psylex71_ds[0]
# Inputs must live on the same device as the model parameters.
_dev = next(cdp.parameters()).device
cdp(inp.unsqueeze(0).to(_dev), tch.unsqueeze(0).to(_dev))

# JALEX の処理

In [None]:
len(valid_chars), len(grph_list)
valid_chars = grph_list

In [None]:
import pandas as pd

# Mecab を使ってヨミを得るために MeCab を import する
from ccap.mecab_settings import wakati, yomi #, parser

# Load the JALEX spreadsheet and keep the target words that can be encoded
# with the current grapheme inventory (`valid_chars`).
jalex_base = os.path.join(HOME, 'study/2025_2014jalex')
jalex_xls_fname = 'JALEX.xlsx'
jalex_fname = os.path.join(jalex_base, jalex_xls_fname)
DF = pd.read_excel(jalex_fname)
jalex_words = DF['目標語']

df_dict = DF.to_dict(orient='index')

jalex_cands = []
maxlen_grph = 8
_valid_set = set(valid_chars)   # O(1) membership tests
for idx, v in df_dict.items():
    wrd = v['目標語']

    # Keep only non-empty words of at most maxlen_grph characters ...
    if not wrd or len(wrd) > maxlen_grph:
        continue
    # ... in which EVERY character is known (previously only the last
    # character decided the outcome).
    if not all(ch in _valid_set for ch in wrd):
        continue

    _yomi = yomi(wrd).strip()
    morae = moraWakachi(_yomi)
    for m in morae:
        if m not in mora_dict:
            # Unknown mora: report it and stop, as before.
            print(f'モーラ:{m} not in dict')
            sys.exit()
        mora_dict[m] += 1

    # `v` is the same dict object as df_dict[idx].
    v['モーラ'] = morae
    # Append once (the row used to be appended twice).
    jalex_cands.append(v)


In [None]:
print(len(mora_dict), sorted(mora_dict.keys()))

In [None]:
#moraWakachi(yomi('鼻血').strip())
#print(valid_chars)
print(list(df_dict.keys())[:10])
df_dict

In [None]:
# Obtain the reading (yomi) of every candidate word via MeCab.
_yomis = [yomi(cand['目標語']).strip() for cand in jalex_cands]
print(len(_yomis))

# Attach the readings and reorder the columns.
Jalex_df['ヨミ'] = _yomis
Jalex_df.columns
cols = ['目標語', 'ヨミ', '項目', '試行数', '平均反応時間 ミリ秒', '反応時間の標準偏差', '反応時間の標準誤差', '正解率', '英訳語', 'ONS 書記素隣接語数', 'PNS 音韻隣接語数', 'OLD20']
Jalex_df = Jalex_df[cols]

# Write out as an Excel file (disabled).
#Jalex_df.to_excel('2025_0610Jalex学習漢字2文字語.xlsx')

In [None]:
# Encode every NTT71 word as (orthography, orthography ids, reading, reading ids).
special_tokens = ['<PAD>', '<EOW>', '<SOW>', '<UNK>']
n_stokens = len(special_tokens)
maxlen_grph = 4
phon_maxlen = 10

_PAD = special_tokens.index('<PAD>')
_SOW = special_tokens.index('<SOW>')
_EOW = special_tokens.index('<EOW>')

NTT71_data = []
# The NTT71 columns are set to ['単語','ヨミ','品詞','頻度'] where the frame is
# built, so the word column is '単語' (selecting '語' raises KeyError).
for v in NTT71[['単語','ヨミ']].to_dict(orient='index').values():
    orth, phon = v['単語'], v['ヨミ']
    # Character ids are shifted past the special tokens and wrapped in <SOW> ... <EOW>.
    #orth_idx = [gakushu_chars.index(o)+n_stokens for o in orth]
    orth_idx = [_SOW] + [valid_chars.index(o)+n_stokens for o in orth] + [_EOW]
    phon_idx = [_SOW] + [kata_chars.index(p)+n_stokens for p in phon] + [_EOW]
    # Right-pad the reading with <PAD> up to phon_maxlen (no-op if already longer).
    phon_idx += [_PAD] * (phon_maxlen - len(phon_idx))
    NTT71_data.append((orth, orth_idx, phon, phon_idx))
len(NTT71_data)

In [None]:
# Collect words whose NTT Psylex71 reading differs from the MeCab reading.
inconsist = []
for orth, _orth_idx, phon, _phon_idx in NTT71_data:
    mecab_yomi = yomi(orth).strip()
    if phon != mecab_yomi:
        inconsist.append((orth, phon, mecab_yomi))
print(f'ヨミが NTTPsylex71 と Mecab で不一致な語数 len(inconsist):{len(inconsist)}')

print(NTT71_data[0])
# NOTE(review): `maxlen_orth` is not defined in the visible cells (only
# `maxlen_grph` is) -- confirm it exists before running.
n_symbols = len(valid_chars) + len(special_tokens)
print(f'入力素子数:{n_symbols * maxlen_orth}')
print(f'(len(kata_chars)+len(special_tokens)) * phon_maxlen:{(len(kata_chars)+len(special_tokens)) * phon_maxlen}')


In [None]:
print(gakushu_chars[:10])

In [None]:
# Two-character JALEX words whose characters are both in `char_list`.
# (The former `char_list = jchar_list` was immediately overwritten and is dropped.)
char_list = gakushu_chars
_char_set = set(char_list)   # O(1) membership instead of scanning the list
target_words = [wrd for wrd in jalex_words
                if len(wrd) == 2 and wrd[0] in _char_set and wrd[1] in _char_set]
print(f'len(target_words):{len(target_words)}')

# Map each of the first 100 words to (word, MeCab reading).
jalex_dict = {wrd: (wrd, yomi(wrd).strip()) for wrd in target_words[:100]}
print(jalex_dict)

In [None]:
# 日本語文字
import IPython
isColab = 'colab' in str(IPython.get_ipython())

import numpy as np
import os

alphabet_upper_chars = 'ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ'
alphabet_lower_chars = 'ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ'
num_chars = '０１２３４５６７８９'
hira_chars = 'ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをん'
kata_chars = 'ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ'

# Character inventory: four special tokens followed by hiragana, katakana
# and the gakushu characters, in that order.
jchars = ['<PAD>', '<EOW>', '<SOW>', '<UNK>']

# Iterate the three sources themselves. The former inner loop iterated over
# an unrelated leftover variable `x` instead of these characters.
for _chars in (hira_chars, kata_chars, gakushu_chars):
    jchars.extend(_chars)
print(f'len(jchars):{len(jchars)}')


def make_psylex71_data(psylex71_fname:str=None):
    """Read two-character words from the NTT Psylex71 database.

    Source: NTT Japanese lexical properties (Amano & Kondo, 1999, Sanseido),
    in the UTF-8 converted text file ``psylex71utf8.txt``.

    Args:
        psylex71_fname: path of the data file. When None, the default
            location (depending on ``isColab``) is used.

    Returns:
        A tuple ``(cands, two_char_rows, _dict)``:
        * cands: rows ``[ID, word, POS, frequency, reading]`` of the
          two-character words whose characters are both in ``jchars``.
        * two_char_rows: the rows of all two-character words.
        * _dict: dict keyed by ID with values
          ``{'wrd', 'POS', 'Frq', 'yomi'}`` for the entries in ``cands``.
    """
    if psylex71_fname is None:
        if isColab:
            # NOTE(review): Colab's default working directory is '/content' -- confirm this path.
            ntt_base = '/contents'
        else:
            HOME = os.environ["HOME"]
            ntt_base = os.path.join(HOME, 'study/2017_2009AmanoKondo_NTTKanjiData')
        psylex71_fname = os.path.join(ntt_base, 'psylex71utf8.txt')

    # An explicitly given file name is opened the same way as the default one
    # (the old branch referenced an undefined name).
    with open(psylex71_fname, 'rt', encoding='utf-8') as f:
        psylex71raw = f.readlines()

    known_chars = set(jchars)   # O(1) membership tests

    cands  = []
    two_char_rows = []
    _dict = {}
    for raw in psylex71raw:
        # First six space-separated fields of the line.
        fields = raw.strip().split(' ')[:6]
        if fields == ['']:
            continue   # skip blank lines
        # 0:ID, 2:surface form, 4:POS, 5:frequency, 3:reading
        row = [int(fields[0]), fields[2], fields[4], int(fields[5]), fields[3]]

        wrd = row[1]
        if len(wrd) != 2:
            continue
        two_char_rows.append(row)

        # Words containing the separator '・' drop below two characters here.
        wrd = wrd.replace('・', '')
        if len(wrd) == 2 and wrd[0] in known_chars and wrd[1] in known_chars:
            cands.append(row)
            _dict[row[0]] = {'wrd': wrd, 'POS': row[2], 'Frq': int(row[3]), 'yomi': row[4]}

    return cands, two_char_rows, _dict

#psylex71_freq, psylex71_dict, all_words, psylex71_symbols, psylex71_char_freq = make_psylex71_data()
#print(psylex71_dict)
#print(all_words)

# Unpack (candidate rows, all two-character rows, id-keyed dict).
# NOTE(review): the first result is bound to `psylex71_camds`, but the next
# line prints `psylex71_data`, which this cell does not define -- confirm
# which name is intended.
psylex71_camds, X, dic = make_psylex71_data()
print(psylex71_data[:10])

In [None]:
print(len(dic))
print(len(X)) #X[-10:]
print(len(psylex71_data))
print(X[:10])

In [None]:
# Plaut の PMSP データを読み込む
import IPython
isColab = True if 'google.colab' in str(IPython.get_ipython()) else False

import numpy as np
import os
import requests

HOME = os.environ['HOME']

pmsp_url='https://www.cnbc.cmu.edu/~plaut/xerion/PMSPdata.txt'
# The data file is tab-separated with one item per line; its header line is
#orth\tphon\ttype\tSim 1\t\tSim 2 (raw)\tSim 2 (sqrt)\tSim 3 (RT)\n',

if isColab:
    xerion_dir = ''
else:
    xerion_dir = 'study/2022plaut_homepage/xerion'
pmsp_fname = 'PMSPdata.txt'
fname = os.path.join(HOME, xerion_dir, pmsp_fname)

# Download the file when it is not available locally.
if not os.path.exists(fname):
    r = requests.get(pmsp_url)
    # Fail before creating the file, so a failed download does not leave an
    # empty file that later runs would treat as valid.
    r.raise_for_status()
    print('Downloading {0} - {1} bytes'.format(pmsp_fname, len(r.content)))
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    # r.content is bytes, so the file must be opened in binary mode.
    with open(fname, 'wb') as f:
        f.write(r.content)

with open(fname, 'r') as f:
    a = f.readlines()

# Parse the rows; lines that do not have exactly 7 fields (e.g. the header)
# are printed and skipped.
Z = {}
for i,l in enumerate(a):
    x = l.strip().split('\t')
    if len(x) != 7:
        print(x)
        continue
    Z[i] = {'orth':x[0],
            'phon':x[1],
            'type':x[2],
            'Sim1':np.float32(x[3]),
            'Sim2_raw':np.float32(x[4]),
            'Sim2_sqrt':np.float32(x[5]),
            'Sim3_RT':np.float32(x[6]),
           }

orth_list = [v['orth'] for v in Z.values()]
phon_list = [v['phon'] for v in Z.values()]

# Symbol tables: special tokens take indices 0-3 and carry no 'cnt' entry.
Orth, Phon = {},{}
for _idx, _tok in enumerate(('<PAD>', '<UNK>', '<SOW>', '<EOW>')):
    Orth[_tok] = {'idx': _idx}
    Phon[_tok] = {'idx': _idx}

# Assign an index to every new symbol and count occurrences; also build the
# word-level orth <-> phon mappings.
ort2phn, phn2ort = {}, {}
for o, p in zip(orth_list, phon_list):
    for _o in o:
        if _o not in Orth:
            Orth[_o] = {'idx': len(Orth), 'cnt':1}
        else:
            Orth[_o]['cnt'] += 1
    for _p in p:
        if _p not in Phon:
            Phon[_p] = {'idx': len(Phon), 'cnt':1}
        else:
            Phon[_p]['cnt'] += 1
    ort2phn[o] = p
    phn2ort[p] = o

print(f'len(Orth):{len(Orth)}, Orth: {Orth}')
print(f'len(Phon):{len(Phon)}, Phon: {Phon}')
#print(sorted(set(Orth.keys())))
#print(sorted(set(Phon.keys())))
ort2idx = {k:v['idx'] for k, v in Orth.items()}
# NOTE(review): .values() yields the integer indices themselves, not the
# symbols; an index -> symbol table would be list(ort2idx.keys()). Left as is
# because other cells may rely on the current contents.
#idx2ort = ort2idx.keys() # {v:k for k, v in ort2idx.items()}
idx2ort = list(ort2idx.values()) # {v:k for k, v in ort2idx.items()}
phn2idx = {k:v['idx'] for k, v in Phon.items()}
idx2phn = list(phn2idx.values()) # {k:v['idx'] for k, v in Phon.items()}
#idx2ort = {v:k for k, v in phn2idx.items()}
print(f'len(ort2phn): {len(ort2phn)}')
#print(f'ort2phn: {ort2phn}')
print(f'len(phn2ort): {len(phn2ort)}')
#print(f'phn2ort: {phn2ort}')
print(f'ort2idx: {ort2idx}')
print(f'idx2ort: {idx2ort}')
print(f'phn2idx: {phn2idx}')
# This line used to print idx2ort a second time.
print(f'idx2phn: {idx2phn}')

In [None]:
# D = Orth
# count = {}
# for k, v in D.items():
#     if 'cnt' in list(v.keys()):
#         count[k] = v['cnt']
# count_sorted = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
# plt.figure(figsize=(14,4))
# N = np.array([x[1] for x in count.items()]).sum()
# plt.bar(range(len(count_sorted)), [x[1]/N for x in count_sorted])
# plt.xticks(ticks=range(len(count_sorted)), labels=[c[0] for c in count_sorted])
# plt.title('文字頻度')
# plt.grid()
# #plt.savefig('2023_1113chihaya_charfreq.pdf')
# plt.show()

# Bar chart of relative symbol frequencies, once for phonemes and once for letters.
for D, _title in [(Phon, 'Phon'), (Orth, 'Orth')]:
    # Keep symbols that carry a count; the '/' delimiter is excluded.
    count = {k: v['cnt'] for k, v in D.items() if k != '/' and 'cnt' in v}
    count_sorted = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
    N = np.array(list(count.values())).sum()

    positions = range(len(count_sorted))
    symbols = [sym for sym, _ in count_sorted]
    rel_freq = [cnt/N for _, cnt in count_sorted]

    plt.figure(figsize=(14,4))
    plt.bar(positions, rel_freq)
    plt.xticks(ticks=positions, labels=symbols)
    plt.title(f'{_title} 文字頻度')
    plt.grid()
    #plt.savefig('2023_1113chihaya_charfreq.pdf')
    plt.show()



In [None]:
Phon

In [None]:
def pmsp_ort2idx(word:str,
                 ort2idx:dict=ort2idx):
    """Return the list of indices for the characters of ``word``.

    Characters missing from ``ort2idx`` are mapped to the '<UNK>' index.
    """
    ids = []
    for ch in word:
        key = ch if ch in ort2idx else '<UNK>'
        ids.append(ort2idx[key])
    return ids


def pmsp_idx2ort(ids:list,
                idx2ort:list=idx2ort):
    """Look up each index of ``ids`` in the ``idx2ort`` table.

    ``idx2ort`` is used as an index -> entry table, so entry ``idx2ort[i]`` is
    returned for index ``i``. The previous ``idx2ort.index(i)`` searched for
    the value ``i`` instead, which fails for a table of symbols. For a table
    holding 0..n-1 in order both forms give the same result.
    """
    return [idx2ort[idx] for idx in ids]


# Show the encoding round trip and the pronunciation for the first three words.
for wrd in orth_list[:3]:
    print(wrd, end=": ") #, ort2idx['<SOW>'], end=" ")
    ids = pmsp_ort2idx(wrd)
    print(ids)
    print(pmsp_idx2ort(ids))
    print(ort2phn[wrd])
['grph', 'phon', 'type', 'Sim 1', '', 'Sim 2 (raw)', 'Sim 2 (sqrt)', 'Sim 3 (RT)']
ace: [4, 5, 6]
[4, 5, 6]
/As/
ache: [4, 5, 7, 6]
[4, 5, 7, 6]
/Ak/
act: [4, 5, 8]
[4, 5, 8]
/@kt/
#print(ort2idx)
#print(idx2ort)
#[1,2,3].index(2)
idx2ort
['<PAD>',
 '<UNK>',
 '<SOW>',
 '<EOW>',
 'a',
 'c',
 'e',
 'h',
 't',
 'd',
 'f',
 'g',
 'i',
 'l',
 'm',
 'r',
 's',
 'p',
 'n',
 'k',
 'u',
 'w',
 'x',
 'b',
 '*',
 'y',
 'z',
 'o',
 'v',
 'j',
 'q']
# coding: utf-8
#import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx

#import data
#import model

# parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model')
# parser.add_argument('--data', type=str, default='./data/wikitext-2',
#                     help='location of the data corpus')
# parser.add_argument('--model', type=str, default='LSTM',
#                     help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)')
# parser.add_argument('--emsize', type=int, default=200,
#                     help='size of word embeddings')
# parser.add_argument('--nhid', type=int, default=200,
#                     help='number of hidden units per layer')
# parser.add_argument('--nlayers', type=int, default=2,
#                     help='number of layers')
# parser.add_argument('--lr', type=float, default=20,
#                     help='initial learning rate')
# parser.add_argument('--clip', type=float, default=0.25,
#                     help='gradient clipping')
# parser.add_argument('--epochs', type=int, default=40,
#                     help='upper epoch limit')
# parser.add_argument('--batch_size', type=int, default=20, metavar='N',
#                     help='batch size')
# parser.add_argument('--bptt', type=int, default=35,
#                     help='sequence length')
# parser.add_argument('--dropout', type=float, default=0.2,
#                     help='dropout applied to layers (0 = no dropout)')
# parser.add_argument('--tied', action='store_true',
#                     help='tie the word embedding and softmax weights')
# parser.add_argument('--seed', type=int, default=1111,
#                     help='random seed')
# parser.add_argument('--cuda', action='store_true',
#                     help='use CUDA')
# parser.add_argument('--log-interval', type=int, default=200, metavar='N',
#                     help='report interval')
# parser.add_argument('--save', type=str, default='model.pt',
#                     help='path to save the final model')
# parser.add_argument('--onnx-export', type=str, default='',
#                     help='path to export the final model in onnx format')

# parser.add_argument('--nhead', type=int, default=2,
#                     help='the number of heads in the encoder/decoder of the transformer model')
# parser.add_argument('--dry-run', action='store_true',
#                     help='verify the code and the model')

# args = parser.parse_args()

class _parser():
    def __init__(self):

        self.data = '/Users/_asakawa/study/2020pytorch_examples.git/word_language_model/data/wikitext-2/'
        self.model = 'LSTM'
        self.emsize = 200
        self.nhid = 200
        self.nlayers = 2
        self.lr = 20
        self.clip = 0.25
        self.epochs = 40
        self.batch_size = 20
        self.bptt = 35
        self.dropout = 0.2
        self.tied = True
        self.seed = 42
        self.cuda = False
        self.log_interval=200
        self.save='model.pt'
        self.onnx_export = ''
        self.nhead = 2
        self.dry_run = True

args = _parser()
print(dir(args))
print(args.data)

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'batch_size', 'bptt', 'clip', 'cuda', 'data', 'dropout', 'dry_run', 'emsize', 'epochs', 'log_interval', 'lr', 'model', 'nhead', 'nhid', 'nlayers', 'onnx_export', 'save', 'seed', 'tied']
/Users/_asakawa/study/2020pytorch_examples.git/word_language_model/data/wikitext-2/
# Fix the random seed so runs are reproducible.
torch.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("WARNING: You have a CUDA device, so you should probably run with --cuda")

# This rebinds the module-level `device` according to args.cuda.
_device_name = "cuda" if args.cuda else "cpu"
device = torch.device(_device_name)
###############################################################################
# Load data
###############################################################################

import os
from io import open
import torch

class Dictionary(object):
    """Two-way vocabulary: word -> index (dict) and index -> word (list)."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Register ``word`` if it is new and return its index."""
        idx = self.word2idx.get(word)
        if idx is None:
            # New words get the next free index.
            idx = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = idx
        return idx

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Tokenized train/valid/test splits sharing one Dictionary.

    ``path`` is a directory containing train.txt, valid.txt and test.txt.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        # Splits are processed in this order, which fixes the word indices.
        for split in ('train', 'valid', 'test'):
            setattr(self, split, self.tokenize(os.path.join(path, split + '.txt')))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        add_word = self.dictionary.add_word
        chunks = []
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                # Every line is whitespace-split and terminated with '<eos>'.
                # add_word registers a new word and returns its index, so one
                # pass both builds the vocabulary and encodes the line.
                ids = [add_word(word) for word in line.split() + ['<eos>']]
                chunks.append(torch.tensor(ids).type(torch.int64))
        # One flat 1-D tensor of token ids for the whole file.
        return torch.cat(chunks)

corpus = Corpus(args.data)
#corpus = data.Corpus(args.data)
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    """Arrange a 1-D token tensor into ``bsz`` columns and move it to ``device``.

    Trailing elements that do not fill a complete row are dropped.
    """
    # Number of complete rows available for bsz columns.
    n_rows = data.size(0) // bsz
    # Drop the remainder, lay the data out as bsz contiguous chunks and
    # transpose so that each chunk becomes one column.
    trimmed = data[:n_rows * bsz]
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
!pwd
!gln -s /Users/_asakawa/study/2020pytorch_examples.git/word_language_model/model.py ./pytorch_official_model.py
import pytorch_official_model as model
/Users/_asakawa/study/2022ccap/notebooks
gln: failed to create symbolic link './pytorch_official_model.py': File exists
###############################################################################
# Build the model
###############################################################################

# Vocabulary size determines the model's input/output dimension.
ntokens = len(corpus.dictionary)
# NOTE(review): `model` is rebound here from the imported module
# (`pytorch_official_model as model`) to the network instance, so re-running
# this cell without re-importing presumably fails -- confirm.
if args.model == 'Transformer':
    model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout).to(device)
else:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device)

# Loss used by evaluate() in this file.
criterion = nn.NLLLoss()
###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Detach hidden states from their autograd history.

    ``h`` is either a tensor or an arbitrarily nested iterable of tensors;
    nested containers come back as tuples.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()


# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    """Return (data, target) for the chunk of ``source`` starting at row ``i``.

    The chunk is at most ``args.bptt`` rows long; ``target`` is the same
    window shifted one row ahead and flattened.
    """
    seq_len = min(args.bptt, len(source) - 1 - i)
    stop = i + seq_len
    return source[i:stop], source[i + 1:stop + 1].view(-1)


def evaluate(data_source):
    """Return the average loss of the global ``model`` over ``data_source``.

    Each chunk's loss is weighted by its length and the total is divided by
    ``len(data_source) - 1``.
    """
    # Evaluation mode disables dropout.
    model.eval()
    is_transformer = args.model == 'Transformer'
    ntokens = len(corpus.dictionary)
    if not is_transformer:
        hidden = model.init_hidden(eval_batch_size)

    loss_sum = 0.
    with torch.no_grad():
        for start in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, start)
            if is_transformer:
                output = model(data).view(-1, ntokens)
            else:
                output, hidden = model(data, hidden)
                # Detach the recurrent state between chunks.
                hidden = repackage_hidden(hidden)
            loss_sum += len(data) * criterion(output, targets).item()
    return loss_sum / (len(data_source) - 1)


def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't,

In [None]:
# Collect the characters of kuten_chars that are missing from jchar_list.
Not_list = []
for ch in kuten_chars:
    if not ch in jchar_list:
        Not_list.append(ch)
    else:
        # NOTE(review): this appends a character that is ALREADY in
        # jchar_list (creating a duplicate), while missing characters are not
        # added -- the two branches look swapped or the else is unintended; confirm.
        jchar_list.append(ch)

# Number of kuten characters not found in jchar_list.
print(len(Not_list))
#kuten_chars
#print("".join([ch for ch in joyo_chars]))
print(f'len(jchar_list):{len(jchar_list)}')
print(f'jchar_list:{"".join([ch for ch in jchar_list])}')