<a href="https://colab.research.google.com/github/ShinAsakawa/ShinAsakawa.github.io/blob/master/2023notebooks/2023_1211SNOW_transformer_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%config InlineBackend.figure_format = 'retina'
import torch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

from IPython import get_ipython
isColab =  'google.colab' in str(get_ipython())

if isColab:

    # GPU 情報を表示
    !nvidia-smi -L

    # `import bit` する前に termcolor を downgrade しないと colab ではテキストに色がつかない
    !pip install --upgrade termcolor==1.1
    import termcolor

    # 日本語 transformer をインストールするためには，以下のインストールが必要
    !pip install --upgrade xlrd
    !pip install --upgrade 'fugashi[ipadic]'
    !pip install --upgrade 'fugashi[unidic]'
    !python -m unidic download
    !pip install --upgrade ipadic
    !pip install --upgrade transformers
    !pip install --upgrade termcolor
    !pip install --upgrade jaconv
    !pip install jaconv
    #!git clone https://github.com/ShinAsakawa/RAM.git

import platform
HOSTNAME = platform.node().split('.')[0]

import os
HOME = os.environ['HOME']

import sys
from collections import OrderedDict

try:
    import ipynbname
except ImportError:
    !pip install ipynbname
    import ipynbname
FILEPATH = str(ipynbname.path()).replace(HOME+'/','')

import pwd
USER=pwd.getpwuid(os.geteuid())[0]

from datetime import date
TODAY=date.today()

import torch
TORCH_VERSION = torch.__version__

from termcolor import colored

try:
    import japanize_matplotlib
except ImportError:
    !pip install japanize_matplotlib
    import japanize_matplotlib

from tqdm.notebook import tqdm

color = 'green'
print('日付:',colored(f'{TODAY}', color=color, attrs=['bold']))
print('HOSTNAME:',colored(f'{HOSTNAME}', color=color, attrs=['bold']))
print('ユーザ名:',colored(f'{USER}', color=color, attrs=['bold']))
print('HOME:',colored(f'{HOME}', color=color,attrs=['bold']))
print('ファイル名:',colored(f'{FILEPATH}', color=color, attrs=['bold']))
print('torch.__version__:',colored(f'{TORCH_VERSION}', color=color, attrs=['bold']))

In [None]:
if isColab:
     !git clone https://github.com/ShinAsakawa/RAM.git
!ls -lt RAM

In [3]:
from google.colab import files
uploaded = files.upload()

Saving transformer.py to transformer.py


In [4]:
!mv transformer.py RAM/

In [5]:
import os
import pandas as pd
import requests
from termcolor import colored
import jaconv

# やさしい日本語をダウンロード
SNOWs={'T15': {'url':"https://filedn.com/lit4DCIlHwxfS1gj9zcYuDJ/SNOW/T15-2020.1.7.xlsx"},
       'T23': {'url':"https://filedn.com/lit4DCIlHwxfS1gj9zcYuDJ/SNOW/T23-2020.1.7.xlsx"},
      }
print('エクセルファイル読込', end='...')
for corpus in SNOWs:
    url = SNOWs[corpus]['url']
    excel_fname = corpus + '-2020.1.7.xlsx'

    if not os.path.exists(excel_fname):  # ファイルが存在しない場合ダウンロード
        print(f'url:{url}')
        r = requests.get(url)
        with open(excel_fname, 'wb') as f:
            total_length = int(r.headers.get('content-length'))
            print(f'{excel_fname} をダウンロード中 {total_length} バイト')
            f.write(r.content)

    SNOWs[corpus]['df'] = pd.read_excel(excel_fname)
    SNOWs[corpus]['df'] = SNOWs[corpus]['df'].rename(columns={'#日本語(原文)': 'ja',
                                                              '#やさしい日本語':'easy_ja',
                                                              '#英語(原文)':'en'})
# 2 つのデータをあわせる
_snow = SNOWs['T15']['df']['ja'].tolist() + SNOWs['T23']['df']['ja'].tolist()
#_snow = SNOWs['T15']['df']['easy_ja'].tolist() + SNOWs['T23']['df']['easy_ja'].tolist()
snow = [jaconv.normalize(line, 'NFKC') for line in _snow] # 正規化

エクセルファイル読込...url:https://filedn.com/lit4DCIlHwxfS1gj9zcYuDJ/SNOW/T15-2020.1.7.xlsx
T15-2020.1.7.xlsx をダウンロード中 3634132 バイト
url:https://filedn.com/lit4DCIlHwxfS1gj9zcYuDJ/SNOW/T23-2020.1.7.xlsx
T23-2020.1.7.xlsx をダウンロード中 3641507 バイト


In [None]:
# snow = list(SNOWs['T15']['df']['ja'].to_list() + SNOWs['T23']['df']['ja'].to_list())
# snow[:3]

In [6]:
from transformers import EncoderDecoderModel, BertTokenizer, BertConfig
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

_bertmodel_name = 'bert-base-uncased'
sbertmodel_name = 'sonoisa/sentence-bert-base-ja-mean-tokens-v2'
tknz = BertTokenizer.from_pretrained(sbertmodel_name)

from RAM import Transformer
model = Transformer(src_vocab_size=tknz.vocab_size,
                    tgt_vocab_size=tknz.vocab_size,
                    #model_dim=256,
                    model_dim=384,
                    num_heads=4,
                    num_layers=2,
                    #num_layers=3,
                    max_seq_length=64,
                    dropout=0.,
                    #ff_dim=256,
                    ff_dim=384,
                   ).to(device)

class snow_Dataset(torch.utils.data.Dataset):
    def __init__(self,
                 data_list:list=snow):

        super().__init__()
        self.data_list = data_list

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        sent = self.data_list[idx]
        return sent, sent

snow_ds = snow_Dataset()
ds = snow_ds
batch_size = 1024
batch_size = 128

def _collate_fn(batch):
    inps, tgts = list(zip(*batch))
    inps = list(inps)
    tgts = list(tgts)
    return inps, tgts

snow_dl = DataLoader(
    dataset=snow_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=_collate_fn)

dl = snow_dl
inp, tch = next(iter(dl))
encoded_input = tknz.batch_encode_plus(inp,
                                       padding="longest",
                                       truncation=True,
                                       return_tensors="pt").to(device)
print(encoded_input.input_ids.detach().cpu().numpy()[:3])
print(snow[:3])

tokenizer_config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/258k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


[[    2  2606     9   386   319     7  1147   322 28462   146    11    46
  28492 21198     8     3     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    2   395  1263    14   995    12  2916   322   787  1058 28565     8
      3     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    2 21254 28470 28458  4270 28710 28453  1406    21 16918 28449     8
      3     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]]
['誰が一番に着くか私には分かりません。', '多くの動物が人間によって滅ぼされた。', '私はテニス部員です。']


In [12]:
#help(torch.nn.Embedding)
model.eval()

Transformer(
  (encoder_embedding): Embedding(32000, 384)
  (decoder_embedding): Embedding(32000, 384)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-1): 2 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=384, out_features=384, bias=True)
        (W_k): Linear(in_features=384, out_features=384, bias=True)
        (W_v): Linear(in_features=384, out_features=384, bias=True)
        (W_o): Linear(in_features=384, out_features=384, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=384, out_features=384, bias=True)
        (fc2): Linear(in_features=384, out_features=384, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-1): 2 x DecoderLayer(
 

In [8]:
#state_dict = model.state_dict()
#state_dict = torch.load('2023_1212SNOW_transformer.pt')
#state_dict
#model.load_state_dict(state_dict)

In [9]:
# #model.state_dict == state_dict
# for module, param in state_dict.items():
#     print(str(module),
#           (state_dict[module].data == model.state_dict[module].data).sum().detach().numpy() ==
#           state_dict[module].data .detach().numpy().size)

In [9]:
def save_checkpoint(checkpoint_path, model):
    state = {'state_dict': model.state_dict() }
    torch.save(state, checkpoint_path)

def load_checkpoint(checkpoint_path, model):
    state = torch.load(checkpoint_path)
    model.load_state_dict(state['state_dict'])
    print(f'model loaded from {checkpoint_path}')

In [17]:
%%time
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
# [Adam](https://arxiv.org/abs/1412.6980) による最適化関数の定義
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)  # 最適化関数を初期化
#optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

epochs = 100
epochs = 30
epochs = 10
for epoch in range(epochs):
    epoch_loss = 0.

    loop = tqdm(dl, leave=True)
    for batch in loop:
    #for inp, tch in tqdm(dl):
        inp = batch[0]
        encoded_input = tknz.batch_encode_plus(inp,
                                               padding="longest",
                                               truncation=True,
                                               return_tensors="pt").to(device)

        optimizer.zero_grad()
        output = model(src=encoded_input.input_ids,
                       tgt=encoded_input.input_ids).to(device)
        loss = criterion(output[0], encoded_input.input_ids[0])  # 損失値の計算
        for h in range(1,len(output)):
            loss += criterion(output[h], encoded_input.input_ids[h])

        loss.backward()                      # 誤差逆伝播
        epoch_loss += loss.item()            # 損失値総和
        optimizer.step()                     # 誤差に基づき学習ステップ実行

        loop.set_description(f'エポック {epoch}')
        loop.set_postfix(OrderedDict(loss=loss.item()/len(inp)))

    #print(f'loss:{epoch_loss/ds.__len__()}')
    print(f'epoch:{epoch:03d}',
          f'eopch_loss:{epoch_loss/ds.__len__():.5f}')
          #f'出力:{tknz.convert_ids_to_tokens(output_ids)}')
    checkpoint_fname = f'2023_1211SNOW_transfomer_epoch{epoch:02d}.pt'
    save_checkpoint(checkpoint_fname, model)

  0%|          | 0/659 [00:00<?, ?it/s]

epoch:000 eopch_loss:0.00160


  0%|          | 0/659 [00:00<?, ?it/s]

epoch:001 eopch_loss:0.00075


  0%|          | 0/659 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [16]:
fname_saved = '2023_1211snow_transformer_gpu.pt'
save_checkpoint(fname_saved, model)

from google.colab import files
files.download(fname_saved)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
from google.colab import drive
drive.mount('/content/drive')

!cp 2023_1211snow_transformer_gpu.pt /content/drive/My\ Drive/2023_1211snow_transformer_gpu2.pt


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
encoded_input = tknz.batch_encode_plus(inp[:3],
                                       padding="longest",
                                       truncation=True,
                                       return_tensors="pt").to(device)
print(tknz.convert_ids_to_tokens(encoded_input.input_ids.squeeze(0)[1]))
print(inp[:3])
#encoded_input.input_ids

In [None]:
model.eval()
n_corrects, total, results = 0, 0, {}
isPrint = False
loop = tqdm(snow, leave=True)
for N in loop:
#for N in tqdm(snow, leave=True)
    total += 1
    encoded_input = tknz.batch_encode_plus([N],
                                           padding="longest",
                                           truncation=True,
                                           return_tensors="pt").to(device)
    output = model(src=encoded_input.input_ids,
                   tgt=encoded_input.input_ids).to(device)
    #print(tknz.convert_ids_to_tokens(output.squeeze(0).topk(1)[1]), inp[N])
    outstr = "".join(tknz.convert_ids_to_tokens(output.squeeze(0).topk(1)[1][1:-1])).replace('##','')

    yesno = outstr == N
    if yesno == True:
        n_corrects += 1
    else:
        results[total] = {'正誤':yesno, '出力':outstr, '教師':N}
        if isPrint:
            print(f'{total:3d}:{yesno} N:{N}, outstr:{outstr}')

    loop.set_description('正解率')
    loop.set_postfix(OrderedDict(cr=n_corrects/total))

print(n_corrects) # , N)
#output.size()

In [None]:
model.eval()
torch.save(model.state_dict(), '2023_1217snow_transformer.pt')

In [None]:
for N in range(10):
    s = snow[N]
    encoded_input = tknz.batch_encode_plus([s],
                                           padding="longest",
                                           truncation=True,
                                           return_tensors="pt").to(device)
    output = model(src=encoded_input.input_ids,
                   tgt=encoded_input.input_ids).to(device)
    outstr = "".join(tknz.convert_ids_to_tokens(output.squeeze(0).topk(1)[1][1:-1])).replace('##','')
    yesno = outstr == s
    print(N, yesno, outstr, s)


In [None]:
import pandas as pd
import jaconv

#terao2 = pd.read_excel('語彙代用　機械学習用２.xlsx')
terao2 = pd.read_excel('語彙代用タテ型２.xlsx')
terao1 = pd.read_excel('語彙代用ヨコ型２.xlsx')
for x in terao1.iterrows():
    print(len(x), type(x[0]), len(x[1]))
    break

In [None]:
from collections import OrderedDict
terao_vert = terao2[['タテ型','意図','誤り']].to_dict(orient='index')
terao_hori = terao1[['ヨコ型','意図','誤り']].to_dict(orient='index')
#['dict', 'list', 'series', 'split', 'tight', 'records', 'index']
#help(terao1[['ヨコ型','意図','誤り']].to_dict)
#help(pd.read_excel)
#print(terao_hori[0])
X = OrderedDict()
for _x in terao1.iterrows():
    print(type(_x), type(_x[1]))
    break
    # for __x in _x[1:]:
    #     X[k] = []
    #     #print(k, v, jaconv.normalize(v))
    #     #X[k].append(jaconv.normalize(__x))
    #     print(type(__x))
    #     break
    # break

In [None]:
print(tknz.cls_token_id, tknz.pad_token_id)
for k in dir(tknz):
    if '_token_id' in str(k):
        print(k)

2 0
bos_token_id
cls_token_id
eos_token_id
mask_token_id
pad_token_id
sep_token_id
unk_token_id


In [None]:
#for i in range(5):
#    print(i+1, tknz.vocab[i+1])
list(tknz.vocab.keys())[:10]