<a href="https://colab.research.google.com/github/Tyanakai/transformer_from_scratch/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

import numpy as np

In [None]:
# Colab-only: mount Google Drive at /content/drive so the data folder below it can be read.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
class Config:
    """Settings for this notebook."""
    # File name of the training corpus; it is joined onto DRIVE when the data is loaded.
    train_file = "date.txt"


In [3]:
# Project folder on the mounted Google Drive; Config.train_file is read from here.
DRIVE = "/content/drive/MyDrive/portfolio/transformer_study"


## preprocess data

In [4]:
# Read the whole training corpus into memory as one string.
# The encoding is pinned so decoding does not depend on the runtime's locale default.
with open(os.path.join(DRIVE, Config.train_file), mode="r", encoding="utf-8") as f:
    text = f.read()

In [6]:
# Peek at the first 100 characters to check the raw line format.
text[:100]

'september 27, 1994           _1994-09-27\nAugust 19, 2003              _2003-08-19\n2/10/93           '

In [7]:
# Split every record into a source string (the free-form date) and a target
# string (YYYY-MM-DD). Each line ends with "_" followed by the 10-character
# target, so the last 11 characters are separator + target.
text_x = []
text_y = []

# splitlines() copes with a missing final newline and with "\r\n" endings,
# where split("\n")[:-1] would drop the last record or leave a stray "\r".
for line in text.splitlines():
    if not line.strip():
        continue  # blank lines carry no record
    # Trailing spaces on the source are kept on purpose: they are the padding
    # that Tokenizer.pad_text later replaces with pad characters.
    text_x.append(line[:-11].lower().lstrip())
    text_y.append(line[-10:].lstrip())

In [8]:
# Last source string; the trailing spaces come from the column padding in the file.
text_x[-1]

'thursday, november 20, 1980  '

In [9]:
# Last target string (YYYY-MM-DD).
text_y[-1]

'1980-11-20'

In [25]:
class Tokenizer():
    """Character-level tokenizer for a list of space-padded strings.

    Trailing whitespace in each string is treated as padding and replaced by
    ``PAD_CHAR``. Every distinct character (the pad character included, when
    it occurs) is given an integer id in sorted order.

    ``tokenize`` and ``attention_mask`` convert their per-string results with
    ``np.array``, so the strings are expected to share one length.

    Attributes populated by ``tokenize``:
        char_list: one list of characters per input string, padding applied.
        unique_char: sorted ``np.ndarray`` of the distinct characters.
        char_id_dict: character -> id.
        id_char_dict: id -> character.
    """

    # Stand-in for trailing whitespace; attention_mask() zeroes these positions.
    PAD_CHAR = "＠"

    def __init__(self, text_list):
        """Store the strings to tokenize; nothing is computed until ``tokenize``."""
        self.text_list = text_list


    def pad_text(self, text):
        """Return ``text`` with its trailing whitespace replaced by pad characters.

        The length of the string is preserved. Only trailing whitespace counts
        as padding; leading whitespace is left untouched.
        """
        # rstrip(), not strip(): strip() would also discount leading whitespace,
        # moving the cut into the real content and overwriting it with padding.
        last_char_idx = len(text.rstrip())
        return text[:last_char_idx] + self.PAD_CHAR * (len(text) - last_char_idx)


    def create_char_list(self):
        """Split every input string into a list of characters, padding applied."""
        self.char_list = [list(self.pad_text(text)) for text in self.text_list]


    def create_char_id_dict(self):
        """Assign an id to every distinct character found in ``char_list``."""
        # Flatten first so np.unique receives a plain 1-D sequence of characters
        # instead of having to build a 2-D array out of the rows.
        flat_chars = [c for chars in self.char_list for c in chars]
        self.unique_char = np.unique(flat_chars)

        self.id_char_dict = dict()
        self.char_id_dict = dict()
        for idx, c in enumerate(self.unique_char):
            self.id_char_dict[idx] = c
            self.char_id_dict[c] = idx


    def attention_mask(self):
        """Return an integer array with 1 at real characters and 0 at padding."""
        if not hasattr(self, "char_list"):
            # The mask only needs the padded characters, so build them on demand
            # when this is called before tokenize().
            self.create_char_list()

        attention_mask = []
        for line in self.char_list:
            chars = np.array(line)
            attention_mask.append((chars != self.PAD_CHAR) * 1)
        return np.array(attention_mask)


    def tokenize(self):
        """Build the vocabulary and return the texts as an array of character ids.

        Ids are looked up from the padded character lists, so padding positions
        carry the pad id and line up with the zeros of ``attention_mask()``.
        """
        self.create_char_list()
        self.create_char_id_dict()

        token_list = [
            [self.char_id_dict[c] for c in chars] for chars in self.char_list
        ]
        return np.array(token_list)


    def detokenize_batch(self, token_list):
        """Convert every row of ids back into a list of characters.

        Raises:
            KeyError: if a row contains an id that is not in the vocabulary.
        """
        return [[self.id_char_dict[t] for t in line] for line in token_list]


    def detokenize(self, token_list):
        """Return the characters of the LAST row of ``token_list``.

        All rows are decoded (an unknown id in any row raises ``KeyError``),
        but only the last row is returned; this return shape is kept so
        existing callers are unaffected. Use ``detokenize_batch`` to get every
        row. An empty ``token_list`` yields an empty list.
        """
        decoded = self.detokenize_batch(token_list)
        return decoded[-1] if decoded else []

In [12]:
# Source side: tokenize the free-form date strings.
# tokenize() has to run first: it builds char_list and the vocabulary that
# attention_mask() and unique_char depend on.
encoder_tokenizer = Tokenizer(text_x)
encoder_token_list = encoder_tokenizer.tokenize()
encoder_attention_mask = encoder_tokenizer.attention_mask()
encoder_num_char = encoder_tokenizer.unique_char.shape[0]  # number of distinct characters

# Target side: the same steps for the YYYY-MM-DD strings, with a separate vocabulary.
decoder_tokenizer = Tokenizer(text_y)
decoder_token_list = decoder_tokenizer.tokenize()
decoder_attention_mask = decoder_tokenizer.attention_mask()
decoder_num_char = decoder_tokenizer.unique_char.shape[0]  # number of distinct characters

## model