# Improved GPT

このファイルでは、GPT from scratchで学習したモデルの改善を目標とする。このファイルは、同リポジトリに含まれている
- pytorch_command.ipynb
- attention_from_scratch.ipynb
- GPT_from_scratch.ipynb<br>
の次に読むことを想定している。

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
# Fix the NumPy and PyTorch (CPU and CUDA) seeds so runs are repeatable.
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
import warnings
# Silence all warnings for the rest of the notebook.
warnings.simplefilter('ignore')
# Report whether a CUDA device is usable before anything is moved to the GPU.
print("CUDA環境が壊れていないことを祈りながら確認-> ", torch.cuda.is_available())

CUDA環境が壊れていないことを祈りながら確認->  True


### PreLN
GPT from scratchでは、学習中に勾配が変化しない状態がいつまで経っても続く時があった。<br>
これは、Layer Normalization層がMultiheadAttentionやFFN層の後に置かれるPostLNの形をとっているためであると最近では言われている。<br>
このため、改善点の一つとして、サブレイヤーの前にLayer Normalization層を置くPreLNを行う。<br>
最近のGPTモデルではPreLNを行なっているようである。

In [2]:
class PreLNGPTDecoderLayer(nn.Module):
    """One Pre-LayerNorm GPT decoder block.

    Each of the two sub-layers (masked multi-head self-attention, then a
    GELU feed-forward network) is applied as ``x + dropout(sublayer(norm(x)))``,
    i.e. the LayerNorm sits *before* the sub-layer and the residual path is
    left un-normalised.

    Args:
        embedding_dim: Width of the token representations.
        ffn_dim: Hidden width of the feed-forward network.
        num_heads: Number of attention heads.
        drop_out_rate: Dropout probability applied to each sub-layer output.
        layer_eps: Epsilon used by both LayerNorm modules.
        batch_first: Forwarded to ``nn.MultiheadAttention``.
    """

    def __init__(self, embedding_dim, ffn_dim, num_heads, drop_out_rate = 0., layer_eps=1e-05, batch_first = False):
        super().__init__()

        # Self-attention sub-layer.
        self.maskedmultiheadattention = nn.MultiheadAttention(
            embedding_dim,
            num_heads,
            batch_first=batch_first,
        )
        self.dropout_selfattn = nn.Dropout(p=drop_out_rate)
        self.layernorm_selfattn = nn.LayerNorm(embedding_dim, eps=layer_eps)

        # Feed-forward sub-layer (GELU activation).
        expand = nn.Linear(embedding_dim, ffn_dim)
        project = nn.Linear(ffn_dim, embedding_dim)
        self.ffn = nn.Sequential(expand, nn.GELU(), project)
        self.layernorm_ffn = nn.LayerNorm(embedding_dim, eps=layer_eps)
        self.dropout_ffn = nn.Dropout(p=drop_out_rate)

    def forward(self, x, pad_mask_self = None, mask_self=None):
        """Run the block on ``x`` and return a tensor of the same shape.

        Args:
            x: Input representations.
            pad_mask_self: Passed to the attention as ``key_padding_mask``.
            mask_self: Passed to the attention as ``attn_mask``.
        """
        # Attention branch: normalise first, then attend, then add back.
        normed = self.layernorm_selfattn(x)
        attended, _ = self.maskedmultiheadattention(
            normed,
            normed,
            normed,
            key_padding_mask=pad_mask_self,
            attn_mask=mask_self,
        )
        x = x + self.dropout_selfattn(attended)

        # Feed-forward branch, same pre-norm / residual pattern.
        hidden = self.ffn(self.layernorm_ffn(x))
        return x + self.dropout_ffn(hidden)

### 改善したGPT

このPreLNGPTDecoderLayerを用いてGPTモデルを制作する。ところで、今回はGPTのgenerate_sentence関数でもtemperatureとtopKという手法を追加した。<br>
これにより多様性のある文章が生成される。

In [3]:
class GPT(nn.Module):
    """Decoder-only GPT language model built from ``PreLNGPTDecoderLayer`` blocks.

    Token ids are embedded, a learned positional embedding is added, the result
    is passed through ``N`` decoder layers and finally projected to vocabulary
    logits.

    Args:
        vocab_size: Number of tokens in the vocabulary.
        embedding_dim: Width of the token / positional embeddings.
        ffn_dim: Hidden width of each decoder layer's feed-forward network.
        num_heads: Number of attention heads per decoder layer.
        drop_out_rate: Dropout probability used inside the decoder layers.
        layer_eps: LayerNorm epsilon used inside the decoder layers.
        batch_first: Forwarded to the decoder layers. NOTE: ``forward``,
            ``create_mask`` and ``generate`` all index the sequence on
            dimension 1, so the model should be built with ``batch_first=True``.
        T: Maximum sequence length (size of the positional embedding table).
        N: Number of decoder layers.
    """

    def __init__(self, vocab_size, embedding_dim, ffn_dim, num_heads, drop_out_rate = 0.,\
                  layer_eps=1e-05, batch_first = False, T = 10000, N = 1):
        super().__init__()
        # T is the maximum sequence length the positional table can index.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_embedding = nn.Embedding(T, embedding_dim)
        self.decoder = nn.ModuleList([PreLNGPTDecoderLayer(embedding_dim, ffn_dim, num_heads, drop_out_rate,
                                                           layer_eps, batch_first) for _ in range(N)])
        self.linear = nn.Linear(embedding_dim, vocab_size, bias=False)
        self.vocab_size = vocab_size

    def forward(self, x, y = None,pad_mask_self = None, mask_self=None):
        """Compute logits, and the training loss when targets are given.

        Args:
            x: Token ids of shape (batch, sequence).
            y: Optional targets with the same shape as ``x``; ``y`` is ``x``
                shifted by one token (``x = data[a:b]`` -> ``y = data[a+1:b+1]``).
            pad_mask_self: Key padding mask handed to every decoder layer.
            mask_self: Attention mask handed to every decoder layer.

        Returns:
            ``(loss, pred)``. With ``y``: the cross-entropy loss and the argmax
            token ids (detached, on CPU). Without ``y``: ``None`` and the
            logits of the last position, shape (batch, 1, vocab_size).
        """
        x = self.embedding(x)
        # One position index per sequence step, broadcast over the batch.
        pos = torch.arange(0, x.size(1), dtype=torch.long, device=x.device).unsqueeze(0)
        x = x + self.positional_embedding(pos)
        for layer in self.decoder:
            x = layer(x, pad_mask_self=pad_mask_self, mask_self=mask_self)
        x = self.linear(x)
        if y is not None:
            # cross_entropy takes class indices directly (no one-hot needed);
            # ignore_index=-1 excludes any target labelled -1 from the loss.
            loss = F.cross_entropy(x.view(-1, x.size(-1)), y.view(-1), ignore_index=-1)
            pred = x.argmax(dim=-1).detach().cpu()
            return loss, pred
        return None, x[:, [-1], :]

    def create_mask(self, x: torch.Tensor, x_pad: int, device: str):
        """Build the padding mask and the causal attention mask for ``x``.

        Args:
            x: Token ids of shape (batch, sequence) - the ids themselves, not
                embeddings, because they are compared against ``x_pad``.
            x_pad: Token id that marks padding.
            device: Device on which the causal mask is created.

        Returns:
            ``(padding_mask, mask)``. ``padding_mask`` is boolean with ``True``
            where ``x == x_pad`` (``True`` means "ignore this position").
            ``mask`` is a float (sequence, sequence) matrix with 0 on and below
            the diagonal and ``-inf`` above it.
        """
        seq_len = x.size(1)
        padding_mask = (x == x_pad)
        # Strictly-upper triangle is -inf (future positions), the rest is 0.
        mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"), device=device),
            diagonal=1,
        )
        return padding_mask, mask

    @torch.no_grad()
    def generate(self,bos: str, sentence_size, tokenizer, device):
        """Return next-token logits for the text ``bos``.

        The text is tokenised with ``tokenizer.encode_ordinary`` and truncated
        to its last ``sentence_size`` tokens. The model is evaluated in eval
        mode and its previous train/eval mode is restored afterwards.

        Returns:
            Logits of shape (1, 1, vocab_size) for the token following ``bos``.
        """
        was_training = self.training
        self.eval()
        try:
            tokens = tokenizer.encode_ordinary(bos)[-sentence_size:]
            batch = torch.LongTensor([tokens]).to(device)
            _, add_sentence = self(batch)
        finally:
            # Put the module back in whichever mode the caller had it in.
            self.train(was_training)
        return add_sentence

    @torch.no_grad()
    def generate_sentence(self, bos: str, sentence_size, generate_tokens, tokenizer, device, top_K = None, temperature = 1.0):
        """Sample ``generate_tokens`` tokens after ``bos`` and return the text.

        Args:
            bos: Prompt text.
            sentence_size: Context window, in tokens, fed to the model.
            generate_tokens: Number of tokens to sample.
            tokenizer: Object providing ``encode_ordinary`` and ``decode_batch``.
            device: Device the model lives on.
            top_K: If given, only the ``top_K`` most likely tokens can be sampled.
            temperature: Logits are divided by this value before the softmax.

        Returns:
            ``bos`` followed by the decoded sampled tokens.
        """
        return_sentence = bos
        for _ in range(generate_tokens):
            logits = self.generate(return_sentence, sentence_size, tokenizer, device)
            logits = logits[:, -1, :] / temperature  # (1, vocab_size)
            if top_K is not None:
                v, _ = torch.topk(logits, min(top_K, logits.size(-1)))
                # v[:, [-1]] is the smallest logit inside the top-k; anything
                # below it is removed from the candidate set.
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            return_sentence += tokenizer.decode_batch(idx_next.tolist())[0]
        return return_sentence

In [4]:
# Memory-map the raw token streams read-only, so the files are not loaded into RAM in full.
# Tokens are stored as uint16.
train_data = np.memmap("bin/train.bin", dtype = np.uint16, mode = "r")
val_data = np.memmap("bin/val.bin", dtype = np.uint16, mode = "r")

In [5]:
#import tiktoken
#tokenizer = tiktoken.get_encoding("gpt2")
#print("消すべきエスケープシーケンス",tokenizer.encode_ordinary_batch(["\r","\n","\/","\0","\b","\t"]))

In [6]:
#iteration = 1024
#index_train = 0
#index_val = 0
#delete_tokens = [201,198,11139,188,196,197]
#for iter_index in tqdm(range(iteration)):
#    add_train = np.array_split(train_data,iteration)[iter_index]
#    add_val = np.array_split(val_data,iteration)[iter_index]
#    add_train = add_train[~np.isin(add_train,delete_tokens)]
#    add_val = add_val[~np.isin(add_val,delete_tokens)]
#    index_train += len(add_train)
#    index_val += len(add_val)
#print(index_train, index_val)
#上での作業はindex取得のための作業。
#train_processed = np.memmap("train_processed.bin",dtype=np.uint16,mode="write",shape = (index_train,))
#val_processed = np.memmap("val_processed.bin",dtype=np.uint16,mode="write",shape = (index_val,))
#iteration = 1024
#index_train = 0
#index_val = 0
#for iter_index in tqdm(range(iteration)):
#    add_train = np.array_split(train_data,iteration)[iter_index]
#    add_val = np.array_split(val_data,iteration)[iter_index]
#    add_train = add_train[~np.isin(add_train,delete_tokens)]
#    add_val = add_val[~np.isin(add_val,delete_tokens)]
#    train_processed[index_train:index_train+len(add_train)] = add_train
#    val_processed[index_val:index_val+len(add_val)] = add_val
#    index_train += len(add_train)
#    index_val += len(add_val)
#train_processed.flush()
#val_processed.flush()

In [7]:
# Rebind train_data / val_data to the processed token streams (read-only memory maps).
# Presumably these files come from the commented-out filtering cell above - confirm the
# files were moved into bin/, since that cell writes them without the bin/ prefix.
train_data = np.memmap("bin/train_processed.bin", dtype = np.uint16, mode = "r")
val_data = np.memmap("bin/val_processed.bin", dtype = np.uint16, mode = "r")

In [8]:
# Context length in tokens; also used as the positional-embedding size of the model.
sentence_size = 256
# Number of sequences per optimisation step.
batch_size = 29
device = "cuda" if torch.cuda.is_available() else "cpu"
# Let cuDNN auto-tune kernels and allow TF32 in cuDNN operations.
torch.backends.cudnn.benchmark=True
torch.backends.cudnn.allow_tf32 = True
def get_batch(split: str, batch_size = 256,device = "cpu") -> "tuple[torch.Tensor, torch.Tensor]":
    """Sample a random batch of (input, target) token windows.

    Args:
        split: ``'train'`` reads from ``train_data``; any other value reads
            from ``val_data``.
        batch_size: Number of windows to draw.
        device: Device (string or ``torch.device``) the tensors are moved to.

    Returns:
        ``(x, y)``, two int64 tensors of shape (batch_size, sentence_size).
        ``y`` is ``x`` shifted one token to the right in the underlying stream.
    """
    data = train_data if split == 'train' else val_data
    # Random window starts; the upper bound leaves room for the shifted target.
    # Converted to plain ints so the memmap is sliced with Python integers.
    starts = torch.randint(len(data) - sentence_size, (batch_size,)).tolist()
    x = torch.stack([
        torch.from_numpy(data[i:i + sentence_size].astype(np.int64)) for i in starts
    ])
    y = torch.stack([
        torch.from_numpy(data[i + 1:i + 1 + sentence_size].astype(np.int64)) for i in starts
    ])
    # Pinned host memory allows the asynchronous copy; take this path for any
    # CUDA target ("cuda", "cuda:0", torch.device("cuda")), not only "cuda".
    if str(device).startswith("cuda"):
        return x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    return x.to(device), y.to(device)

In [9]:
import tiktoken
device = "cuda" if torch.cuda.is_available() else "cpu"
# Model width and number of attention heads.
embedding_size = 768
num_heads = 6
# GPT-2 BPE tokenizer; the vocabulary size passed to GPT below is 50257.
tokenizer = tiktoken.get_encoding("gpt2")
# Hyper-parameters were chosen with Karpathy's minGPT as a reference.
depth = 6
gpt = GPT(50257, embedding_size, embedding_size*4, num_heads, 0, batch_first=True, T = sentence_size, N = depth).to(device) 
# Dropout is disabled for pre-training; enabling it is preferable when fine-tuning.
# Number of warm-up iterations read by get_lr.
warmup_iters = 2000

# Adam over all model parameters with an initial learning rate of 1e-4.
optimizer = torch.optim.Adam(gpt.parameters(), lr = 0.0001)

In [10]:
# Peak / floor learning rate and the iteration at which the cosine term reaches its minimum.
max_lr = 4.0e-5
min_lr = 2.5e-6
max_iters = 10000


def get_lr(cur_iter, warmup=None):
    """Return the learning rate for iteration ``cur_iter`` as a Python float.

    During warm-up the rate rises linearly from 0 to ``max_lr``. Afterwards it
    is ``max_lr * (cos(pi * cur_iter / max_iters) + 1)`` clipped to
    ``[min_lr, max_lr]``. The unclipped value exceeds ``max_lr`` while the
    cosine is positive, so the rate stays at ``max_lr`` until
    ``cur_iter == max_iters / 2`` and only then decays towards ``min_lr``.

    NOTE(review): a conventional cosine schedule carries a factor 0.5 and
    starts decaying right after warm-up - confirm the plateau is intended.

    Args:
        cur_iter: Current iteration number.
        warmup: Number of warm-up iterations. Defaults to the notebook-level
            ``warmup_iters`` setting.
    """
    if warmup is None:
        # Resolved at call time so the function follows the notebook setting.
        warmup = warmup_iters
    if cur_iter < warmup:
        return float(max_lr * cur_iter / warmup)
    cosine = max_lr * (np.cos(cur_iter / max_iters * np.pi) + 1)
    # float(...) so both branches return the same type (not numpy.float64).
    return float(np.clip(cosine, min_lr, max_lr))

In [11]:
"""最初の訓練時のコードでは以下のように初期化を行なう。
import gc
from tqdm import tqdm
batch_iteration = 128
scaler = torch.cuda.amp.GradScaler(enabled=True)
best_loss = 1e9
begin = 0
val_iteration = 1
"""

'最初の訓練時のコードでは以下のように初期化を行なう。\nimport gc\nfrom tqdm import tqdm\nbatch_iteration = 128\nscaler = torch.cuda.amp.GradScaler(enabled=True)\nbest_loss = 1e9\nbegin = 0\nval_iteration = 1\n'

In [12]:
import gc
import math
import os
from tqdm import tqdm


def _release_memory():
    """Collect garbage and hand cached CUDA blocks back to the driver."""
    gc.collect()
    torch.cuda.empty_cache()


def _set_learning_rate(opt, lr):
    """Write ``lr`` into every parameter group of ``opt``.

    The optimizer reads its rate from ``param_groups``; assigning ``opt.lr``
    only creates an unused attribute and leaves the real rate untouched.
    """
    for group in opt.param_groups:
        group["lr"] = lr


def _run_batch(split):
    """Draw one batch from ``split`` and return ``(loss, pred)`` under bf16 autocast."""
    with torch.amp.autocast(device_type=device, dtype=torch.bfloat16):
        x, y = get_batch(split, batch_size=batch_size, device=device)
        # NOTE(review): 0 is used as the padding id although get_batch returns
        # contiguous windows without padding; if id 0 is a real token in the
        # vocabulary it is masked out as an attention key - confirm intent.
        padding_mask, mask = gpt.create_mask(x, 0, device)
        return gpt(x, y, padding_mask, mask)


def _snapshot(cur_iter, best_loss, **extra):
    """Build the checkpoint dict (model, optimizer, scaler state, progress)."""
    state = {
        "model": gpt.state_dict(),
        "optimizer": optimizer.state_dict(),
        # Save the scaler's state dict, not the scaler object itself.
        "scaler": scaler.state_dict(),
        "iter": cur_iter,
        "best_loss": best_loss,
    }
    state.update(extra)
    return state


def _write_report(path, cur_iter, lr, best_loss, val_loss=None):
    """Write a short training summary to ``path`` (UTF-8, overwriting)."""
    lines = [
        "学習状況\n",
        f"iter: {cur_iter}\n",
        "hyper params: \n",
        f"vocab_size: 50257, embedding size: {embedding_size}, ffn: {embedding_size*4}, num_heads: {num_heads}, Depth: {depth}, sentnce_size: {sentence_size}\n",
        f"lr: {lr},best_loss: {best_loss}\n",
    ]
    if val_loss is not None:
        lines.append(f"val_loss: {val_loss}\n")
    # Explicit encoding: the report contains non-ASCII text.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)


batch_iteration = 256  # optimisation steps per outer iteration
val_iteration = 1      # validation batches per outer iteration
# NOTE(review): the original comment said the default init scale is too large,
# but no init_scale is passed here - confirm the intended value.
scaler = torch.cuda.amp.GradScaler(enabled=True)

if os.path.isfile("best_checkpoint.bin"):
    # Resume from the best checkpoint when one exists.
    checkpoint = torch.load("best_checkpoint.bin", map_location="cpu")
    best_loss = checkpoint["best_loss"]
    # NOTE(review): "iter" is the index of an iteration that already finished,
    # so that iteration is run once more after resuming.
    begin = checkpoint["iter"]
    gpt.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    saved_scaler = checkpoint.get("scaler")
    # Older checkpoints stored the scaler object itself; only a non-empty
    # state dict can be restored.
    if isinstance(saved_scaler, dict) and saved_scaler:
        scaler.load_state_dict(saved_scaler)
    del checkpoint, saved_scaler
else:
    print("best_checkpoint.bin not found; starting training from scratch.")
    best_loss = 1e9
    begin = 0
_release_memory()

for cur_iter in tqdm(range(begin, max_iters)):
    current_lr = float(get_lr(cur_iter + 1))
    _set_learning_rate(optimizer, current_lr)

    for batch_iter in range(batch_iteration):
        optimizer.zero_grad()
        loss, pred = _run_batch("train")
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # Drop the graph before emptying the cache to keep peak memory low.
        del loss, pred
        _release_memory()

    # Validation: average the loss over val_iteration batches.
    valid_loss = 0.0
    with torch.no_grad():  # without this the validation pass raised a CUDA error
        for val_iter in range(val_iteration):
            loss, pred = _run_batch("valid")
            valid_loss += loss.item()
            del loss, pred
            _release_memory()
    valid_loss /= val_iteration

    if math.isnan(valid_loss):
        print("Loss is NaN!")
        break

    # Compare the full average (not a partial sum) against the best so far.
    if best_loss > valid_loss:
        best_loss = valid_loss
        checkpoint = _snapshot(cur_iter, best_loss)
        torch.save(checkpoint, "best_checkpoint.bin")
        print("params updated. BestLoss: ", best_loss)
        _write_report("learning_detail.txt", cur_iter, current_lr, best_loss)

    checkpoint = _snapshot(cur_iter, best_loss, loss=valid_loss)
    torch.save(checkpoint, "latest_checkpoint.bin")
    _write_report("learning_detail_latest.txt", cur_iter, current_lr, best_loss, val_loss=valid_loss)
    del checkpoint
    _release_memory()

  0%|          | 0/9916 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.39 GiB (GPU 0; 7.75 GiB total capacity; 5.90 GiB already allocated; 1.19 GiB free; 6.42 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Load the best checkpoint onto the CPU for inspection and sampling.
checkpoint = torch.load("best_checkpoint.bin", map_location="cpu")

In [None]:
# Display the checkpoint contents (notebook cell output).
checkpoint

{'model': OrderedDict([('embedding.weight',
               tensor([[ 1.8859,  1.5174,  0.9148,  ..., -1.5706, -0.4753,  0.5954],
                       [ 0.3046, -3.0930, -1.4306,  ...,  0.2255,  0.3209,  1.2896],
                       [-0.8882, -0.6506, -0.1566,  ..., -0.0855,  1.2902,  1.1102],
                       ...,
                       [-0.1957, -0.5788, -0.8011,  ...,  0.5110, -0.0158,  0.4411],
                       [ 0.9012,  0.6003, -0.3570,  ...,  1.2942, -0.3836,  0.5536],
                       [ 0.3491, -1.0713, -0.9490,  ..., -1.9283, -0.2922,  0.0994]])),
              ('positional_embedding.weight',
               tensor([[-0.4624, -0.9417, -1.4833,  ..., -0.0771,  0.7658, -0.4959],
                       [-0.4813, -0.2662,  0.8626,  ..., -1.8311, -1.4273, -0.8429],
                       [-1.0187,  0.7702, -0.4985,  ...,  0.6719, -2.3694, -1.2789],
                       ...,
                       [ 0.3733, -0.4403, -1.0542,  ..., -0.6353,  1.7770, -0.4963],
 

In [None]:
# Copy the saved weights into the model.
gpt.load_state_dict(checkpoint["model"])

<All keys matched successfully>

In [None]:
# Sample 256 tokens after the prompt, restricted to the 1000 most likely tokens per step.
# NOTE(review): temperature = 5 flattens the distribution strongly and the recorded
# sample is largely incoherent; a value near 1 may give more readable text - confirm.
gpt.generate_sentence("This Island is once ", sentence_size, 256, tokenizer,device,top_K=1000,temperature = 5)

'This Island is once orueteri ge; soon stands sene tilau had most now stuck hat first bareii ne_ci ab isor good no matches very wrong.: simply such swings per term within 11 symare will quite only \'dark coinsbaked ones ** use do dat so my heart→ too arenTtr Kobman Sukod* Joinedin god? How easy Is sc2ound the print press mov jory handed sides new breed� biann would fall sh*** ........huda lambaiO . However 18 trazy � stegt sa winf and repasion Ctctatic catir - 12 often \'eg a matterkeeper times {Gruff mark gets wontobs saidare ruberg ca] heohipersHeex aniem** November var up dude flint-base lksweow die ?why yeah....... 18Ebooks about le jugz please copy reheennis against del utener?" let over or:i -> bellan\', call pic iraches around later date� pakeha keaknin double: world > 20 Aprilim% lilkun boy type beiltoit ar ("9 mid naji fore� key, shout der function than when sur dat imentide cut … both.\'\' what remains with too but since payway'