## 参考にしたサイト

### transformetの解説で最もわかりやすい
https://developers.agirobots.com/jp/multi-head-attention/


### 全体のコードの構造
https://tech.gmogshd.com/transformer/

### Embeddingの解説(単語→数値)
https://gotutiyan.hatenablog.com/entry/2020/09/02/200144

In [1]:
with open('./data.txt','r',encoding='utf-8') as f:
        text = f.read()        
print("テキストの文字数 :", len(text))
print("最初の30文字 : ",text[:30])

テキストの文字数 : 1063
最初の30文字 :  Head Mounted Displayをはじめとした立体視


In [2]:
import torch
import torch.nn as nn

# 使用されている文字
chars = sorted(list(set(text)))
print(chars[:30],chars[50:70])
# 使用されている文字数
char_size = len(chars)

# 文字と数字を一対一対応させる辞書
char2int = { ch : i for i, ch in enumerate(chars) }
int2char = { i : ch for i, ch in enumerate(chars) }

# 文字と数字を変換する関数
encode = lambda a: [char2int[b] for b in a ]
decode = lambda a: ''.join([int2char[b] for b in a ])
print("decode_example:",decode([40,2,5,8,23,56]))

# テキストファイルを数字にして，tensor型に変換
train_data = torch.tensor(encode(text), dtype=torch.long)
print(train_data.shape)
print(train_data[:20])

['\n', ' ', '%', '(', ')', '-', '.', '3', 'A', 'C', 'D', 'F', 'G', 'H', 'L', 'M', 'N', 'P', 'S', 'T', 'U', 'Y', '\\', 'a', 'b', 'c', 'd', 'e', 'g', 'h'] ['か', 'が', 'き', 'く', 'こ', 'さ', 'し', 'じ', 'す', 'そ', 'た', 'っ', 'つ', 'て', 'で', 'と', 'ど', 'な', 'に', 'の']
decode_example: u%-Aaし
torch.Size([1063])
tensor([13, 27, 23, 26,  1, 15, 35, 40, 34, 39, 27, 26,  1, 10, 30, 38, 36, 32,
        23, 43])


In [3]:
class SelfAttention_Head(nn.Module):

    def __init__(self, n_mbed, head_size, block_size):
        super().__init__()
        self.key = nn.Linear(n_mbed, head_size, bias=False)
        self.query = nn.Linear(n_mbed, head_size, bias=False)
        self.value = nn.Linear(n_mbed, head_size, bias=False)
        # 上三角をゼロに，下三角をそのまま
        # 大きいサイズの行列
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))


    ## channelは文字を表現する次元数
    ## Tは文章の長さに相当，足りない部分はpaddingで追加
    ## Bはバッチサイズ，長さが違う文章でもmaskすることで対応している
    def forward(self, x):
        # (Batch_size,data,Channel)
        B, T, C = x.shape
        print(f"B:{B}, T:{T}, C:{C}")

        k = self.key(x)
        # print("k",k)
        q = self.query(x)
        # print("q",q)
        v = self.value(x)
        # print("v",v)

        #  softmaxの中身計算
        wei = q @ k.transpose(-2,-1)*  (C ** -0.5)
        # print(wei)

        # 必要サイズの下三角行列を作成
        # 0に相当する部分を-infで置き換える
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        # print(wei)
        
        # 行列の行でsoftmax演算
        wei = nn.functional.softmax(wei, dim=-1)
        print("wei:",wei.shape)

        out = wei @ v
        print("out:",out.shape)
        
        return out

In [4]:
a = torch.tril(torch.ones(4,4))
t = 3
print(a[:t,:t])
print(a[:t,:t]==0)

import numpy as np
A = np.array([[1,2,3],[4,5,6]])
B = np.array([[1,2],[3,4],[5,6]])
C = 2

D = torch.tensor(A @ B * C ** -0.5)
D = nn.functional.softmax(D, dim=-1)
print(D)

E = torch.tril(torch.ones(2,2))
F = D.masked_fill(E == 0,float("-inf"))
print(D)
print(E)
print(F)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[False,  True,  True],
        [False, False,  True],
        [False, False, False]])
tensor([[1.4166e-02, 9.8583e-01],
        [2.4751e-05, 9.9998e-01]], dtype=torch.float64)
tensor([[1.4166e-02, 9.8583e-01],
        [2.4751e-05, 9.9998e-01]], dtype=torch.float64)
tensor([[1., 0.],
        [1., 1.]])
tensor([[1.4166e-02,       -inf],
        [2.4751e-05, 9.9998e-01]], dtype=torch.float64)


In [5]:
vector_size = 5

## embedding　サイト：https://gotutiyan.hatenablog.com/entry/2020/09/02/200144
# [単語数] → [単語数，次元数(vector_size)]
embeddings = nn.Embedding(char_size, vector_size)

# e.g. ホログラフィをベクトルにする
encoded_words = torch.tensor(encode("ホログラフィ"))
embeddings_words  = embeddings(encoded_words)
print("[ホログラフィ]のベクトル表現 : \n",embeddings_words)


### 次元を揃える
embeddings_words = embeddings_words.unsqueeze(dim = 0)
print(embeddings_words.shape)



## block_sizeは文章の長さよりも長くする必要がある
attention_head = SelfAttention_Head(n_mbed=vector_size,head_size=3,block_size=embeddings_words.size(1))
attention_head.forward(embeddings_words)

[ホログラフィ]のベクトル表現 : 
 tensor([[-0.0927,  0.2632, -0.7751,  1.6746, -1.0497],
        [ 0.3672,  0.9759, -0.1068,  0.0722,  1.0826],
        [-0.2615,  1.6533,  0.1494,  0.9063, -1.1628],
        [-0.8144,  0.5274, -1.4662,  1.9376,  0.1823],
        [ 0.5537,  0.2203, -2.7770, -1.5073,  0.0993],
        [ 1.0010, -1.4937, -1.1882,  0.0547,  0.0086]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([1, 6, 5])
B:1, T:6, C:5
wei: torch.Size([1, 6, 6])
out: torch.Size([1, 6, 3])


tensor([[[-0.2684,  0.2155,  0.5749],
         [ 0.1985,  0.1922,  0.2134],
         [ 0.3612,  0.1884,  0.0345],
         [ 0.2322,  0.2687,  0.2607],
         [ 0.1387,  0.3823,  0.3602],
         [ 0.0209,  0.4256,  0.4888]]], grad_fn=<UnsafeViewBackward0>)

In [6]:
class SelfAttention_MultiHeads(nn.Module):

    def __init__(self, n_mbed, num_heads, head_size, block_size):
        super().__init__()
        ##      (32,8,8)
        self.heads = nn.ModuleList((SelfAttention_Head(n_mbed, head_size, block_size) for _ in range(num_heads)))

    def forward(self, x):
        
        print("-------------SelfAttention_MultiHeads-------------")
        print("selfattention_multihead",self.heads[0](x).shape)
        print("----------------------------------------------------")


        return torch.cat([h(x) for h in self.heads], dim = -1)

class FeedForward(nn.Module):

    def __init__(self, n_mbed):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_mbed, n_mbed), nn.ReLU())

    def forward(self, x):
        return self.net(x)


In [10]:
class Model(nn.Module):
    def __init__(self, n_mbed, char_size, block_size, number_of_heads):
        super().__init__()
        
        ## 文字を数字に置き換える
        self.token_embedding = nn.Embedding(char_size, n_mbed)

        ## blockの位置をベクトル数字に置き換える
        self.position_embedding = nn.Embedding(block_size, n_mbed)

        ## (32,4,8,8)
        self.selfattention_multiheads = SelfAttention_MultiHeads(n_mbed, number_of_heads, n_mbed//number_of_heads, block_size)

        self.feedforward = FeedForward(n_mbed)

        self.linear = nn.Linear(n_mbed , char_size)


    def forward(self, idx, targets=None):
        B, T= idx.shape
        print("B:",B,"T:",T)

        ## 単語の数値変換
        token_mbed = self.token_embedding(idx)
        print("token_mbed:",token_mbed.shape)

        ## ポジションの数値変換
        position_mbed = self.position_embedding(torch.arange(T))
        print("position_mbed.shape:",position_mbed.shape)
        print("position_mbed",position_mbed)
        print()

        ## 単語ベクトルとポジションベクトルを足す
        x = token_mbed + position_mbed        

        ## multiheadに代入
        ## 複数のmulti-headをconcat
        x = self.selfattention_multiheads(x)
        print("x(self_attention_multiheads):",x.shape)
        
        ## feedforwardして非線形性を獲得
        x = self.feedforward(x)
        print("x(feedforward):",x.shape)
        
        ## predict "unnorrmalized" prediction score
        logits = self.linear(x)

        print("logits:",logits.shape)

        loss = None
        if targets is not None:
            B, T, C =logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss



###############  modelの定義　##################
number_of_heads = 4 # 同時に実行されるself-attentionの数
block_size = 8 # 一度に処理できる最大の文字数
n_mbed = 32 # トークンの埋め込むベクトルの次元数

char_size = len(train_data)

model = Model(n_mbed, char_size, block_size, number_of_heads)

In [16]:
# 次元数を2にする
encoded_words_1 = torch.tensor([encode("ホログラフィ")])
encoded_words_2 = torch.tensor([encode("メモリデータ")])
print(encoded_words_1.shape)
print(encoded_words_2.shape)


input_data = torch.cat([encoded_words_1,encoded_words_2],dim = 0)
print(input_data.shape)

pred = model(input_data)[0]
pred_view = pred.view(2*6,1063)
print(pred_view.shape)

torch.Size([1, 6])
torch.Size([1, 6])
torch.Size([2, 6])
B: 2 T: 6
token_mbed: torch.Size([2, 6, 32])
position_mbed.shape: torch.Size([6, 32])
position_mbed tensor([[-3.6685e-01,  4.0379e-01, -3.8761e-01, -3.6518e-01,  4.6495e-01,
          4.1665e-01, -1.1423e+00,  6.3757e-01, -1.0268e+00, -1.0083e+00,
         -8.2534e-01,  9.6847e-01,  1.9930e-01, -3.8970e-02, -9.6633e-01,
         -5.9635e-01, -2.4541e-01,  1.0442e-01,  6.9777e-04, -8.7560e-01,
          1.6837e+00,  2.0090e+00, -7.5250e-01,  8.1650e-01, -5.0917e-02,
         -7.9443e-01,  1.3591e+00, -1.2493e+00, -1.1036e+00,  1.5045e+00,
          1.2132e+00, -9.2199e-02],
        [-1.8783e-01,  8.6550e-02,  2.4324e-01, -1.7790e+00,  4.7128e-01,
          6.2353e-01, -4.4046e-02,  8.7623e-01,  9.1944e-01, -1.3978e+00,
         -2.7721e-01,  2.1147e-01, -7.6969e-01,  1.5363e+00, -4.0621e-01,
          7.5846e-01, -2.6122e-01,  9.1758e-02,  4.9103e-01,  3.0779e-01,
          8.2340e-01, -1.2141e+00,  1.1322e-01,  4.4794e-02,  2.170

In [14]:
batch_size = 32
a = torch.randint(len(train_data) - block_size, (batch_size,))
print(a)


# if A and B are of shape (3, 4):
A = torch.tensor([[1,2,3],[3,4,5]])
B = torch.tensor([[1,2,3],[4,5,6]])

print(torch.cat([A, B], dim=0).shape)

print(torch.stack([A, B], dim=0).shape)

tensor([883, 757, 724, 358, 558, 996, 137, 155, 938, 899, 507, 784, 543,  65,
        814, 928, 449, 710, 997, 336, 609,  42, 569, 471, 148, 807, 927, 204,
         44, 187, 679, 172])
torch.Size([4, 3])
torch.Size([2, 2, 3])


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr =1e-3)

batch_size = 32 

for steps in range(10000):
    ix = torch.randint(len(train_data) - block_size, (batch_size,))
    x = torch.stack([train_data[i : i + block_size] for i in  ix])
    y = torch.stack([train_data[i+1 : i + block_size+1] for i in  ix])
    logits, loss = model(x,y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


In [9]:
logits, loss = model(x,y)
idx = torch.zeros((1,1), dtype = torch.long)
for _ in range(50):
    idx_pred = idx[:, -block_size:]
    logits , loss = model(idx_pred)
    logits = logits[:,-1,:]
    probs = F.softmax(logits, dim=1)
    idx_next_pred = torch.multinomial(probs, num_samples=1)
    idx = torch.cat((idx, idx_next_pred),dim = 1)

predict = decode(idx[0].tolist())
print("予測結果 : ", predict)

NameError: name 'x' is not defined