In [21]:
import tensorflow as tf
from tensorflow import keras
import random

In [63]:
class AttentionLayer(tf.keras.layers.Layer):
    """
    Multi-Head Convolutional Self Attention Layer

    Queries and keys are produced by causal 1-D convolutions over the time
    axis, values by a dense projection. Attention is causally masked, so
    each step attends only to itself and earlier steps. The concatenated
    heads are passed through two dense layers.

    Args:
        dk: Per-head width of the query/key projections.
        dv: Per-head width of the value projection; also the width of the
            final output.
        num_heads: Number of attention heads.
        filter_size: Kernel size of the query/key convolutions.
        verbose: When True (the default, matching the original behavior),
            intermediate shapes/dtypes are printed from ``call``. Pass
            False to silence the trace.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (``name``, ...).

    Input shape:  (batch, time_steps, features)
    Output shape: (batch, time_steps, dv)
    """
    def __init__(self, dk, dv, num_heads, filter_size, verbose=True, **kwargs):
        super().__init__(**kwargs)
        self.dk = dk
        self.dv = dv
        self.num_heads = num_heads
        self.filter_size = filter_size
        self.verbose = verbose

        # Causal padding: the convolution at step t only sees steps <= t.
        self.conv_q = tf.keras.layers.Conv1D(dk * num_heads,
                                             filter_size,
                                             padding='causal',
                                             dilation_rate=1)
        self.conv_k = tf.keras.layers.Conv1D(dk * num_heads, filter_size, padding='causal', dilation_rate=1)
        self.dense_v = tf.keras.layers.Dense(dv * num_heads)
        self.dense1 = tf.keras.layers.Dense(dv, activation='relu')
        self.dense2 = tf.keras.layers.Dense(dv)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            'dk': self.dk,
            'dv': self.dv,
            'num_heads': self.num_heads,
            'filter_size': self.filter_size,
            'verbose': self.verbose,
        })
        return config

    def _trace(self, *values):
        """Print diagnostic values only when ``verbose`` is enabled."""
        if self.verbose:
            print(*values)

    def split_heads(self, x, batch_size, dim):
        """
        Rearrange (batch, time, num_heads * d) into (batch, num_heads, time, d).

        ``batch_size`` and ``dim`` are unused; they are kept so existing
        callers keep working. Splitting and concatenating (instead of
        reshaping) means the batch size is never needed, which matters
        because it is None for symbolic inputs.
        """
        x = tf.expand_dims(x, axis=1)
        x = tf.concat(tf.split(x, self.num_heads, axis=3), axis=1)
        return x

    def call(self, inputs):
        """Apply causal multi-head attention to (batch, time_steps, features)."""
        batch_size, time_steps, _ = inputs.shape
        self._trace(batch_size, time_steps)

        q = self.conv_q(inputs)
        k = self.conv_k(inputs)
        v = self.dense_v(inputs)

        q = self.split_heads(q, batch_size, self.dk)
        k = self.split_heads(k, batch_size, self.dk)
        v = self.split_heads(v, batch_size, self.dv)
        self._trace(q.shape)
        self._trace(k.shape)
        self._trace(v.shape)

        # Use the static sequence length when it is known; fall back to the
        # runtime length so variable-length inputs (time_steps is None) work.
        seq_len = time_steps if time_steps is not None else tf.shape(inputs)[1]
        # 1 above the diagonal (future steps), 0 elsewhere. The batch
        # dimension does not need to be set: the mask broadcasts over it.
        mask = 1 - tf.linalg.band_part(tf.ones((self.num_heads, seq_len, seq_len)), -1, 0)
        self._trace(mask.shape)

        dk = tf.cast(self.dk, tf.float32)
        self._trace(dk.dtype, q.dtype, k.dtype)

        # Scaled dot-product scores; masked positions get a large negative
        # bias so that softmax assigns them (numerically) zero weight.
        score = tf.nn.softmax(tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(dk) + mask * -1e9)

        outputs = tf.matmul(score, v)

        # (batch, heads, time, dv) -> (batch, time, heads, dv)
        outputs = tf.transpose(outputs, perm=[0, 2, 1, 3])
        self._trace(outputs.shape)
        # Concatenate the heads along the feature axis without reshaping:
        # (batch, time, heads, dv) -> (batch, time, heads * dv)
        outputs = tf.squeeze(tf.concat(tf.split(outputs, self.num_heads, axis=2), axis=-1), axis=-2)
        self._trace(outputs.shape)

        outputs = self.dense1(outputs)
        outputs = self.dense2(outputs)

        return outputs
# Symbolic input: sequences of 23 steps with a single feature per step.
input_layer = keras.layers.Input(shape=(23, 1))
q_k_dim = 32
v_dim = 64
num_heads = 2
kernel_size = 2  # passed as `filter_size`; "kernel_size" would be the more accurate name
attention_layer = AttentionLayer(
    dk=q_k_dim,
    dv=v_dim,
    num_heads=num_heads,
    filter_size=kernel_size,
)(input_layer)

None 23
(None, 2, 23, 32)
(None, 2, 23, 32)
(None, 2, 23, 64)
(2, 23, 23)
<dtype: 'float32'> <dtype: 'float32'> <dtype: 'float32'>
(None, 23, 2, 64)
(None, 23, 128)


In [62]:
# Process sequence data with self-attention
class Attention(tf.keras.layers.Layer):
    """
    Multi-Head Convolutional Self Attention Layer

    Queries and keys are produced by causal 1-D convolutions over the time
    axis, values by a dense projection. Attention is causally masked, so
    each step attends only to itself and earlier steps. The concatenated
    heads are passed through two dense layers.

    Args:
        dk: Per-head width of the query/key projections.
        dv: Per-head width of the value projection; also the width of the
            final output.
        num_heads: Number of attention heads.
        filter_size: Kernel size of the query/key convolutions.
        verbose: When True (the default, matching the original behavior),
            intermediate shapes/dtypes are printed from ``call``. Pass
            False to silence the trace.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (``name``, ...).

    Input shape:  (batch, time_steps, features)
    Output shape: (batch, time_steps, dv)
    """
    def __init__(self, dk, dv, num_heads, filter_size, verbose=True, **kwargs):
        super().__init__(**kwargs)
        self.dk = dk
        self.dv = dv
        self.num_heads = num_heads
        self.filter_size = filter_size
        self.verbose = verbose

        # Causal padding: the convolution at step t only sees steps <= t.
        self.conv_q = tf.keras.layers.Conv1D(dk * num_heads,
                                             filter_size,
                                             padding='causal',
                                             dilation_rate=1)
        self.conv_k = tf.keras.layers.Conv1D(dk * num_heads, filter_size, padding='causal', dilation_rate=1)
        self.dense_v = tf.keras.layers.Dense(dv * num_heads)
        self.dense1 = tf.keras.layers.Dense(dv, activation='relu')
        self.dense2 = tf.keras.layers.Dense(dv)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            'dk': self.dk,
            'dv': self.dv,
            'num_heads': self.num_heads,
            'filter_size': self.filter_size,
            'verbose': self.verbose,
        })
        return config

    def _trace(self, *values):
        """Print diagnostic values only when ``verbose`` is enabled."""
        if self.verbose:
            print(*values)

    def split_heads(self, x, batch_size, dim):
        """
        Rearrange (batch, time, num_heads * d) into (batch, num_heads, time, d).

        ``batch_size`` and ``dim`` are unused; they are kept so existing
        callers keep working. Splitting and concatenating (instead of
        reshaping and transposing) means the batch size is never needed,
        which matters because it is None outside eager execution.
        """
        x = tf.expand_dims(x, axis=1)
        x = tf.concat(tf.split(x, self.num_heads, axis=3), axis=1)
        return x

    def call(self, inputs):
        """Apply causal multi-head attention to (batch, time_steps, features)."""
        batch_size, time_steps, _ = inputs.shape
        self._trace(batch_size, time_steps)

        q = self.conv_q(inputs)
        k = self.conv_k(inputs)
        v = self.dense_v(inputs)
        self._trace(q.shape)
        self._trace(k.shape)
        self._trace(v.shape)
        q = self.split_heads(q, batch_size, self.dk)
        k = self.split_heads(k, batch_size, self.dk)
        v = self.split_heads(v, batch_size, self.dv)
        self._trace(q.shape)
        self._trace(k.shape)
        self._trace(v.shape)

        # Use the static sequence length when it is known; fall back to the
        # runtime length so variable-length inputs (time_steps is None) work.
        seq_len = time_steps if time_steps is not None else tf.shape(inputs)[1]
        # 1 above the diagonal (future steps), 0 elsewhere. The batch
        # dimension does not need to be set: the mask broadcasts over it.
        mask = 1 - tf.linalg.band_part(tf.ones((self.num_heads, seq_len, seq_len)), -1, 0)
        self._trace(mask.shape)

        dk = tf.cast(self.dk, tf.float32)
        self._trace(dk.dtype, q.dtype, k.dtype)

        # Scaled dot-product scores; masked positions get a large negative
        # bias so that softmax assigns them (numerically) zero weight.
        score = tf.nn.softmax(tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(dk) + mask * -1e9)

        outputs = tf.matmul(score, v)

        # (batch, heads, time, dv) -> (batch, time, heads, dv)
        outputs = tf.transpose(outputs, perm=[0, 2, 1, 3])
        self._trace(outputs.shape)
        # Concatenate the heads along the feature axis without reshaping:
        # (batch, time, heads, dv) -> (batch, time, heads * dv)
        outputs = tf.squeeze(tf.concat(tf.split(outputs, self.num_heads, axis=2), axis=-1), axis=-2)
        self._trace(outputs.shape)

        outputs = self.dense1(outputs)
        outputs = self.dense2(outputs)

        return outputs
# One random integer sequence with values in [0, 20], laid out as
# (batch=1, time_steps=23, features=1).
tensor = tf.constant(
    [random.randint(0, 20) for _ in range(23)],
    shape=[1, 23, 1],
    dtype=tf.float32,
)
q_k_dim = 64
v_dim = 64
num_heads = 6
kernel_size = 2  # passed as `filter_size`; "kernel_size" would be the more accurate name
attention = Attention(
    dk=q_k_dim,
    dv=v_dim,
    num_heads=num_heads,
    filter_size=kernel_size,
)
attention(tensor)

1 23
(1, 23, 384)
(1, 23, 384)
(1, 23, 384)
(1, 6, 23, 64)
(1, 6, 23, 64)
(1, 6, 23, 64)
(6, 23, 23)
<dtype: 'float32'> <dtype: 'float32'> <dtype: 'float32'>
(1, 23, 6, 64)
(1, 23, 384)


<tf.Tensor: shape=(1, 23, 64), dtype=float32, numpy=
array([[[ 0.11008102,  0.07459726, -0.08492145, ..., -0.11901394,
          0.08982512,  0.06120256],
        [ 0.23853333,  0.1825556 , -0.19826752, ..., -0.2719772 ,
          0.22004394,  0.15924905],
        [ 0.25555617,  0.19842339, -0.22079235, ..., -0.30405885,
          0.2452208 ,  0.17303891],
        ...,
        [ 0.276011  ,  0.14434591, -0.35215068, ..., -0.589652  ,
          0.42293638,  0.66774774],
        [ 0.4504622 ,  0.35425574, -0.4111826 , ..., -0.57004464,
          0.46835276,  0.5105071 ],
        [ 0.43305025,  0.34583712, -0.41320977, ..., -0.5793208 ,
          0.474707  ,  0.5581056 ]]], dtype=float32)>

In [25]:
# Process sequence data with an LSTM
import random

# One random integer sequence with values in [0, 20], laid out as [1, 23, 1].
tensor = tf.constant(
    [random.randint(0, 20) for _ in range(23)],
    shape=[1, 23, 1],
    dtype=tf.float32,
)
# return_sequences -> per-step outputs; return_state -> the two final states.
lstm = tf.keras.layers.LSTM(units=64, return_sequences=True, return_state=True)
whole_seq_output, final_memory_state, final_carry_state = lstm(tensor)
whole_seq_output

<tf.Tensor: shape=(1, 23, 64), dtype=float32, numpy=
array([[[ 0.09600929,  0.17820291,  0.2513949 , ...,  0.33010876,
         -0.1308659 , -0.426734  ],
        [ 0.13310671,  0.27456215,  0.36580214, ...,  0.4285784 ,
         -0.2159098 , -0.69556177],
        [ 0.2550217 ,  0.34249434,  0.37278575, ...,  0.41329986,
         -0.18618973, -0.6584412 ],
        ...,
        [ 0.28866166,  0.5017709 ,  0.44298515, ...,  0.44206208,
         -0.14011145, -0.77515763],
        [ 0.47553673,  0.2951322 ,  0.30566636, ...,  0.36804664,
         -0.03105145, -0.47216243],
        [ 0.2822839 ,  0.47943288,  0.38585803, ...,  0.42223808,
         -0.07439169, -0.72712535]]], dtype=float32)>