In [1]:
# ! -*- coding: utf-8 -*-
# from keras.engine.topology import Layer
from tensorflow.keras.layers import Layer 
import tensorflow as tf

class BatchSlice(Layer):
    def __init__(self, dim=2, **kwargs):
        self.dim = dim
        super(BatchSlice, self).__init__(**kwargs)
    '''
    build(input_shape): 这是你定义权重的地方。这个方法必须设 self.built = True，可以通过调用 super([Layer], self).build() 完成
    '''
    def build(self, input_shape):
        super(BatchSlice, self).build(input_shape)
    '''
    call(x): 这里是编写层的功能逻辑的地方。你只需要关注传入 call 的第一个参数：输入张量，除非你希望你的层支持masking。
    '''
    def call(self, x, mask=None):
        x, length = x # [bs, len, dim]
        length = tf.cast(tf.reduce_max(length), tf.int32)
        st = [0] * self.dim
        ed = [-1] * self.dim
        ed[1] = length
        x = tf.slice(x, st, ed)

        return x

In [2]:
from tensorflow.keras.layers import Layer
from tensorflow.keras.initializers import VarianceScaling
from tensorflow.keras.regularizers import *
import tensorflow as tf
from tensorflow.keras import backend as K


class context2query_attention(Layer):

    def __init__(self, output_dim, c_maxlen, q_maxlen, dropout, **kwargs):
        self.output_dim = output_dim
        self.c_maxlen = c_maxlen
        self.q_maxlen = q_maxlen
        self.dropout = dropout
        super(context2query_attention, self).__init__(**kwargs)

    '''
    build(input_shape): 这是你定义权重的地方。这个方法必须设 self.built = True，可以通过调用 super([Layer], self).build() 完成
    '''

    def build(self, input_shape):
        # input_shape: [(None, ?, 128), (None, ?, 128)]
        init = VarianceScaling(scale=1.0, mode='fan_avg', distribution='uniform')
        self.W0 = self.add_weight(name='W0',
                                  shape=(input_shape[0][-1], 1),
                                  initializer=init,
                                  regularizer=l2(3e-7),
                                  trainable=True)
        self.W1 = self.add_weight(name='W1',
                                  shape=(input_shape[1][-1], 1),
                                  initializer=init,
                                  regularizer=l2(3e-7),
                                  trainable=True)
        self.W2 = self.add_weight(name='W2',
                                  shape=(1, 1, input_shape[0][-1]),
                                  initializer=init,
                                  regularizer=l2(3e-7),
                                  trainable=True)
        self.bias = self.add_weight(name='linear_bias',
                                    shape=([1]),
                                    initializer='zero',
                                    regularizer=l2(3e-7),
                                    trainable=True)
        super(context2query_attention, self).build(input_shape)

    def mask_logits(self, inputs, mask, mask_value=-1e30):
        mask = tf.cast(mask, tf.float32)
        return inputs + mask_value * (1 - mask)
    
    '''
    call(x): 这里是编写层的功能逻辑的地方。你只需要关注传入 call 的第一个参数：输入张量，除非你希望你的层支持masking。
    '''


    def call(self, x, mask=None):
        x_cont, x_ques, c_mask, q_mask = x
        # get similarity matrix S
        subres0 = K.tile(K.dot(x_cont, self.W0), [1, 1, self.q_maxlen])
        subres1 = K.tile(K.permute_dimensions(K.dot(x_ques, self.W1), pattern=(0, 2, 1)), [1, self.c_maxlen, 1])
        subres2 = K.batch_dot(x_cont * self.W2, K.permute_dimensions(x_ques, pattern=(0, 2, 1)))
        S = subres0 + subres1 + subres2
        S += self.bias
        q_mask = tf.expand_dims(q_mask, 1)
        S_ = tf.nn.softmax(self.mask_logits(S, q_mask))
        c_mask = tf.expand_dims(c_mask, 2)
        S_T = K.permute_dimensions(tf.nn.softmax(self.mask_logits(S, c_mask), axis=1), (0, 2, 1))
        c2q = tf.matmul(S_, x_ques)
        q2c = tf.matmul(tf.matmul(S_, S_T), x_cont)
        result = K.concatenate([x_cont, c2q, x_cont * c2q, x_cont * q2c], axis=-1)
        #result Tensor("context2query_attention/concat_2:0", shape=(None, None, 512), dtype=float32)
        print('result',result)
        return result

    '''
    compute_output_shape(input_shape): 如果你的层更改了输入张量的形状，你应该在这里定义形状变化的逻辑，这让Keras能够自动推断各层的形状
    '''
    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.output_dim)


In [3]:
from tensorflow.keras.layers import Layer
from tensorflow.keras.initializers import VarianceScaling
from tensorflow.keras.regularizers import *
import tensorflow as tf
from tensorflow.keras import backend as K


class DepthwiseConv1D(Layer):

    def __init__(self, kernel_size, filter, **kwargs):
        self.kernel_size = kernel_size
        self.filter = filter
        super(DepthwiseConv1D, self).__init__(**kwargs)

    def build(self, input_shape):
        init_relu = VarianceScaling(scale=2.0, mode='fan_in', distribution='normal')
        self.depthwise_w = self.add_weight("depthwise_filter",
                                           shape=(self.kernel_size, 1, input_shape[-1], 1),
                                           initializer=init_relu,
                                           regularizer=l2(3e-7),
                                           trainable=True)
        self.pointwise_w = self.add_weight("pointwise_filter",
                                           (1, 1, input_shape[-1], self.filter),
                                           initializer=init_relu,
                                           regularizer=l2(3e-7),
                                           trainable=True)
        self.bias = self.add_weight("bias",
                                    input_shape[-1],
                                    regularizer=l2(3e-7),
                                    initializer=tf.zeros_initializer())
        super(DepthwiseConv1D, self).build(input_shape)

    def call(self, x, mask=None):
        x = K.expand_dims(x, axis=2)
        x = tf.nn.separable_conv2d(x,
                                   self.depthwise_w,
                                   self.pointwise_w,
                                   strides=(1, 1, 1, 1),
                                   padding="SAME")
        x += self.bias
        x = K.relu(x)
        outputs = K.squeeze(x, axis=2)

        return outputs

    def compute_output_shape(self, input_shape):
        return input_shape


In [4]:
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.training import slot_creator
import tensorflow.keras.backend as K
# from tensorflow.keras.backend import moving_averages
from tensorflow.keras.backend import moving_average_update
from tqdm import tqdm


class ExponentialMovingAverage():
    def __init__(self, model, decay, weights_list=None, temp_model='temp_model.h5',
                 name='ExponentialMovingAverage', type='cpu'):
        # EMA for keras, the example can be seen in https://github.com/ewrfcas/QANet_keras/blob/master/train_QANet.py
        # init before training, but after the model init.
        self.model = model
        self.scope_name = name
        self.temp_model = temp_model
        self.type = type
        self.decay = decay
        self._averages = {}

        if weights_list is None:
            weights_list = self.model.trainable_weights

        if self.type == 'gpu':
            self.sess = K.get_session()
            for weight in weights_list:
                if weight.dtype.base_dtype not in [tf.float16, tf.float32,
                                                   tf.float64]:
                    raise TypeError("The variables must be half, float, or double: %s" %
                                    weight.name)
                if weight in self._averages:
                    raise ValueError("Moving average already computed for: %s" % weight.name)

                # For variables: to lower communication bandwidth across devices we keep
                # the moving averages on the same device as the variables. For other
                # tensors, we rely on the existing device allocation mechanism.
                with ops.init_scope():
                    if isinstance(weight, tf.Variable):
                        avg = slot_creator.create_slot(weight,
                                                       weight.initialized_value(),
                                                       self.scope_name,
                                                       colocate_with_primary=True)
                        # NOTE(mrry): We only add `tf.Variable` objects to the
                        # `MOVING_AVERAGE_VARIABLES` collection.
                        ops.add_to_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES, weight)
                    else:
                        avg = slot_creator.create_zeros_slot(weight,
                                                             self.scope_name,
                                                             colocate_with_primary=(weight.op.type in ["Variable",
                                                                                                       "VariableV2",
                                                                                                       "VarHandleOp"]))
                self._averages[weight] = avg

            with tf.name_scope(self.scope_name):
                decay = ops.convert_to_tensor(decay, name="decay")
                self.updates = []
                for var in weights_list:
                    self.updates.append(
                        moving_averages.assign_moving_average(self._averages[var], var, decay, zero_debias=False))

                self.assigns = []
                for weight in weights_list:
                    self.assigns.append(tf.assign(weight, self._averages[weight]))

            self.sess.run(tf.global_variables_initializer())

        elif self.type == 'cpu':
            print('CPU EMA getting weights...')
            for weight in tqdm(weights_list):
                self._averages[weight.name] = K.get_value(weight)

    def average_update(self):
        # run in the end of each batch
        if self.type == 'gpu':
            self.sess.run(self.updates)
        elif self.type == 'cpu':
            for weight in self.model.trainable_weights:
                old_val = self._averages[weight.name]
                self._averages[weight.name] = self.decay * old_val + (1.0 - self.decay) * K.get_value(weight)

    def assign_shadow_weights(self, backup=True):
        # run while you need to assign shadow weights (at end of each epoch or the total training)
        if backup:
            self.model.save_weights(self.temp_model)

        if self.type == 'gpu':
            self.sess.run(self.assigns)
        elif self.type == 'cpu':
            print('CPU EMA assigning weights...')
            for weight in tqdm(self.model.trainable_weights):
                K.set_value(weight, self._averages[weight.name])


In [5]:
# ! -*- coding: utf-8 -*-
from tensorflow.keras.layers import Layer
import tensorflow as tf

def shape_list(x):
  """Return list of dims, statically where possible."""
  x = tf.convert_to_tensor(x)

  # If unknown rank, return dynamic shape
  if x.get_shape().dims is None:
    return tf.shape(x)

  static = x.get_shape().as_list()
  shape = tf.shape(x)

  ret = []
  for i in range(len(static)):
    dim = static[i]
    if dim is None:
      dim = shape[i]
    ret.append(dim)
  return ret

class LabelPadding(Layer):
    def __init__(self, max_len, **kwargs):
        self.max_len = max_len
        super(LabelPadding, self).__init__(**kwargs)

    def build(self, input_shape):
        super(LabelPadding, self).build(input_shape)

    def call(self, x, mask=None, training=None):
        tensor_shape = shape_list(x) # [bs, len]
        zero_paddings = tf.zeros((tensor_shape[0], self.max_len - tensor_shape[1]))
        x = tf.concat([x, zero_paddings], axis=-1)
        return x

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.max_len)

In [6]:
# ! -*- coding: utf-8 -*-
from tensorflow.keras.layers import Layer
import tensorflow as tf
import tensorflow.keras.backend as K

class LayerDropout(Layer):
    def __init__(self, dropout = 0.1, **kwargs):
        self.dropout = dropout
        super(LayerDropout, self).__init__(**kwargs)

    def build(self, input_shape):
        super(LayerDropout, self).build(input_shape)
    #0.2 should be self.dropout，但这里总是出错，就先改成0.2
    def call(self, x, mask=None, training=None):
        x, residual = x
        pred = tf.random.uniform([]) < self.dropout
        #print('self.dropout',self.dropout)
        x_train = tf.cond(pred, lambda: residual, lambda: tf.nn.dropout(x, 1.0 -0.2 ) + residual)
        x_test = x + residual
        return K.in_train_phase(x_train, x_test, training=training)

    def compute_output_shape(self, input_shape):
        return input_shape

In [7]:
# ! -*- coding: utf-8 -*-
from tensorflow.keras.layers import Layer
import tensorflow as tf
import tensorflow.keras.backend as K

class LayerNormalization(Layer):
    def __init__(self, **kwargs):
        super(LayerNormalization, self).__init__(**kwargs)

    def build(self, input_shape):
        self.scale = self.add_weight(name='layer_norm_scale',
                                    shape=(input_shape[-1]),
                                    initializer=tf.ones_initializer(),
                                    trainable=True)
        self.bias = self.add_weight(name='layer_norm_bias',
                                    shape=(input_shape[-1]),
                                    initializer=tf.zeros_initializer(),
                                    trainable=True)
        super(LayerNormalization, self).build(input_shape)

    def call(self, x, mask=None, training=None):
        mean = tf.reduce_mean(x, axis=-1, keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=-1, keepdims=True)
        norm_x = (x - mean) * tf.math.rsqrt(variance + K.epsilon())
        return norm_x * self.scale + self.bias

    def compute_output_shape(self, input_shape):
        return input_shape

In [8]:
# ! -*- coding: utf-8 -*-

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer
import tensorflow as tf


class MultiHeadAttention(Layer):
    def __init__(self, units, num_heads, dropout=0.1, bias=True, **kwargs):
        self.units = units
        self.num_heads = num_heads
        self.dropout = dropout
        self.bias = bias
        super(MultiHeadAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        if self.bias:
            self.b = self.add_weight(name='bias',
                                     shape=([1]),
                                     initializer='zero')
        super(MultiHeadAttention, self).build(input_shape)

    def split_last_dimension(self, x, n):
        old_shape = x.get_shape().dims
        last = old_shape[-1]
        new_shape = old_shape[:-1] + [n] + [last // n if last else None]
        ret = tf.reshape(x, tf.concat([tf.shape(x)[:-1], [n, -1]], 0))
        ret.set_shape(new_shape)
        return tf.transpose(ret, [0, 2, 1, 3])

    def mask_logits(self, inputs, mask, mask_value=-1e30):
        mask = tf.cast(mask, tf.float32)
        return inputs + mask_value * (1 - mask)

    def dot_product_attention(self, x, mask=None, dropout=0.1, training=None):
        q, k, v = x
        logits = tf.matmul(q, k, transpose_b=True)  # [bs, 8, len, len]
        if self.bias:
            logits += self.b
        if mask is not None:  # [bs, len]
            mask = tf.expand_dims(mask, axis=1)
            mask = tf.expand_dims(mask, axis=1)  # [bs,1,1,len]
            logits = self.mask_logits(logits, mask)
        weights = tf.nn.softmax(logits, name="attention_weights")
        weights = K.in_train_phase(K.dropout(weights, dropout), weights, training=training)
        x = tf.matmul(weights, v)
        return x

    def combine_last_two_dimensions(self, x):
        old_shape = x.get_shape().dims
        a, b = old_shape[-2:]
        new_shape = old_shape[:-2] + [a * b if a and b else None]
        ret = tf.reshape(x, tf.concat([tf.shape(x)[:-2], [-1]], 0))
        ret.set_shape(new_shape)
        return ret

    def call(self, x, mask=None, training=None):
        memory, query, seq_mask = x
        Q = self.split_last_dimension(query, self.num_heads)
        memory = tf.split(memory, 2, axis=2)
        K = self.split_last_dimension(memory[0], self.num_heads)
        V = self.split_last_dimension(memory[1], self.num_heads)

        key_depth_per_head = self.units // self.num_heads
        Q *= (key_depth_per_head ** -0.5)
        x = self.dot_product_attention([Q, K, V], seq_mask, dropout=self.dropout, training=training)
        x = self.combine_last_two_dimensions(tf.transpose(x, [0, 2, 1, 3]))

        return x

    def compute_output_shape(self, input_shape):
        return input_shape[1]


In [9]:
MultiHeadAttention

__main__.MultiHeadAttention

In [10]:
from tensorflow.keras.layers import Layer
import tensorflow as tf
import math


class Position_Embedding(Layer):
    def __init__(self, min_timescale=1.0, max_timescale=1.0e4, **kwargs):
        self.min_timescale = min_timescale
        self.max_timescale = max_timescale
        super(Position_Embedding, self).__init__(**kwargs)

    def get_timing_signal_1d(self, length, channels):
        position=tf.cast(tf.range(length),dtype=tf.float32)
        num_timescales = channels // 2
        log_timescale_increment = (math.log(float(self.max_timescale) / float(self.min_timescale)) / (tf.cast(num_timescales,dtype=tf.float32) - 1))
        inv_timescales = self.min_timescale * tf.exp(tf.cast(tf.range(num_timescales),dtype=tf.float32) * -log_timescale_increment)
        scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
        signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
        signal = tf.pad(signal, [[0, 0], [0, tf.math.mod(channels, 2)]])
        signal = tf.reshape(signal, [1, length, channels])
        return signal

    def add_timing_signal_1d(self, x):
        length = tf.shape(x)[1]
        channels = tf.shape(x)[2]
        signal = self.get_timing_signal_1d(length, channels)
        return x + signal

    def call(self, x, mask=None):
        return self.add_timing_signal_1d(x)

    def compute_output_shape(self, input_shape):
        return input_shape

In [11]:
PositionEmbedding=Position_Embedding
PositionEmbedding

__main__.Position_Embedding

In [12]:
# ! -*- coding: utf-8 -*-
from tensorflow.keras.layers import Layer
from tensorflow.keras.regularizers import *
import tensorflow as tf
import tensorflow.keras.backend as K

class QAoutputBlock(Layer):
    def __init__(self, ans_limit=30, **kwargs):
        self.ans_limit = ans_limit
        super(QAoutputBlock, self).__init__(**kwargs)

    def build(self, input_shape):
        super(QAoutputBlock, self).build(input_shape)

    def call(self, x, mask=None):
        x1 ,x2 = x
        outer = tf.matmul(tf.expand_dims(x1, axis=2), tf.expand_dims(x2, axis=1))
        outer = tf.linalg.band_part(outer, 0, self.ans_limit)
        output1 = tf.reshape(tf.cast(tf.argmax(tf.reduce_max(outer, axis=2), axis=1), tf.float32),(-1,1))
        output2 = tf.reshape(tf.cast(tf.argmax(tf.reduce_max(outer, axis=1), axis=1), tf.float32),(-1,1))

        return [output1, output2]

    def compute_output_shape(self, input_shape):
        return [(input_shape[0][0],1), (input_shape[0][0],1)]

In [13]:
from tensorflow.keras.layers import *
from tensorflow.keras.regularizers import *
from tensorflow.keras.models import *
# from layers.context2query_attention import context2query_attention

# from layers.multihead_attention import Attention as MultiHeadAttention
# from layers.position_embedding import Position_Embedding as PositionEmbedding
# from layers.layer_norm import LayerNormalization
# from layers.layer_dropout import LayerDropout
# from layers.QAoutputBlock import QAoutputBlock
# from layers.BatchSlice import BatchSlice
# from layers.DepthwiseConv1D import DepthwiseConv1D
# from layers.LabelPadding import LabelPadding
from tensorflow.keras.initializers import VarianceScaling
import tensorflow as tf
import tensorflow.keras.backend as K

regularizer = l2(3e-7)
init = VarianceScaling(scale=1.0, mode='fan_avg', distribution='uniform')
init_relu = VarianceScaling(scale=2.0, mode='fan_in', distribution='normal')


def mask_logits(inputs, mask, mask_value=-1e30):
    mask = tf.cast(mask, tf.float32)
    return inputs + mask_value * (1 - mask)


def highway(highway_layers, x, num_layers=2, dropout=0.1):
    # reduce dim
    x = highway_layers[0](x)
    for i in range(num_layers):
        T = highway_layers[i * 2 + 1](x)
        H = highway_layers[i * 2 + 2](x)
        H = Dropout(dropout)(H)
        x = Lambda(lambda v: v[0] * v[1] + v[2] * (1 - v[1]))([H, T, x])
    return x


def conv_block(conv_layers, x, num_conv=4, dropout=0.1, l=1., L=1.):
    for i in range(num_conv):
        residual = x
        x = LayerNormalization()(x)
        if i % 2 == 0:
            x = Dropout(dropout)(x)
        x = conv_layers[i](x)
        x = LayerDropout(dropout * (l / L))([x, residual])
    return x


def attention_block(attention_layer, x, seq_mask, dropout=0.1, l=1., L=1.):
    residual = x
    x = LayerNormalization()(x)
    x = Dropout(dropout)(x)
    x1 = attention_layer[0](x)
    x2 = attention_layer[1](x)
    x = attention_layer[2]([x1, x2, seq_mask])
    x = LayerDropout(dropout * (l / L))([x, residual])
    return x


def feed_forward_block(FeedForward_layers, x, dropout=0.1, l=1., L=1.):
    residual = x
    x = LayerNormalization()(x)
    x = Dropout(dropout)(x)
    x = FeedForward_layers[0](x)
    x = FeedForward_layers[1](x)
    x = LayerDropout(dropout * (l / L))([x, residual])
    return x


def QANet(config, word_mat=None, char_mat=None, cove_model=None):
    # parameters
    word_dim = config['word_dim']
    char_dim = config['char_dim']
    cont_limit = config['cont_limit']
    char_limit = config['char_limit']
    ans_limit = config['ans_limit']
    filters = config['filters']
    num_head = config['num_head']
    dropout = config['dropout']

    # Input Embedding Layer
    #`Input()` is used to instantiate a Keras tensor.S
    contw_input_ = Input((None,))
    quesw_input_ = Input((None,))
    contc_input_ = Input((None, char_limit))
    quesc_input_ = Input((None, char_limit))

    # get mask
    c_mask = Lambda(lambda x: tf.cast(x, tf.bool))(contw_input_)  # [bs, c_len]
    q_mask = Lambda(lambda x: tf.cast(x, tf.bool))(quesw_input_)
    cont_len = Lambda(lambda x: tf.expand_dims(tf.reduce_sum(tf.cast(x, tf.int32), axis=1), axis=1))(c_mask)
    ques_len = Lambda(lambda x: tf.expand_dims(tf.reduce_sum(tf.cast(x, tf.int32), axis=1), axis=1))(q_mask)

    # slice
    contw_input = BatchSlice(dim=2)([contw_input_, cont_len])
    quesw_input = BatchSlice(dim=2)([quesw_input_, ques_len])
    contc_input = BatchSlice(dim=3)([contc_input_, cont_len])
    quesc_input = BatchSlice(dim=3)([quesc_input_, ques_len])
    c_mask = BatchSlice(dim=2)([c_mask, cont_len])
    q_mask = BatchSlice(dim=2)([q_mask, ques_len])
    c_maxlen = tf.cast(tf.reduce_max(cont_len), tf.int32)
    q_maxlen = tf.cast(tf.reduce_max(ques_len), tf.int32)

    # embedding word
    WordEmbedding = Embedding(word_mat.shape[0], word_dim, weights=[word_mat], trainable=False, name='word_embedding')
    xw_cont = WordEmbedding(contw_input)
    xw_ques = WordEmbedding(quesw_input)

    # cove
    if cove_model is not None:
        x_cont_cove = cove_model(xw_cont)
        x_ques_cove = cove_model(xw_ques)
        xw_cont = Concatenate()([xw_cont, x_cont_cove])
        xw_ques = Concatenate()([xw_ques, x_ques_cove])

    # embedding char
    CharEmbedding = Embedding(char_mat.shape[0], char_dim, weights=[char_mat], name='char_embedding')
    xc_cont = CharEmbedding(contc_input)
    xc_ques = CharEmbedding(quesc_input)
    char_conv = Conv1D(filters, 5,
                       activation='relu',
                       kernel_initializer=init_relu,
                       kernel_regularizer=regularizer,
                       name='char_conv')
    xc_cont = Lambda(lambda x: tf.reshape(x, (-1, char_limit, char_dim)))(xc_cont)
    xc_ques = Lambda(lambda x: tf.reshape(x, (-1, char_limit, char_dim)))(xc_ques)
    xc_cont = char_conv(xc_cont)
    xc_ques = char_conv(xc_ques)
    xc_cont = GlobalMaxPooling1D()(xc_cont)
    xc_ques = GlobalMaxPooling1D()(xc_ques)
    xc_cont = Lambda(lambda x: tf.reshape(x, (-1, c_maxlen, filters)))(xc_cont)
    xc_ques = Lambda(lambda x: tf.reshape(x, (-1, q_maxlen, filters)))(xc_ques)

    # highwayNet
    x_cont = Concatenate()([xw_cont, xc_cont])
    x_ques = Concatenate()([xw_ques, xc_ques])

    # highway shared layers
    highway_layers = [Conv1D(filters, 1,
                             kernel_initializer=init,
                             kernel_regularizer=regularizer,
                             name='highway_input_projection')]
    for i in range(2):
        highway_layers.append(Conv1D(filters, 1,
                                     kernel_initializer=init,
                                     kernel_regularizer=regularizer,
                                     activation='sigmoid',
                                     name='highway' + str(i) + '_gate'))
        highway_layers.append(Conv1D(filters, 1,
                                     kernel_initializer=init,
                                     kernel_regularizer=regularizer,
                                     activation='linear',
                                     name='highway' + str(i) + '_linear'))
    x_cont = highway(highway_layers, x_cont, num_layers=2, dropout=dropout)
    x_ques = highway(highway_layers, x_ques, num_layers=2, dropout=dropout)

    # build shared layers
    # shared convs
    Encoder_DepthwiseConv1 = []
    for i in range(4):
        Encoder_DepthwiseConv1.append(DepthwiseConv1D(7, filters))

    # shared attention
    Encoder_SelfAttention1 = [Conv1D(2 * filters, 1,
                                     kernel_initializer=init,
                                     kernel_regularizer=regularizer),
                              Conv1D(filters, 1,
                                     kernel_initializer=init,
                                     kernel_regularizer=regularizer),
                              MultiHeadAttention(filters, num_head, dropout=dropout, bias=False)]
    # shared feed-forward
    Encoder_FeedForward1 = []
    Encoder_FeedForward1.append(Conv1D(filters, 1,
                                       kernel_initializer=init,
                                       kernel_regularizer=regularizer,
                                       activation='relu'))
    Encoder_FeedForward1.append(Conv1D(filters, 1,
                                       kernel_initializer=init,
                                       kernel_regularizer=regularizer,
                                       activation='linear'))

    # Context Embedding Encoder Layer
    x_cont = PositionEmbedding()(x_cont)
    x_cont = conv_block(Encoder_DepthwiseConv1, x_cont, 4, dropout)
    x_cont = attention_block(Encoder_SelfAttention1, x_cont, c_mask, dropout)
    x_cont = feed_forward_block(Encoder_FeedForward1, x_cont, dropout)

    # Question Embedding Encoder Layer
    x_ques = PositionEmbedding()(x_ques)
    x_ques = conv_block(Encoder_DepthwiseConv1, x_ques, 4, dropout)
    x_ques = attention_block(Encoder_SelfAttention1, x_ques, q_mask, dropout)
    x_ques = feed_forward_block(Encoder_FeedForward1, x_ques, dropout)
    
    print('x_cont={}\n  x_ques={}\n  c_mask={}\n  q_mask={}\n'.format(x_cont, x_ques, c_mask, q_mask))

    # Context_to_Query_Attention_Layer
    ##512, c_maxlen, q_maxlen, dropout初始化该层的类，输入为[x_cont, x_ques, c_mask, q_mask]
    x = context2query_attention(512, c_maxlen, q_maxlen, dropout)([x_cont, x_ques, c_mask, q_mask])
    
    print('Context_to_Query_Attention_Layer x',x)
    x = Conv1D(filters, 1,
               kernel_initializer=init,
               kernel_regularizer=regularizer,
               activation='linear')(x)

    print('conv1d x',x)
    # Model_Encoder_Layer
    # shared layers
    Encoder_DepthwiseConv2 = []
    Encoder_SelfAttention2 = []
    Encoder_FeedForward2 = []
    for i in range(7):
        DepthwiseConv_share_2_temp = []
        for i in range(2):
            DepthwiseConv_share_2_temp.append(DepthwiseConv1D(5, filters))

        Encoder_DepthwiseConv2.append(DepthwiseConv_share_2_temp)
        Encoder_SelfAttention2.append([Conv1D(2 * filters, 1,
                                              kernel_initializer=init,
                                              kernel_regularizer=regularizer),
                                       Conv1D(filters, 1,
                                              kernel_initializer=init,
                                              kernel_regularizer=regularizer),
                                       MultiHeadAttention(filters, num_head, dropout=dropout, bias=False)])
        Encoder_FeedForward2.append([Conv1D(filters, 1,
                                            kernel_initializer=init,
                                            kernel_regularizer=regularizer,
                                            activation='relu'),
                                     Conv1D(filters, 1,
                                            kernel_initializer=init,
                                            kernel_regularizer=regularizer,
                                            activation='linear')])

    outputs = [x]
    for i in range(3):
        x = outputs[-1]
        for j in range(7):
            x = PositionEmbedding()(x)
            x = conv_block(Encoder_DepthwiseConv2[j], x, 2, dropout, l=j, L=7)
            x = attention_block(Encoder_SelfAttention2[j], x, c_mask, dropout, l=j, L=7)
            x = feed_forward_block(Encoder_FeedForward2[j], x, dropout, l=j, L=7)
        outputs.append(x)
     
    print('outputs',outputs)
    # Output_Layer
    x_start = Concatenate()([outputs[1], outputs[2]])
    print('output_layer x_start',x_start)
    '''
    keras.layers.Conv1D(filters, kernel_size, strides=1, padding='valid', data_format='channels_last', dilation_rate=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', bias_initializer='zeros', kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, bias_constraint=None)
    
    Input shape:
      3D tensor with shape: `(batch_size, time_steps, input_dim)`

  Output shape:
      3D tensor with shape: `(batch_size, new_steps, filters)`
      `steps` value might have changed due to padding or strides.

这也可以解释，为什么在Keras中使用Conv1D可以进行自然语言处理，因为在自然语言处理中，我们假设一个序列是600个单词，每个单词的词向量是300维，那么一个序列输入到网络中就是（600,300），当我使用Conv1D进行卷积的时候，实际上就完成了直接在序列上的卷积，卷积的时候实际是以（3,300）进行卷积，又因为每一行都是一个词向量，因此使用Conv1D（kernel_size=3）也就相当于使用神经网络进行了n_gram=3的特征提取了。这也是为什么使用卷积神经网络处理文本会非常快速有效的内涵。

Conv1D（kernel_size=3）实际就是Conv2D（kernel_size=（3,300）），当然必须把输入也reshape成（600,300,1），即可在多行上进行Conv2D卷积。
所以这里的kernel_size=1，是conv2d的（1，词向量维度）



    '''
    x_start = Conv1D(1, 1,
                     kernel_initializer=init,
                     kernel_regularizer=regularizer,
                     activation='linear')(x_start)
    print('conv1D x_start',x_start)
    
    #从tensor中删除所有大小是1的维度
    x_start = Lambda(lambda x: tf.squeeze(x, axis=-1))(x_start)
    print('squeeze x_start',x_start)
    
    
    ## mask_logits输出维度与输入维度一样
    x_start = Lambda(lambda x: mask_logits(x[0], x[1]))([x_start, c_mask])
    print('mask_logits x_start',x_start)
    
    ##输出的x_start是已经经过了softmax计算之后的值
    
    '''
    softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis)
    Returns:
    A `Tensor`. Has the same type and shape as `logits`.
    '''
    x_start = Lambda(lambda x: K.softmax(x), name='start')(x_start)  # [bs, len]
    print('x_start softmax',x_start,)

    x_end = Concatenate()([outputs[1], outputs[3]])
    x_end = Conv1D(1, 1,
                   kernel_initializer=init,
                   kernel_regularizer=regularizer,
                   activation='linear')(x_end)
    x_end = Lambda(lambda x: tf.squeeze(x, axis=-1))(x_end)
    x_end = Lambda(lambda x: mask_logits(x[0], x[1]))([x_end, c_mask])
    x_end = Lambda(lambda x: K.softmax(x), name='end')(x_end)  # [bs, len]

    x_start_fin, x_end_fin = QAoutputBlock(ans_limit, name='qa_output')([x_start, x_end])

    # if use model.fit, the output shape must be padded to the max length
    x_start = LabelPadding(cont_limit, name='start_pos')(x_start)
    x_end = LabelPadding(cont_limit, name='end_pos')(x_end)
    print('x_start  x_start_fin x_end x_end_fin ',x_start,x_start_fin,x_end,x_end_fin)
    return Model(inputs=[contw_input_, quesw_input_, contc_input_, quesc_input_],
                 outputs=[x_start, x_end, x_start_fin, x_end_fin])


In [14]:
from tensorflow.keras.optimizers import *
from tensorflow.keras.initializers import *
# from QANet_keras import QANet
import numpy as np
import tensorflow as tf

#关闭eager模式
tf.compat.v1.disable_eager_execution()

tf.keras.backend.set_learning_phase(1)  # training

#就代表生成10000行 300列的浮点数，浮点数都是从0-1中随机。
##模拟的是预训练的词向量，生成QAnet模型，然后训练时对于某个词，该词存在该向量中，就取出该词向量进行使用，不存在就使用随机的词向量
##所以模型中一定有embedding模块

embedding_matrix = np.random.random((10000, 300))
embedding_matrix_char = np.random.random((1233, 64))
config = {
    'word_dim': 300,
    'char_dim': 64,
    'cont_limit': 400,
    'ques_limit': 50,
    'char_limit': 16,
    'ans_limit': 30,
    'char_input_size': 1233,
    'filters': 128,
    'num_head': 8,
    'dropout': 0.5,
    'batch_size': 16,
    'epoch': 25,
    'ema_decay': 0.9999,
    'learning_rate': 1e-3,
    'path': 'QA001',
    'use_cove': True
}
model = QANet(config, word_mat=embedding_matrix, char_mat=embedding_matrix_char)
model.summary()

optimizer = Adam(lr=0.001, beta_1=0.8, beta_2=0.999, epsilon=1e-7)

##损失函数有4个应该是模型有4个输出，对应4个label,计算每个输出的损失函数，加权求和后最为最终的损失函数，权重即为loss_weights
model.compile(optimizer=optimizer, loss=['categorical_crossentropy', 'categorical_crossentropy', 'mae', 'mae'],
              loss_weights=[1, 1, 0, 0])

# load data
char_dim = 200
cont_limit = 400
ques_limit = 50
char_limit = 16
#生成维度为（300，cont_limit）,大小在0-10000之间的随机整数##上下文长度最大400个词，每个词的维度是300d（感觉不对，应该是有300个上下文）
context_word = np.random.randint(0, 10000, (300, cont_limit))
question_word = np.random.randint(0, 10000, (300, ques_limit))

##最多400个词，每个词最多16个字符，字符维度也是300维度
context_char = np.random.randint(0, 96, (300, cont_limit, char_limit))
question_char = np.random.randint(0, 96, (300, ques_limit, char_limit))

start_label = np.random.randint(0, 2, (300, cont_limit))
end_label = np.random.randint(0, 2, (300, cont_limit))
start_label_fin = np.argmax(start_label, axis=-1)
end_label_fin = np.argmax(end_label, axis=-1)
'''
fit(x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, 
validation_split=0.0, validation_data=None, shuffle=True, class_weight=None, 
sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_freq=1)

即
x=[context_word, question_word, context_char, question_char],
y=[start_label, end_label, start_label_fin, end_label_fin]

Model(inputs=[contw_input_, quesw_input_, contc_input_, quesc_input_],
                 outputs=[x_start, x_end, x_start_fin, x_end_fin])
                 
Model根据输入经过网络得到输出，输出和对应的label求出损失函数，损失函数加权后作为最终的损失函数，优化器使得最终的损失函数最小
'''
model.fit([context_word, question_word, context_char, question_char],
          [start_label, end_label, start_label_fin, end_label_fin], batch_size=8)


W1012 07:36:02.857562 140566226282240 deprecation.py:506] From /usr/local/lib/python3.5/dist-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W1012 07:36:04.123937 140566226282240 nn_ops.py:4283] Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W1012 07:36:04.226406 140566226282240 nn_ops.py:4283] Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W1012 07:36:04.332454 140566226282240 nn_ops.py:4283] Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W1012 07:36:04.426476 1

x_cont=Tensor("layer_dropout_5/cond/Identity:0", shape=(None, None, 128), dtype=float32)
  x_ques=Tensor("layer_dropout_11/cond/Identity:0", shape=(None, None, 128), dtype=float32)
  c_mask=Tensor("batch_slice_4/Slice:0", shape=(None, None), dtype=bool)
  q_mask=Tensor("batch_slice_5/Slice:0", shape=(None, None), dtype=bool)

result Tensor("context2query_attention/concat_2:0", shape=(None, None, 512), dtype=float32)
Context_to_Query_Attention_Layer x Tensor("context2query_attention/concat_2:0", shape=(None, None, 512), dtype=float32)
conv1d x Tensor("conv1d_4_1/BiasAdd:0", shape=(None, None, 128), dtype=float32)
outputs [<tf.Tensor 'conv1d_4_1/BiasAdd:0' shape=(None, None, 128) dtype=float32>, <tf.Tensor 'layer_dropout_39/cond/Identity:0' shape=(None, None, 128) dtype=float32>, <tf.Tensor 'layer_dropout_67/cond/Identity:0' shape=(None, None, 128) dtype=float32>, <tf.Tensor 'layer_dropout_95/cond/Identity:0' shape=(None, None, 128) dtype=float32>]
output_layer x_start Tensor("concatenat

layer_normalization_93 (LayerNo (None, None, 128)    256         layer_dropout_92[0][0]           
__________________________________________________________________________________________________
layer_dropout_93 (LayerDropout) (None, None, 128)    0           depthwise_conv1d_17[2][0]        
                                                                 layer_dropout_92[0][0]           
__________________________________________________________________________________________________
layer_normalization_94 (LayerNo (None, None, 128)    256         layer_dropout_93[0][0]           
__________________________________________________________________________________________________
dropout_73 (Dropout)            (None, None, 128)    0           layer_normalization_94[0][0]     
__________________________________________________________________________________________________
layer_dropout_94 (LayerDropout) (None, None, 128)    0           multi_head_attention_7[2][0]     
          

<tensorflow.python.keras.callbacks.History at 0x7fd727c9ae48>