In [1]:
import tensorflow as tf
import numpy as np
import random
import copy
from a2_base_model import BaseClass
from a2_encoder import Encoder
import os

  from ._conv import register_converters as _register_converters


input_x: Tensor("input_x:0", shape=(128, 50), dtype=int32)
get_mask==>result: Tensor("mul:0", shape=(50, 50), dtype=float32)


In [2]:
import logging

# Emit "timestamp:LEVEL:message" records for INFO and above, and expose a
# logger for the rest of the notebook to use.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
)
logger = logging.getLogger(__name__)

In [5]:
# TOKEN is the file-name prefix of the pre-computed arrays under ../input.
TOKEN="article"
# Used as the model's sequence_length when the Transformer is built.
MAX_SEQ_LEN=1500

# Training split: inputs, labels and a per-example length array.
train_X=np.load(f'../input/{TOKEN}_train_X.npy')
train_y=np.load(f'../input/{TOKEN}_train_y.npy')
train_len=np.load(f'../input/{TOKEN}_train_len.npy')

# Dev split, same layout as the training split.
dev_X=np.load(f'../input/{TOKEN}_dev_X.npy')
dev_y=np.load(f'../input/{TOKEN}_dev_y.npy')
dev_len=np.load(f'../input/{TOKEN}_dev_len.npy')

# Embedding matrix and vocabulary array; len(valid_char) is used as vocab_size.
embedding=np.load(f'../input/{TOKEN}_embedding.npy')
valid_char=np.load(f'../input/{TOKEN}_valid_char.npy')

# Class names "1".."19" as strings. Built with a comprehension so no loop
# index is left behind in the notebook namespace.
labels=[str(i) for i in range(1,20)]

In [78]:
class Transformer(BaseClass):
    """Encoder-only Transformer text classifier built as a TensorFlow 1.x graph.

    Token ids are embedded, scaled by sqrt(d_model), offset by a learned
    per-position term, passed through the ``Encoder`` stack, flattened and
    linearly projected to ``num_classes`` logits.

    Constructing an instance adds placeholders, weights, logits, predictions
    and accuracy to the current default graph; when ``is_training`` is truthy
    it also adds the loss and the training op.
    """

    def __init__(self, num_classes, learning_rate, batch_size, decay_steps, decay_rate, sequence_length,
                 vocab_size,embedding_mat,embed_size,d_model,d_k,d_v,h,num_layer,is_training,
                 initializer=tf.random_normal_initializer(stddev=0.1),clip_gradients=5.0,l2_lambda=0.0001,use_residual_conn=False):
        """Store hyperparameters and build the graph.

        Args:
            num_classes: number of output classes (width of the logits).
            learning_rate: initial learning rate, held in a non-trainable variable.
            batch_size: fixed batch size baked into the placeholders.
            decay_steps: step interval for the exponential learning-rate decay.
            decay_rate: multiplicative factor of that decay.
            sequence_length: fixed number of tokens per example.
            vocab_size: number of rows of the embedding table.
            embedding_mat: forwarded to ``instantiate_weights``, which currently
                ignores it (the embedding table is randomly initialised).
            embed_size: accepted for interface compatibility but NOT used; the
                embedding width is always ``d_model``.
            d_model, d_k, d_v, h, num_layer: forwarded to ``BaseClass`` and the
                ``Encoder``.
            is_training: when falsy, the loss and train op are not built.
            initializer: initializer for the embedding, position term and
                projection weights.
            clip_gradients: gradient clipping value handed to ``optimize_loss``.
            l2_lambda: weight of the L2 penalty used by ``loss``.
            use_residual_conn: forwarded to the ``Encoder``.
        """
        # The parent is expected to set self.d_model, self.d_k, self.d_v, self.h,
        # self.batch_size and self.num_layer, which are read below -- TODO confirm
        # against a2_base_model.
        super(Transformer, self).__init__(d_model, d_k, d_v, sequence_length, h, batch_size, num_layer=num_layer)

        self.num_classes = num_classes
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        # The embedding width must match d_model, so `embed_size` is deliberately ignored.
        self.embed_size = d_model
        self.learning_rate = tf.Variable(learning_rate, trainable=False, name="learning_rate")
        # Running this op halves the base learning rate in place.
        self.learning_rate_decay_half_op = tf.assign(self.learning_rate, self.learning_rate * 0.5)
        self.initializer = initializer
        self.clip_gradients=clip_gradients
        self.l2_lambda=l2_lambda

        self.is_training=is_training
        # Fixed-shape placeholders: every fed batch must contain exactly batch_size rows.
        self.input_x = tf.placeholder(tf.int32, [self.batch_size, self.sequence_length], name="input_x")
        # One row of num_classes values per example; argmax over axis 1 is taken as the true class.
        self.input_y_label = tf.placeholder(tf.int32, [self.batch_size,num_classes], name="input_y_label")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        self.global_step = tf.Variable(0, trainable=False, name="Global_Step")
        self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step")
        self.epoch_increment = tf.assign(self.epoch_step, tf.add(self.epoch_step, tf.constant(1)))
        self.decay_steps, self.decay_rate = decay_steps, decay_rate
        self.use_residual_conn=use_residual_conn

        self.instantiate_weights(embedding_mat)
        self.logits = self.inference()  # [batch_size, num_classes]

        self.predictions = tf.argmax(self.logits, axis=1, name="predictions")
        self.pred=tf.cast(self.predictions, tf.int32)
        self.true=tf.cast(tf.argmax(self.input_y_label,1),tf.int32)
        correct_prediction = tf.equal(self.pred,self.true)
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="Accuracy")  # scalar

        # Inference-only graphs need neither a loss nor back-propagation.
        # Truthiness (not `is False`) is used so that 0 or a numpy bool also
        # selects the inference-only path.
        if not self.is_training:
            return
        print('logits',self.logits.shape)
        print('labels',self.input_y_label.shape)
        self.loss_val = self.loss()
        self.train_op = self.train()

    def inference(self):
        """Build the forward pass and return the logits.

        Steps: embedding lookup -> scale by sqrt(d_model) -> add a learned
        per-position term -> ``Encoder`` stack -> flatten -> linear projection.
        There is no decoder in this model.

        Returns:
            Tensor of shape [batch_size, num_classes].
        """
        # 1. Token embedding, scaled by sqrt(d_model).
        input_x_embeded = tf.nn.embedding_lookup(self.Embedding,self.input_x)  # [batch_size, sequence_length, d_model]
        input_x_embeded=tf.multiply(input_x_embeded,tf.sqrt(tf.cast(self.d_model,dtype=tf.float32)))
        # Despite its name, "input_mask" is a trainable [sequence_length, 1] variable:
        # one scalar per position, broadcast across the batch and embedding axes.
        input_mask=tf.get_variable("input_mask",[self.sequence_length,1],initializer=self.initializer)
        input_x_embeded=tf.add(input_x_embeded,input_mask)

        # 2. Encoder: the same embedded tensor is passed as both inputs.
        encoder_class=Encoder(self.d_model,self.d_k,self.d_v,self.sequence_length,self.h,self.batch_size,self.num_layer,input_x_embeded,input_x_embeded,dropout_keep_prob=self.dropout_keep_prob,use_residual_conn=self.use_residual_conn)
        Q_encoded,_ = encoder_class.encoder_fn()  # second return value is not needed here

        # 3. Flatten each example and project to class scores. W_projection has
        # sequence_length * d_model rows, so that is the flattened width expected here.
        Q_encoded=tf.reshape(Q_encoded,shape=(self.batch_size,-1))
        with tf.variable_scope("output"):
            logits = tf.matmul(Q_encoded, self.W_projection) + self.b_projection  # [batch_size, num_classes]
        return logits

    def loss(self, l2_lambda=None):
        """Mean softmax cross-entropy plus an L2 penalty.

        Args:
            l2_lambda: weight of the L2 term. When omitted, the value given to
                the constructor (``self.l2_lambda``) is used; previously the
                constructor value was silently ignored.

        Returns:
            Scalar loss tensor.
        """
        if l2_lambda is None:
            l2_lambda = self.l2_lambda
        with tf.name_scope("loss"):
            # One cross-entropy value per example, from [batch_size, num_classes] labels and logits.
            losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y_label,logits=self.logits)
            loss = tf.reduce_mean(losses)
            # L2 over every trainable variable whose name contains neither 'bias' nor 'alpha'.
            l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if ('bias' not in v.name ) and ('alpha' not in v.name)]) * l2_lambda
            loss = loss + l2_losses
        return loss

    def train(self):
        """Build the training op: Adam with staircase exponential decay and gradient clipping.

        Returns:
            The op produced by ``tf.contrib.layers.optimize_loss``; it is given
            ``self.global_step`` as its global step.
        """
        learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps,self.decay_rate, staircase=True)
        # Kept on the instance so the decayed rate can be fetched for inspection.
        self.learning_rate_=learning_rate
        train_op = tf.contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step,
                                                   learning_rate=learning_rate, optimizer="Adam",clip_gradients=self.clip_gradients)
        return train_op

    def instantiate_weights(self,embedding_mat):
        """Create the embedding table and the output projection.

        Args:
            embedding_mat: currently unused; the embedding table is created
                with ``self.initializer`` rather than from this matrix.
        """
        with tf.variable_scope("embedding_projection"):
            self.Embedding = tf.get_variable("Embedding", shape=[self.vocab_size, self.embed_size],initializer=self.initializer)
            # Projection from the flattened encoder output to class scores.
            self.W_projection = tf.get_variable("W_projection", shape=[self.sequence_length*self.d_model, self.num_classes],initializer=self.initializer)  # [sequence_length*d_model, num_classes]
            self.b_projection = tf.get_variable("b_projection", shape=[self.num_classes])

    def get_mask(self,sequence_length):
        """Return a [sequence_length, sequence_length] additive mask.

        Entries on and below the diagonal are 0; entries above it are -1e9.
        Not called anywhere in this class.
        """
        lower_triangle = tf.matrix_band_part(tf.ones([sequence_length, sequence_length]), -1, 0)
        result = -1e9 * (1.0 - lower_triangle)
        print("get_mask==>result:", result)
        return result

In [79]:
def batch_iter(X,y,batch_size,num_epochs,shuffle=True):
    """Yield aligned ``(X_batch, y_batch)`` slices for ``num_epochs`` passes.

    Only full batches are produced: a trailing slice shorter than
    ``batch_size`` is skipped. When ``shuffle`` is true, both arrays are
    re-permuted with the same random order at the start of every epoch.
    """
    total = len(X)
    batches_per_epoch = int((total - 1) / batch_size) + 1
    for _ in range(num_epochs):
        if shuffle:
            # Same permutation for both arrays keeps inputs and labels paired.
            order = np.random.permutation(np.arange(total))
            X, y = X[order], y[order]
        for idx in range(batches_per_epoch):
            lo = idx * batch_size
            hi = lo + batch_size
            # Skip the incomplete batch at the end of the epoch.
            if hi <= total:
                yield X[lo:hi], y[lo:hi]

In [86]:
def train_model(rain_X,train_y,dev_X,dev_y,batch_size,epoch):
    """Train a ``Transformer`` classifier and log dev-set F1/accuracy after each epoch.

    Args:
        rain_X: training inputs. The name is a typo for ``train_X``; it is kept
            so the signature is unchanged, and is bound to ``train_X`` below.
        train_y: training labels, row-aligned with ``rain_X``.
        dev_X: dev inputs.
        dev_y: dev labels, row-aligned with ``dev_X``.
        batch_size: examples per step. Used for the model's fixed-size
            placeholders and for both training and dev batching.
        epoch: number of passes over the training data.

    Relies on notebook globals ``MAX_SEQ_LEN``, ``valid_char``, ``embedding``,
    ``logger``, ``f1_score`` and ``accuracy_score``.
    """
    # Use the argument, not whatever global `train_X` happens to exist.
    train_X = rain_X

    # batch_iter only yields full batches, so an epoch is exactly this many
    # optimizer steps; a ceiling count would drift off the epoch boundary.
    steps_per_epoch = len(train_X) // batch_size
    if steps_per_epoch == 0:
        logger.warning('fewer training examples than batch_size; no training step will run')

    with tf.Graph().as_default():
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
        # The context manager installs the session as default and closes it on
        # exit, including when training is interrupted.
        with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
            model=Transformer(num_classes=19,
                              learning_rate=0.01,
                              batch_size=batch_size,
                              decay_steps=max(1,steps_per_epoch),  # decay once per epoch; never 0
                              decay_rate=0.9,
                              sequence_length=MAX_SEQ_LEN,
                              vocab_size=len(valid_char),
                              embedding_mat=embedding,
                              embed_size=512,
                              d_model=512,d_k=64,d_v=64,h=8,
                              num_layer=2,is_training=True,
                              use_residual_conn=True)
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch,y_batch):
                """Run one optimizer step with dropout enabled."""
                feed_dict={
                    model.input_x:x_batch,
                    model.input_y_label:y_batch,
                    model.dropout_keep_prob:0.8
                }
                sess.run(model.train_op,feed_dict)

            def dev_step(x_batch,y_batch):
                """Return (predicted, true) class ids for one batch, dropout disabled."""
                feed_dict={
                    model.input_x:x_batch,
                    model.input_y_label:y_batch,
                    model.dropout_keep_prob:1.0
                }
                y_pred,y_true=sess.run([model.pred,model.true],feed_dict)
                return y_pred,y_true

            num_epoch=0
            for (x_batch,y_batch) in batch_iter(train_X,train_y,batch_size,epoch,shuffle=True):
                train_step(x_batch,y_batch)
                current_step=tf.train.global_step(sess,model.global_step)
                if current_step%steps_per_epoch==0:
                    num_epoch+=1
                    # The placeholders have a fixed batch dimension, so dev
                    # batches must use the same batch_size as the model.
                    pred=[]
                    true=[]
                    for (x_bat,y_bat) in batch_iter(dev_X,dev_y,batch_size,1,False):
                        y_pred,y_true=dev_step(x_bat,y_bat)
                        pred+=y_pred.tolist()
                        true+=y_true.tolist()
                    f1=f1_score(true,pred,average='weighted')
                    acc=accuracy_score(true,pred)
                    logger.info(f'{num_epoch}epoch: f1:{f1}   acc:{acc}')

In [83]:
from sklearn.metrics import f1_score,accuracy_score

In [87]:
# Train for 50 epochs with mini-batches of 64 examples.
train_model(train_X, train_y, dev_X, dev_y, batch_size=64, epoch=50)

encoder_fn.started.
MultiHeadAttention.self.dropout_rate: Tensor("base_mode_sub_layer_multi_head_attention_encoder0/sub:0", dtype=float32)
self.sequence_length: 1500
LayerNormResidualConnection.use_residual_conn: True
output_conv1: Tensor("sub_layer_postion_wise_feed_forwardencoder0/conv1:0", shape=(64, 1500, 1, 1), dtype=float32)
LayerNormResidualConnection.use_residual_conn: True
encoder_fn. 0 .Q: Tensor("layer_normalization0encoder_postion_wise_ff/add_1:0", shape=(64, 1500, 512), dtype=float32) ;K_s: Tensor("layer_normalization0encoder_postion_wise_ff/add_1:0", shape=(64, 1500, 512), dtype=float32)
MultiHeadAttention.self.dropout_rate: Tensor("base_mode_sub_layer_multi_head_attention_encoder1/sub:0", dtype=float32)
self.sequence_length: 1500
LayerNormResidualConnection.use_residual_conn: True
output_conv1: Tensor("sub_layer_postion_wise_feed_forwardencoder1/conv1:0", shape=(64, 1500, 1, 1), dtype=float32)
LayerNormResidualConnection.use_residual_conn: True
encoder_fn. 1 .Q: Tensor("

2018-08-15 15:17:33,076:INFO:1epoch: f1:0.441113949265775   acc:0.4406446540880503
2018-08-15 15:19:22,304:INFO:2epoch: f1:0.4949696738299477   acc:0.451749213836478
2018-08-15 15:21:11,283:INFO:3epoch: f1:0.49357964035325363   acc:0.5003930817610063
2018-08-15 15:23:00,344:INFO:4epoch: f1:0.46649731457193955   acc:0.48653694968553457
2018-08-15 15:24:49,309:INFO:5epoch: f1:0.5073927077403181   acc:0.500687893081761
2018-08-15 15:26:38,444:INFO:6epoch: f1:0.5709372332392986   acc:0.5771422955974843
2018-08-15 15:28:27,480:INFO:7epoch: f1:0.5917240251242682   acc:0.597189465408805
2018-08-15 15:30:16,865:INFO:8epoch: f1:0.601667710738157   acc:0.6132075471698113
2018-08-15 15:32:06,023:INFO:9epoch: f1:0.5876701763486154   acc:0.5994496855345912
2018-08-15 15:33:55,441:INFO:10epoch: f1:0.5889426481976898   acc:0.5838246855345912
2018-08-15 15:35:44,678:INFO:11epoch: f1:0.5986425924521063   acc:0.6099646226415094
2018-08-15 15:37:33,701:INFO:12epoch: f1:0.6259269268137746   acc:0.61930031

KeyboardInterrupt: 

In [None]:
# Scratch lookup: prints the built-in help for the initializer type that
# Transformer.__init__ uses as its default `initializer`.
help(tf.random_normal_initializer())