In [117]:
import pandas as pd
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon, init
from mxnet.gluon import nn, rnn
import gluonnlp as nlp
import jieba
import multiprocessing as mp
import time
from d2l import try_gpu
import itertools
from sklearn.metrics import accuracy_score, f1_score
# fixed random number seed
np.random.seed(9102)
mx.random.seed(9102)

In [118]:
DATA_FOLDER = 'data/'
TRAIN_DATA = 'train.csv'
WORD_EMBED = 'sgns.weibo.word' 
N_ROWS=50000
ctx = try_gpu()

In [119]:
train_df = pd.read_csv(DATA_FOLDER+TRAIN_DATA, nrows=N_ROWS, sep='|')

In [120]:
dataset =[ [row[0], row[1]] for _, row in train_df.iterrows()]
train_dataset, valid_dataset = nlp.data.train_valid_split(dataset, .2)
len(train_dataset), len(valid_dataset)

(40000, 10000)

In [121]:
def tokenizer(x):
    tweet, label = x
    word_list = jieba.lcut(tweet)
    return word_list, label

def get_length(x):
    return float(len(x[0]))

def to_word_list(dataset):
    start = time.time()
    with mp.Pool() as pool:
        # Each sample is processed in an asynchronous manner.
        dataset = gluon.data.ArrayDataset(pool.map(tokenizer, dataset))
        lengths = gluon.data.ArrayDataset(pool.map(get_length, dataset))
    end = time.time()

    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
    return dataset, lengths

train_word_list, train_word_lengths = to_word_list(train_dataset)
valid_word_list, valid_word_lengths = to_word_list(valid_dataset)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Building prefix dict from the default dictionary ...
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Building prefix dict from the default dictionary ...
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix 

Done! Tokenizing Time=3.24s, #Sentences=40000


Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Building prefix dict from the default dictionary ...
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
DEBUG:jieba:Buil

Done! Tokenizing Time=2.63s, #Sentences=10000


In [122]:
train_seqs = [sample[0] for sample in train_word_list]
counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))

vocab = nlp.Vocab(counter, max_size=20000)

# load customed pre-trained embedding
embedding_weights = nlp.embedding.TokenEmbedding.from_file(file_path=DATA_FOLDER+WORD_EMBED)
vocab.set_embedding(embedding_weights)
print(vocab)

def token_to_idx(x):
    return vocab[x[0]], x[1]

# A token index or a list of token indices is returned according to the vocabulary.
with mp.Pool() as pool:
    train_dataset = pool.map(token_to_idx, train_word_list)
    valid_dataset = pool.map(token_to_idx, valid_word_list)

  .format(line_num, pretrained_file_path))


Vocab(size=20004, unk="<unk>", reserved="['<pad>', '<bos>', '<eos>']")


In [123]:
batch_size = 64
bucket_num = 10
bucket_ratio = 0.5


def get_dataloader():
    # Construct the DataLoader Pad data, stack label and lengths
    batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0), \
                                          nlp.data.batchify.Stack())

    # in this example, we use a FixedBucketSampler,
    # which assigns each data sample to a fixed bucket based on its length.
    batch_sampler = nlp.data.sampler.FixedBucketSampler(
        train_word_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True)
    print(batch_sampler.stats())

    # train_dataloader
    train_dataloader = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=batch_sampler,
        batchify_fn=batchify_fn)
    # valid_dataloader
    valid_dataloader = gluon.data.DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=batchify_fn)
    return train_dataloader, valid_dataloader

train_dataloader, valid_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=40000, batch_num=210
  key=[18, 31, 44, 57, 70, 83, 96, 109, 122, 135]
  cnt=[32991, 3840, 1480, 740, 491, 319, 120, 11, 5, 3]
  batch_size=[240, 139, 98, 75, 64, 64, 64, 64, 64, 64]


In [124]:
for tweet, label in train_dataloader:
    print(tweet, label)
    break


[[1.570e+03 5.000e+00 2.200e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 [1.960e+02 5.160e+03 4.689e+03 ... 6.020e+02 8.000e+00 0.000e+00]
 [0.000e+00 6.830e+02 5.592e+03 ... 0.000e+00 0.000e+00 0.000e+00]
 ...
 [3.897e+03 4.000e+00 1.910e+02 ... 0.000e+00 0.000e+00 0.000e+00]
 [7.000e+00 1.600e+01 4.660e+02 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.135e+03 8.000e+00 7.600e+01 ... 0.000e+00 0.000e+00 0.000e+00]]
<NDArray 240x18 @cpu_shared(0)> 
[3 5 3 5 5 3 3 1 1 8 3 8 3 8 0 7 9 0 3 3 0 3 4 9 4 4 0 3 8 0 4 4 4 4 2 1 2
 5 4 3 8 6 4 4 2 5 3 2 5 4 5 5 6 9 9 4 5 6 3 4 3 1 1 7 1 2 5 0 5 9 0 8 9 3
 5 4 0 6 8 2 5 5 4 3 3 0 4 2 1 4 2 9 4 3 2 1 9 4 6 9 3 2 4 9 9 4 3 0 3 5 5
 4 7 1 9 1 0 2 1 9 6 0 3 2 1 4 8 2 4 2 4 9 5 8 6 8 6 5 9 8 2 0 2 7 8 9 3 7
 9 4 1 1 5 9 5 7 5 0 1 7 9 8 0 3 3 6 8 3 4 3 2 7 3 9 3 9 2 3 9 4 7 3 5 3 2
 2 5 6 3 5 4 0 5 5 3 4 2 4 6 9 4 5 9 8 1 2 2 4 8 2 2 5 8 4 8 3 3 8 0 9 1 9
 5 8 4 4 7 6 5 7 3 1 5 5 5 1 0 7 4 4]
<NDArray 240 @cpu_shared(0)>


## Model contruction
Self attention layer, weighted cross entropy, and whole model

In [125]:
# custom attention layer
# in this class, we want to implement the operation:
# softmax(W_2 * tanh(W_1 * H))
# where H is the word embedding of the whole sentence, of shape (num_of_word, embed_size)
class SelfAttention(nn.HybridBlock):
    def __init__(self, att_unit, att_hops, **kwargs):
        super(SelfAttention, self).__init__(**kwargs)
        with self.name_scope():
            # this layer is tanh(w_1 * H), the att_unit corresponds to d_a in the essay
            self.ut_dense = nn.Dense(att_unit, activation='tanh', flatten=False)
            # this layer implements the multiple hops
            self.et_dense = nn.Dense(att_hops, activation=None, flatten=False)

    def hybrid_forward(self, F, x): # F is the backend which implements the tensor operation
        # x shape: [batch_size, seq_len, embedding_width]
        # ut shape: [batch_size, seq_len, att_unit]
        ut = self.ut_dense(x) # batch_size * seq_len [* embed_size * embed_size *] att_unit
        # et shape: [batch_size, seq_len, att_hops]
        et = self.et_dense(ut)# batch_size * seq_len [* att_unit * att_unit *] att_hops

        # att shape: [batch_size,  att_hops, seq_len]
        # softmax is performed along the seq_len dimension
        att = F.softmax(F.transpose(et, axes=(0, 2, 1)), axis=-1)
        # output shape [batch_size, att_hops, embedding_width]
        output = F.batch_dot(att, x)
        # output is the weighted matrix representation of the matrix
        # att is the weighted vector we use as attention
        return output, att
    
# d_a = 20, hops = 5
print(SelfAttention(20, 5))

SelfAttention(
  (ut_dense): Dense(None -> 20, Activation(tanh))
  (et_dense): Dense(None -> 5, linear)
)


In [126]:
class WeightedSoftmaxCE(nn.HybridBlock):
    def __init__(self, sparse_label=True, from_logits=False,  **kwargs):
        super(WeightedSoftmaxCE, self).__init__(**kwargs)
        with self.name_scope():
            self.sparse_label = sparse_label
            self.from_logits = from_logits

    def hybrid_forward(self, F, pred, label, class_weight, depth=None):
        if self.sparse_label:
            label = F.reshape(label, shape=(-1, ))
            label = F.one_hot(label, depth)
        if not self.from_logits:
            pred = F.log_softmax(pred, -1)

        weight_label = F.broadcast_mul(label, class_weight)
        loss = -F.sum(pred * weight_label, axis=-1)

        # return F.mean(loss, axis=0, exclude=True)
        return loss

In [127]:
class SelfAttentiveBiLSTM(nn.HybridBlock):
    def __init__(self, vocab_len, embsize, nhidden, nlayers, natt_unit, natt_hops, \
                 nfc, nclass, # these two params are not used currrently
                 drop_prob, pool_way, prune_p=None, prune_q=None, **kwargs):
        super(SelfAttentiveBiLSTM, self).__init__(**kwargs)
        with self.name_scope():
            # now we switch back to shared layers
            self.embedding_layer = nn.Embedding(vocab_len, embsize)
            
            self.bilstm = rnn.LSTM(nhidden, num_layers=nlayers, dropout=drop_prob, \
                                        bidirectional=True)
            
            self.att_encoder = SelfAttention(natt_unit, natt_hops)
            self.dense = nn.Dense(nfc, activation='tanh')
            # this layer is used to output the final class
            self.output_layer = nn.Dense(nclass)
            
            self.dense_p, self.dense_q = None, None
            if all([prune_p, prune_q]):
                self.dense_p = nn.Dense(prune_p, activation='tanh', flatten=False)
                self.dense_q = nn.Dense(prune_q, activation='tanh', flatten=False)

            self.drop_prob = drop_prob
            self.pool_way = pool_way

    def hybrid_forward(self, F, inp):
        # inp_embed size: [batch, seq_len, embed_size]
        inp_embed = self.embedding_layer(inp)
        # rnn requires the first dimension to be the time steps
        h_output = self.bilstm(F.transpose(inp_embed, axes=(1, 0, 2)))
        # att_output: [batch, att_hops, emsize]
        output, att = self.att_encoder(F.transpose(h_output, axes=(1, 0, 2)))
        '''
        FIXME: now this code will only work with flatten
        '''
        dense_input = None
        if self.pool_way == 'flatten':
            dense_input = F.Dropout(F.flatten(output), self.drop_prob)
        else:
            raise NotImplemented
        '''
        elif self.pool_way == 'mean':
            dense_input = F.Dropout(F.mean(att_output, axis=1), self.drop_prob)
        elif self.pool_way == 'prune' and all([self.dense_p, self.dense_q]):
            # p_section: [batch, att_hops, prune_p]
            p_section = self.dense_p(att_output)
            # q_section: [batch, emsize, prune_q]
            q_section = self.dense_q(F.transpose(att_output, axes=(0, 2, 1)))
            dense_input = F.Dropout(F.concat(F.flatten(p_section), F.flatten(q_section), dim=-1), self.drop_prob)
        '''
        dense_out = self.dense(dense_input)
        output = self.output_layer(F.Dropout(dense_out, self.drop_prob))

        return output, att

In [128]:
vocab_len = len(vocab)
emsize = 300   # word embedding size
nhidden = 300    # lstm hidden_dim
nlayers = 2     # lstm layers
natt_unit = 300     # the hidden_units of attention layer
natt_hops = 20    # the channels of attention
nfc = 512  # last dense layer size
nclass = 72 # we have 72 emoji in total

drop_prob = 0
pool_way = 'flatten'    # # The way to handle M
prune_p = None
prune_q = None

ctx = try_gpu()

model = SelfAttentiveBiLSTM(vocab_len, emsize, nhidden, nlayers,
                            natt_unit, natt_hops, nfc, nclass,
                            drop_prob, pool_way, prune_p, prune_q)

model.initialize(init=init.Xavier(), ctx=ctx)
model.hybridize()

# Attach a pre-trained glove word vector to the embedding layer
model.embedding_layer.weight.set_data(vocab.embedding.idx_to_vec)
# fixed the embedding layer
model.embedding_layer.collect_params().setattr('grad_req', 'null')

print(model)

SelfAttentiveBiLSTM(
  (embedding_layer): Embedding(20004 -> 300, float32)
  (bilstm): LSTM(None -> 300, TNC, num_layers=2, bidirectional)
  (att_encoder): SelfAttention(
    (ut_dense): Dense(None -> 300, Activation(tanh))
    (et_dense): Dense(None -> 20, linear)
  )
  (dense): Dense(None -> 512, Activation(tanh))
  (output_layer): Dense(None -> 72, linear)
)


In [129]:
vocab.embedding.idx_to_vec[15], vocab.embedding.token_to_idx['用']

(
 [-1.10227e-01  1.08770e-01  4.29162e-01 -2.35640e-01  1.42376e-01
   2.28883e-01  2.58560e-02  2.00537e-01  1.75788e-01  3.71191e-01
   3.62292e-01 -4.53204e-01  2.59187e-01 -2.68126e-01 -4.26229e-01
  -4.58031e-01 -2.79040e-01  2.77700e-03  3.84063e-01  1.95392e-01
   2.52946e-01 -8.82490e-02  6.61550e-02  6.06530e-02 -1.81174e-01
   1.03426e-01 -2.29491e-01  1.63837e-01 -1.18820e-02 -2.51057e-01
   3.30090e-02 -1.70570e-02 -2.50483e-01  8.84330e-02  3.87295e-01
  -8.91180e-02 -1.68160e-02 -7.72610e-02 -1.49332e-01 -3.09141e-01
  -1.70612e-01  2.17445e-01 -3.46855e-01 -6.69050e-02  2.34772e-01
  -8.18120e-02  6.82540e-01  3.94989e-01 -2.64939e-01 -1.54585e-01
   3.50940e-02 -8.46420e-02 -1.83615e-01  2.74247e-01 -3.89414e-01
   2.04440e-02  2.28073e-01  9.59450e-02  2.27668e-01  2.52022e-01
   1.24785e-01  2.00795e-01 -1.37927e-01 -2.09281e-01  9.78250e-02
  -8.04370e-02  2.25407e-01  3.29207e-01  9.87380e-02 -1.78504e-01
  -3.52110e-02 -2.15054e-01 -1.65906e-01  1.03336e-01  5.422

## Training helpers
Calculate loss, one epoch computation and top function for train and valid

In [130]:
def calculate_loss(x, y, model, loss, class_weight, penal_coeff):
    pred, att = model(x)
    y = nd.array(y.asnumpy().astype('int32')).as_in_context(ctx)
    if loss_name in ['sce', 'l1', 'l2']:
        l = loss(pred, y)
    elif loss_name == 'wsce':
        l = loss(pred, y, class_weight, class_weight.shape[0])
    else:
        raise NotImplemented
    # penalty, now we have two att's
    diversity_penalty = nd.batch_dot(att, nd.transpose(att, axes=(0, 2, 1))) - \
                        nd.eye(att.shape[1], ctx=att.context)
    l = l + penal_coeff * diversity_penalty.norm(axis=(1, 2))

    return pred, l

In [131]:
def one_epoch(data_iter, model, loss, trainer, ctx, is_train, epoch,
              penal_coeff=0.0, clip=None, class_weight=None, loss_name='sce'):

    loss_val = 0.
    total_pred = []
    total_true = []
    n_batch = 0

    for batch_x, batch_y in data_iter:
        batch_x = batch_x.as_in_context(ctx)
        batch_y = batch_y.as_in_context(ctx)

        if is_train:
            with autograd.record():
                batch_pred, l = calculate_loss(batch_x, batch_y, model, \
                                               loss, class_weight, penal_coeff)

            # backward calculate
            l.backward()

            # clip gradient
            clip_params = [p.data() for p in model.collect_params().values()]
            if clip is not None:
                norm = nd.array([0.0], ctx)
                for param in clip_params:
                    if param.grad is not None:
                        norm += (param.grad ** 2).sum()
                norm = norm.sqrt().asscalar()
                if norm > clip:
                    for param in clip_params:
                        if param.grad is not None:
                            param.grad[:] *= clip / norm

            # update parmas
            trainer.step(batch_x.shape[0])

        else:
            batch_pred, l = calculate_loss(batch_x, batch_y, model, \
                                           loss, class_weight, penal_coeff)

        # keep result for metric
        batch_pred = nd.argmax(nd.softmax(batch_pred, axis=1), axis=1).asnumpy()
        batch_true = np.reshape(batch_y.asnumpy(), (-1, ))
        total_pred.extend(batch_pred.tolist())
        total_true.extend(batch_true.tolist())
        
        batch_loss = l.mean().asscalar()

        n_batch += 1
        loss_val += batch_loss

        # check the result of traing phase
        if is_train and n_batch % 400 == 0:
            print('epoch %d, batch %d, batch_train_loss %.4f, batch_train_acc %.3f' %
                  (epoch, n_batch, batch_loss, accuracy_score(batch_true, batch_pred)))

    # metric
    F1 = f1_score(np.array(total_true), np.array(total_pred), average='weighted')
    acc = accuracy_score(np.array(total_true), np.array(total_pred))
    loss_val /= n_batch

    if is_train:
        print('epoch %d, learning_rate %.5f \n\t train_loss %.4f, acc_train %.3f, F1_train %.3f, ' %
              (epoch, trainer.learning_rate, loss_val, acc, F1))
        # declay lr
        if epoch % 3 == 0:
            trainer.set_learning_rate(trainer.learning_rate * 0.9)
    else:
        print('\t valid_loss %.4f, acc_valid %.3f, F1_valid %.3f, ' % (loss_val, acc, F1))

In [132]:
def train_valid(data_iter_train, data_iter_valid, model, loss, trainer, ctx, nepochs,
                penal_coeff=0.0, clip=None, class_weight=None, loss_name='sce'):

    for epoch in range(1, nepochs+1):
        start = time.time()
        # train
        is_train = True
        one_epoch(data_iter_train, model, loss, trainer, ctx, is_train,
                  epoch, penal_coeff, clip, class_weight, loss_name)

        # valid
        is_train = False
        one_epoch(data_iter_valid, model, loss, trainer, ctx, is_train,
                  epoch, penal_coeff, clip, class_weight, loss_name)
        end = time.time()
        print('time %.2f sec' % (end-start))
        print("*"*100)

## Train
Now we will train this model

In [135]:
class_weight = None
loss_name = 'sce'
optim = 'adam'
lr = 0.001
penal_coeff = 0.003
clip = .5
nepochs = 40

trainer = gluon.Trainer(model.collect_params(), optim, {'learning_rate': lr})

if loss_name == 'sce':
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
elif loss_name == 'wsce':
    loss = WeightedSoftmaxCE()
    # the value of class_weight is obtained by counting data in advance. It can be seen as a hyperparameter.
    class_weight = nd.array([1., 3.], ctx=ctx)
elif loss_name == 'l1':
    loss = gluon.loss.L1Loss()
elif loss_name == 'l2':
    loss = gluon.loss.L2Loss()

In [None]:
# train and valid
train_valid(train_dataloader, valid_dataloader, model, loss, \
            trainer, ctx, nepochs, penal_coeff=penal_coeff, \
            clip=clip, class_weight=class_weight, loss_name=loss_name)

epoch 1, learning_rate 0.00100 
	 train_loss 2.7377, acc_train 0.142, F1_train 0.093, 
	 valid_loss 2.5979, acc_valid 0.176, F1_valid 0.052, 
time 15.68 sec
****************************************************************************************************
epoch 2, learning_rate 0.00100 
	 train_loss 2.8613, acc_train 0.127, F1_train 0.092, 
	 valid_loss 2.2557, acc_valid 0.176, F1_valid 0.052, 
time 15.56 sec
****************************************************************************************************
epoch 3, learning_rate 0.00100 
	 train_loss 2.4439, acc_train 0.143, F1_train 0.094, 
	 valid_loss 3.8958, acc_valid 0.176, F1_valid 0.052, 
time 15.58 sec
****************************************************************************************************
epoch 4, learning_rate 0.00090 
	 train_loss 3.8203, acc_train 0.133, F1_train 0.092, 
	 valid_loss 2.3923, acc_valid 0.176, F1_valid 0.052, 
time 15.57 sec
*********************************************************************

epoch 33, learning_rate 0.00035 
	 train_loss 2.3522, acc_train 0.158, F1_train 0.072, 
	 valid_loss 2.2862, acc_valid 0.176, F1_valid 0.052, 
time 15.78 sec
****************************************************************************************************
epoch 34, learning_rate 0.00031 
	 train_loss 2.3597, acc_train 0.142, F1_train 0.073, 
	 valid_loss 2.2412, acc_valid 0.176, F1_valid 0.052, 
time 15.68 sec
****************************************************************************************************
epoch 35, learning_rate 0.00031 
	 train_loss 2.3695, acc_train 0.150, F1_train 0.064, 
	 valid_loss 2.2397, acc_valid 0.176, F1_valid 0.052, 
time 15.73 sec
****************************************************************************************************
epoch 36, learning_rate 0.00031 
	 train_loss 2.3329, acc_train 0.164, F1_train 0.067, 
	 valid_loss 2.8890, acc_valid 0.100, F1_valid 0.018, 
time 15.74 sec
*****************************************************************