In [2]:
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb


max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
#model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
'''model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=[x_test, y_test])'''

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)
Train...


'model.fit(x_train, y_train,\n          batch_size=batch_size,\n          epochs=4,\n          validation_data=[x_test, y_test])'

In [3]:
train_x = x_train
train_y = y_train
dev_x = x_test
dev_y = y_test

from sklearn.preprocessing import OneHotEncoder
oh_enc = OneHotEncoder()
train_y = np.array(train_y).reshape(-1, 1)
train_y = oh_enc.fit_transform(train_y).toarray()
dev_y = np.array(dev_y).reshape(-1, 1)
dev_y = oh_enc.fit_transform(dev_y).toarray()

train_x.shape, train_y.shape, dev_x.shape, dev_y.shape

((25000, 100), (25000, 2), (25000, 100), (25000, 2))

In [88]:
import configparser

cf = configparser.ConfigParser()
cf.add_section("network")
cf.set("network", 'max_seq_len', str(maxlen))
cf.set("network", 'embedding_dim', str(128))
cf.set("network", 'feature_num', str(max_features))
cf.set("network", 'num_classes', '2')
cf.set("network", 'hidden_sizes', "64, 32")
cf.set("network", 'hidden_size_sum', "96")
cf.set("network", 'learning_rate', '0.00001')
cf.add_section("train")
cf.set("train", 'epochs', '2000')
cf.set("train", 'batch_size', str(batch_size))
cf.set("train", 'dropout_proba_val', '0.5')
cf.set("train", 'dev_epoch', '1')
cf.add_section("self-attention")
cf.set("self-attention", 'key_dim', str(160))
cf.set("self-attention", 'que_dim', str(160))
cf.set("self-attention", 'val_dim', str(160))
cf.add_section("multi-head-self-attention")
cf.set("multi-head-self-attention", 'head_num', '8')
cf.set("multi-head-self-attention", 'mh_output_dim', '150')


with open("config.ini", 'w') as f:
    cf.write(f)


In [89]:
from sklearn.metrics import f1_score
from datetime import datetime

In [None]:
import tensorflow as tf
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

class BiLSTMAttentionEncoder():
    def __init__(self, cf_parser, sum_root_dir = "tf_logs"):
        self.cf_parser = cf_parser
        self.max_seq_len = int(cf_parser["network"]['max_seq_len'])
        self.embedding_dim = int(cf_parser["network"]['embedding_dim'])
        self.feature_num = int(cf_parser["network"]['feature_num'])
        self.num_classes = int(cf_parser["network"]["num_classes"])
        self.hidden_sizes = [int(x) for x in cf_parser["network"]["hidden_sizes"].split(',')]
        self.hidden_size_sum = int(cf_parser["network"]["hidden_size_sum"])
        self.learning_rate = float(cf_parser['network']['learning_rate'])
        self.epochs = int(cf_parser['train']['epochs'])
        self.batch_size = int(cf_parser['train']['batch_size'])
        self.dropout_proba_val = float(cf_parser['train']['dropout_proba_val'])
        self.dev_epoch = int(cf_parser['train']['dev_epoch'])
        self.sa_key_dim = int(cf_parser['self-attention']['key_dim'])
        self.sa_que_dim = int(cf_parser['self-attention']['que_dim'])
        self.sa_val_dim = int(cf_parser['self-attention']['val_dim'])
        self.head_num = int(cf_parser['multi-head-self-attention']['head_num'])
        self.mh_output_dim = int(cf_parser['multi-head-self-attention']['mh_output_dim'])
        self.graph = tf.Graph()
        self.log_dir = self._log_dir(sum_root_dir)
        
    def _log_dir(self, root_logdir):
        now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
        log_dir = "{}/run-{}/".format(root_logdir, now)
        return log_dir
    
    def attention(self, H):
        with tf.name_scope('attention'):
            hidden_size = 2 * self.hidden_size_sum
            attention_w = tf.Variable(tf.random_uniform([hidden_size],
                                                            minval = 0,
                                                            maxval = 1),
                                          name = "attention_weight")
            step_h = tf.reshape(tf.tanh(H), [-1, hidden_size])
            score = tf.matmul(step_h, tf.reshape(attention_w,
                                                          [-1, 1]))
            score = tf.reshape(score, [-1, self.max_seq_len])
            alpha = tf.nn.softmax(score)
            weight_sum_output = tf.matmul(
                tf.transpose(H, [0, 2, 1]),
                tf.reshape(alpha, [-1, self.max_seq_len, 1]))
            weight_sum_output = tf.tanh(tf.reshape(weight_sum_output,
                                               [-1, hidden_size]))
            attentioned_output = tf.nn.dropout(
                weight_sum_output, self.dropout_proba)
            return attentioned_output
    
    def self_attention(self, H, scope, dropout = True, return_seq = False):
        scale_val = np.ceil(np.sqrt(self.sa_que_dim*self.sa_key_dim))
        H = tf.tanh(H)
        with tf.name_scope(scope):
            weight_dim = 2 * self.hidden_size_sum
            query_w = tf.Variable(tf.random_uniform(shape = [weight_dim,
                                                            self.sa_que_dim],
                                                   minval = 0, maxval = 1),
                                 name = 'query_w')
            key_w = tf.Variable(tf.random_uniform(shape = [weight_dim,
                                                          self.sa_key_dim],
                                                 minval = 0, maxval = 1),
                               name = 'key_w')
            value_w = tf.Variable(tf.random_uniform(shape = [weight_dim,
                                                            self.sa_val_dim],
                                                   minval = 0, maxval = 1),
                                 name = 'value_w')
            query = tf.matmul(H, query_w)
            key = tf.matmul(H, key_w)
            value = tf.matmul(H, value_w)
            score = tf.matmul(query, tf.transpose(key, perm = [0, 2, 1])) / scale_val
            alpha = tf.nn.softmax(score)
            if(return_seq):
                return tf.matmul(alpha, value)
            weight_sum_output = tf.tanh(tf.reduce_mean(tf.matmul(alpha, value),
                                                  axis = 1))
            if(dropout):
                self_attention_output = tf.nn.dropout(
                    weight_sum_output, self.dropout_proba)
            else:
                self_attention_output = weight_sum_output
            return self_attention_output
        
    def multi_head_self_attention(self, H, head_num, dropout = True, return_seq = False):
        attention_outputs = []
        mh_dim = self.head_num * self.sa_val_dim
        mh_weight = tf.Variable(tf.random_uniform(shape = [mh_dim, self.mh_output_dim],
                                                 minval = 0, maxval = 1),
                               name = 'mh_weight')
        H = tf.tanh(H)
        for index in range(head_num):
            attention_outputs.append(self.self_attention(H, 'self_attention' + str(index), 
                                                         dropout = False, return_seq = True))
        mh_attention_output = tf.matmul(tf.concat(attention_outputs, -1), mh_weight)
        if(return_seq):
            return mh_attention_output
        mh_attention_output = tf.tanh(tf.reduce_mean(mh_attention_output, axis = 1))
        if(dropout):
            mh_attention_output = tf.nn.dropout(
                mh_attention_output, self.dropout_proba)
        return mh_attention_output
        
                
    def build(self):
        with tf.name_scope("bilstm_attention"), self.graph.as_default():
            with tf.name_scope("input"):
                self.input_x = tf.placeholder(dtype = tf.int32, 
                                             shape = [None, self.max_seq_len],
                                             name = 'input_x')
                self.input_y = tf.placeholder(dtype = tf.int32,
                                             shape = [None, self.num_classes],
                                             name = 'input_y')
                self.dropout_proba = tf.placeholder(tf.float32)

            with tf.name_scope("embedding"), tf.device("/cpu:0"):
                self.embedding_matrix = tf.Variable(tf.random_uniform(
                                                        dtype = tf.float32,
                                                        shape = [self.feature_num, self.embedding_dim],
                                                        minval = 0, maxval = 1),
                                                   name = "embedding_matrix")
                self.embedded = tf.nn.embedding_lookup(self.embedding_matrix,
                                                      self.input_x)
            with tf.name_scope("biLSTM"):
                self.fw_outputs = []
                self.bw_outputs = []
                self.fw_final_outputs = []
                self.bw_final_outputs = []
                for index, hidden_size in enumerate(self.hidden_sizes):
                    with tf.variable_scope("bilstm" + str(index), reuse = tf.AUTO_REUSE):
                        fwlstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                            tf.nn.rnn_cell.LSTMCell(num_units = hidden_size,
                                                    state_is_tuple = True),
                            output_keep_prob = self.dropout_proba)
                        bwlstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                            tf.nn.rnn_cell.LSTMCell(num_units = hidden_size,
                                                    state_is_tuple = True),
                            output_keep_prob = self.dropout_proba)
                        output, state = tf.nn.bidirectional_dynamic_rnn(
                            fwlstm_cell, bwlstm_cell, self.embedded,
                            dtype = tf.float32, )
                        self.fw_final_outputs.append(output[0][:, -1, :])
                        self.bw_final_outputs.append(output[1][:, 0, :])
                        self.fw_outputs.append(output[0])
                        self.bw_outputs.append(tf.reverse(output[1], [1]))
                self.bilstm_final_fw_output = tf.concat(self.fw_final_outputs,
                                                       -1)
                self.bilstm_final_bw_output = tf.concat(self.bw_final_outputs,
                                                       -1)
                self.bilstm_final_output = tf.concat([self.bilstm_final_fw_output, 
                                                self.bilstm_final_bw_output], -1, 
                                               name = "bilstm_final_output")
                self.bilstm_fw_output = tf.concat(self.fw_outputs,
                                                 -1)
                self.bilstm_bw_output = tf.concat(self.bw_outputs,
                                                 -1)
                self.bilstm_output = tf.concat([self.bilstm_fw_output,
                                               self.bilstm_bw_output],
                                              -1)
                #self.attention_output = self.attention(self.bilstm_output)
                self.attention_output = self.self_attention(self.bilstm_output,
                                                           'self_attention')
                #self.attention_output = self.multi_head_self_attention(self.bilstm_output, 
                                                                            #self.head_num)
                print(self.attention_output.shape)
            with tf.name_scope("output"):
                #self.bilstm_output_dim = 2 * self.hidden_size_sum
                self.bilstm_output_dim = self.sa_val_dim
                #self.bilstm_output_dim = self.mh_output_dim
                self.output_w = tf.Variable(tf.truncated_normal(
                    [self.bilstm_output_dim, self.num_classes], stddev = 0.1))
                self.output_bias = tf.Variable(
                    tf.truncated_normal([self.num_classes], stddev = 0.1))
                self.logits = tf.nn.xw_plus_b(self.attention_output,
                                             self.output_w, self.output_bias,
                                             name = 'logit')
                self.logits = tf.nn.sigmoid(self.logits)
                self.prediction = tf.argmax(self.logits, axis = 1, name = 
                                           'prediction')

            with tf.name_scope("loss"):
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                        logits = self.logits, labels = self.input_y)

                self.loss = tf.reduce_mean(self.loss)
                
            with tf.name_scope("target"):
                self.correct = tf.equal(self.prediction, tf.arg_max(self.input_y, 1), name = "correct")
                self.acc = tf.reduce_mean(tf.cast(self.correct, tf.float32), name = "acc")

            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train_step = self.optimizer.minimize(self.loss)
            
            with tf.name_scope("summary"):
                self.loss_sum = tf.summary.scalar('loss', self.loss)
                self.acc_sum = tf.summary.scalar('acc', self.acc)
                self.sum = tf.summary.merge_all()
                self.filewriter = tf.summary.FileWriter(self.log_dir, self.graph)
                
    def fit(self, train_x, train_y, dev_x  = None, dev_y = None, test_x = None):
        self.train_x = train_x
        self.train_y = train_y
        self.dev_x = dev_x
        self.dev_y = dev_y
        self.test_x = test_x
        self.train_m = train_x.shape[0]

    def train(self):
        print('log dir ', self.log_dir)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        with tf.Session(config = config, graph = self.graph) as self.sess:
            all_train_feed_dict = {
                self.input_x : self.train_x,
                self.input_y : self.train_y,
                self.dropout_proba : 1.0
            }
            self.sess.run(tf.global_variables_initializer())
            n_batch = int(np.ceil(self.train_m / self.batch_size))
            print(n_batch)
            for epoch in range(self.epochs):
                for batch_index in range(n_batch):
                    batch_x = self.train_x[batch_index * self.batch_size : 
                                           self.batch_size * (batch_index + 1)]
                    batch_y = self.train_y[batch_index * self.batch_size : 
                                           self.batch_size * (batch_index + 1)]
                    train_feed_dict = {
                        self.input_x : batch_x,
                        self.input_y : batch_y,
                        self.dropout_proba : self.dropout_proba_val
                    }
                    self.sess.run(self.train_step, 
                                 feed_dict = train_feed_dict)
                    step = epoch * n_batch + batch_index
                if(self.dev_x is not None and self.dev_y is not None 
                   and epoch % self.dev_epoch == 0):
                    dev_feed_dict = {
                        self.input_x : self.dev_x,
                        self.input_y : self.dev_y,
                        self.dropout_proba : 1.0
                    }
                    print('epoch ', epoch)
                    print('loss', epoch, self.sess.run(self.loss, 
                                               feed_dict = all_train_feed_dict))
                    p = self.sess.run(self.prediction,
                                     feed_dict = dev_feed_dict)
                    print('dev f1', epoch, f1_score(p, np.argmax(self.dev_y, axis = 1)))
                    print('dev acc', epoch, self.acc.eval(feed_dict = dev_feed_dict))
                    
                    train_p = self.sess.run(self.prediction,
                                     feed_dict = all_train_feed_dict)
                    print('train f1', epoch, f1_score(train_p, np.argmax(self.train_y, axis = 1)))
                    print('train acc', epoch, self.acc.eval(feed_dict = all_train_feed_dict))
                    train_loss_str = self.loss_sum.eval(feed_dict = all_train_feed_dict)
                    dev_acc_str = self.acc_sum.eval(feed_dict = dev_feed_dict)
                    self.filewriter.add_summary(train_loss_str, step)
                    self.filewriter.add_summary(dev_acc_str, step)
ba = BiLSTMAttentionEncoder(cf)
ba.build()
ba.fit(train_x, train_y, dev_x, dev_y)
ba.train()

(?, 160)
log dir  tf_logs/run-20200331022511/
782
epoch  0
loss 0 0.6930129
dev f1 0 0.008090743237883715
dev acc 0 0.4998
train f1 0 0.012677283891926155
train acc 0 0.50156
epoch  1
loss 1 0.6916892
dev f1 1 0.06143085227704706
dev acc 1 0.5062
train f1 1 0.06900735572912718
train acc 1 0.50892
epoch  2
loss 2 0.6897633
dev f1 2 0.6539469673879915
dev acc 2 0.54584
train f1 2 0.6570218986994791
train acc 2 0.54956
epoch  3
loss 3 0.6839518
dev f1 3 0.6269695990811529
dev acc 3 0.58428
train f1 3 0.6350357104403689
train acc 3 0.59324
epoch  4
loss 4 0.67356116
dev f1 4 0.5462933930571109
dev acc 4 0.59484
train f1 4 0.5526835397594687
train acc 4 0.60128
epoch  5
loss 5 0.6627216
dev f1 5 0.58086491613849
dev acc 5 0.61116
train f1 5 0.5913140791116085
train acc 5 0.6202
epoch  6
loss 6 0.65252745
dev f1 6 0.6279451178047122
dev acc 6 0.62796
train f1 6 0.6400545068333935
train acc 6 0.64076
epoch  7
loss 7 0.6416616
dev f1 7 0.6379830310828742
dev acc 7 0.63988
train f1 7 0.65082178

dev f1 64 0.806013431013431
dev acc 64 0.79664
train f1 64 0.8686931268151017
train acc 64 0.86436
epoch  65
loss 65 0.44126004
dev f1 65 0.8072609550237021
dev acc 65 0.79996
train f1 65 0.8733944954128441
train acc 65 0.87028
epoch  66
loss 66 0.4350524
dev f1 66 0.8006398720255947
dev acc 66 0.8006
train f1 66 0.8770806658130602
train acc 66 0.87712
epoch  67
loss 67 0.4710639
dev f1 67 0.8059235599433778
dev acc 67 0.78612
train f1 67 0.8472997449451078
train acc 67 0.83476
epoch  68
loss 68 0.45018628
dev f1 68 0.8089761360656351
dev acc 68 0.79604
train f1 68 0.8654228475302034
train acc 68 0.8592
epoch  69
loss 69 0.44504648
dev f1 69 0.8095741467834491
dev acc 69 0.79824
train f1 69 0.8699938385705485
train acc 69 0.86496
epoch  70
loss 70 0.43793866
dev f1 70 0.8082371054657429
dev acc 70 0.80072
train f1 70 0.8759369144284822
train acc 70 0.87288
epoch  71
loss 71 0.46391156
dev f1 71 0.8065999853982624
dev acc 71 0.78808
train f1 71 0.8540105157176417
train acc 71 0.8434
epo

epoch  127
loss 127 0.41761833
dev f1 127 0.8192972459639126
dev acc 127 0.80972
train f1 127 0.8966404240392859
train acc 127 0.89392
epoch  128
loss 128 0.4084449
dev f1 128 0.8163105886028982
dev acc 128 0.81188
train f1 128 0.9045620148899097
train acc 128 0.9036
epoch  129
loss 129 0.4256465
dev f1 129 0.8190433549735096
dev acc 129 0.806
train f1 129 0.8900696931192484
train acc 129 0.8858
epoch  130
loss 130 0.41753197
dev f1 130 0.8188433260062153
dev acc 130 0.8088
train f1 130 0.8971715363965297
train acc 130 0.89428
epoch  131
loss 131 0.4143643
dev f1 131 0.8189480910789885
dev acc 131 0.81012
train f1 131 0.8997652582159624
train acc 131 0.89752
epoch  132
loss 132 0.40968302
dev f1 132 0.8183433781486633
dev acc 132 0.81192
train f1 132 0.9037340475815346
train acc 132 0.90224
epoch  133
loss 133 0.40367985
dev f1 133 0.8111798249136616
dev acc 133 0.81192
train f1 133 0.9085006612961404
train acc 133 0.90868
epoch  134
loss 134 0.40471625
dev f1 134 0.8141446068747531
de

dev f1 188 0.8195121951219512
dev acc 188 0.81944
train f1 188 0.921889373275207
train acc 188 0.92188
epoch  189
loss 189 0.3908554
dev f1 189 0.8159159765703898
dev acc 189 0.81772
train f1 189 0.9218549422336328
train acc 189 0.92208
epoch  190
loss 190 0.39836872
dev f1 190 0.8241279069767441
dev acc 190 0.81608
train f1 190 0.9153891625615763
train acc 190 0.91412
epoch  191
loss 191 0.391306
dev f1 191 0.8198280302730118
dev acc 191 0.81812
train f1 191 0.9217384363650876
train acc 191 0.92156
epoch  192
loss 192 0.39306027
dev f1 192 0.8225610473092149
dev acc 192 0.81892
train f1 192 0.9199952307142005
train acc 192 0.91948
epoch  193
loss 193 0.39079174
dev f1 193 0.8198144387369092
dev acc 193 0.819
train f1 193 0.9220441922723458
train acc 193 0.92196
epoch  194
loss 194 0.39525703
dev f1 194 0.8226490781454912
dev acc 194 0.81608
train f1 194 0.918332607576897
train acc 194 0.91748
epoch  195
loss 195 0.39502743
dev f1 195 0.8239450289219303
dev acc 195 0.8186
train f1 195 

dev acc 249 0.81972
train f1 249 0.9272402145837473
train acc 249 0.92676
epoch  250
loss 250 0.38449672
dev f1 250 0.8249669234959919
dev acc 250 0.82008
train f1 250 0.9288642968034712
train acc 250 0.92852
epoch  251
loss 251 0.4165295
dev f1 251 0.8244945795487841
dev acc 251 0.80832
train f1 251 0.8996060733544957
train acc 251 0.895
epoch  252
loss 252 0.3918678
dev f1 252 0.8270522600478197
dev acc 252 0.81772
train f1 252 0.9217254284025999
train acc 252 0.92052
epoch  253
loss 253 0.3849651
dev f1 253 0.8245118886526194
dev acc 253 0.81844
train f1 253 0.9282448817332538
train acc 253 0.9278
epoch  254
loss 254 0.3995119
dev f1 254 0.8240176468388978
dev acc 254 0.81172
train f1 254 0.9147607524783389
train acc 254 0.91264
epoch  255
loss 255 0.38356087
dev f1 255 0.8243942281513749
dev acc 255 0.8194
train f1 255 0.9295785867920019
train acc 255 0.92928
epoch  256
loss 256 0.38863313
dev f1 256 0.8045444982116559
dev acc 256 0.8142
train f1 256 0.9226146975233456
train acc 25

train acc 310 0.93688
epoch  311
loss 311 0.37711078
dev f1 311 0.8151979149698647
dev acc 311 0.81848
train f1 311 0.9356678178459561
train acc 311 0.93592
epoch  312
loss 312 0.3791107
dev f1 312 0.8119075336770332
dev acc 312 0.81904
train f1 312 0.9333870642756397
train acc 312 0.93388
epoch  313
loss 313 0.38006216
dev f1 313 0.8266893065256821
dev acc 313 0.82036
train f1 313 0.9330099789289549
train acc 313 0.9326
epoch  314
loss 314 0.37624288
dev f1 314 0.8235668030670215
dev acc 314 0.82236
train f1 314 0.9368
train acc 314 0.9368
epoch  315
loss 315 0.37880194
dev f1 315 0.8122223606094573
dev acc 315 0.81908
train f1 315 0.9339026060337536
train acc 315 0.93436
epoch  316
loss 316 0.37679648
dev f1 316 0.8151973441534488
dev acc 316 0.81964
train f1 316 0.9359309098212494
train acc 316 0.9362
epoch  317
loss 317 0.37880182
dev f1 317 0.81047267906707
dev acc 317 0.817
train f1 317 0.9338380380944711
train acc 317 0.93428
epoch  318
loss 318 0.37959337
dev f1 318 0.809332999

dev f1 372 0.8104110962566845
dev acc 372 0.81848
train f1 372 0.9377740235710551
train acc 372 0.93812
epoch  373
loss 373 0.3717491
dev f1 373 0.8207365892714171
dev acc 373 0.82088
train f1 373 0.941552986358363
train acc 373 0.94156
epoch  374
loss 374 0.3716661
dev f1 374 0.8212634822804314
dev acc 374 0.82368
train f1 374 0.9413649471323293
train acc 374 0.94144
epoch  375
loss 375 0.37331164
dev f1 375 0.8149346608587429
dev acc 375 0.82156
train f1 375 0.9394328861755964
train acc 375 0.93968
epoch  376
loss 376 0.3727297
dev f1 376 0.8153249752556911
dev acc 376 0.82088
train f1 376 0.9401380195795218
train acc 376 0.94032
epoch  377
loss 377 0.372371
dev f1 377 0.8161285061969993
dev acc 377 0.8196
train f1 377 0.9407143143464145
train acc 377 0.94084
epoch  378
loss 378 0.37127858
dev f1 378 0.8220209952720572
dev acc 378 0.82232
train f1 378 0.9418637218421158
train acc 378 0.94188
epoch  379
loss 379 0.3725231
dev f1 379 0.8150322473954026
dev acc 379 0.82104
train f1 379 

dev acc 433 0.81984
train f1 433 0.9423285307219971
train acc 433 0.94252
epoch  434
loss 434 0.36859587
dev f1 434 0.8212028542303772
dev acc 434 0.8246
train f1 434 0.9444822479285915
train acc 434 0.94452
epoch  435
loss 435 0.36936712
dev f1 435 0.8297806130387427
dev acc 435 0.82496
train f1 435 0.9440344648769396
train acc 435 0.94388
epoch  436
loss 436 0.3714494
dev f1 436 0.8291751041149276
dev acc 436 0.82116
train f1 436 0.9420237953125622
train acc 436 0.94172
epoch  437
loss 437 0.3683567
dev f1 437 0.8272313477142292
dev acc 437 0.8254
train f1 437 0.9449794222239981
train acc 437 0.94492
epoch  438
loss 438 0.3691707
dev f1 438 0.8161924703370694
dev acc 438 0.82092
train f1 438 0.9440326909979567
train acc 438 0.94412
epoch  439
loss 439 0.36826906
dev f1 439 0.8207714654615881
dev acc 439 0.82232
train f1 439 0.9450153956892071
train acc 439 0.945
epoch  440
loss 440 0.36805937
dev f1 440 0.8239323126510877
dev acc 440 0.8252
train f1 440 0.9452262913801375
train acc 4

train f1 494 0.9471161527400543
train acc 494 0.94704
epoch  495
loss 495 0.37641242
dev f1 495 0.8014424934529686
dev acc 495 0.815
train f1 495 0.9354028205543847
train acc 495 0.93624
epoch  496
loss 496 0.3655228
dev f1 496 0.8200795636303982
dev acc 496 0.82452
train f1 496 0.9477474593902536
train acc 496 0.94776
epoch  497
loss 497 0.3667868
dev f1 497 0.8147314918968461
dev acc 497 0.82212
train f1 497 0.9461365822378968
train acc 497 0.94624
epoch  498
loss 498 0.36540577
dev f1 498 0.8202247191011237
dev acc 498 0.82208
train f1 498 0.9478987564476788
train acc 498 0.94788
epoch  499
loss 499 0.36926216
dev f1 499 0.8074180953997623
dev acc 499 0.81848
train f1 499 0.9435130462750775
train acc 499 0.9438
epoch  500
loss 500 0.3671456
dev f1 500 0.8297625493077578
dev acc 500 0.82392
train f1 500 0.9463566386152434
train acc 500 0.9462
epoch  501
loss 501 0.36542284
dev f1 501 0.8223713646532438
dev acc 501 0.82532
train f1 501 0.9477979119164767
train acc 501 0.9478
epoch  50

epoch  556
loss 556 0.36522493
dev f1 556 0.81401706569464
dev acc 556 0.8204
train f1 556 0.9479504748166847
train acc 556 0.94804
epoch  557
loss 557 0.36619708
dev f1 557 0.8106877950101147
dev acc 557 0.82032
train f1 557 0.9468319890855103
train acc 557 0.947
epoch  558
loss 558 0.36369327
dev f1 558 0.8228946941424631
dev acc 558 0.82536
train f1 558 0.9495159612769022
train acc 558 0.94952
epoch  559
loss 559 0.3682209
dev f1 559 0.8068229916194024
dev acc 559 0.81744
train f1 559 0.9443997425168974
train acc 559 0.94472
epoch  560
loss 560 0.36403796
dev f1 560 0.8294549671434094
dev acc 560 0.82352
train f1 560 0.9493448386065835
train acc 560 0.94928
epoch  561
loss 561 0.3637191
dev f1 561 0.8303550526726492
dev acc 561 0.82608
train f1 561 0.9495643833426585
train acc 561 0.94952
epoch  562
loss 562 0.36751917
dev f1 562 0.8060570271638265
dev acc 562 0.81608
train f1 562 0.9452236466663987
train acc 562 0.94548
epoch  563
loss 563 0.36371619
dev f1 563 0.82147981143677
dev

dev acc 617 0.81308
train f1 617 0.9394565261410116
train acc 617 0.9402
epoch  618
loss 618 0.3624102
dev f1 618 0.826184002272635
dev acc 618 0.82868
train f1 618 0.95088
train acc 618 0.95088
epoch  619
loss 619 0.3620593
dev f1 619 0.8289264096846131
dev acc 619 0.82816
train f1 619 0.951177576072614
train acc 619 0.95116
epoch  620
loss 620 0.3621429
dev f1 620 0.8269030239833158
dev acc 620 0.82736
train f1 620 0.9511697660467906
train acc 620 0.95116
epoch  621
loss 621 0.36234835
dev f1 621 0.8195956454121307
dev acc 621 0.82368
train f1 621 0.9509101820364073
train acc 621 0.95092
epoch  622
loss 622 0.36258414
dev f1 622 0.8173200314608603
dev acc 622 0.82348
train f1 622 0.9506162958219945
train acc 622 0.95064
epoch  623
loss 623 0.36217597
dev f1 623 0.8196869479479069
dev acc 623 0.82444
train f1 623 0.9510702140428084
train acc 623 0.95108
epoch  624
loss 624 0.36306
dev f1 624 0.8113768692665354
dev acc 624 0.81988
train f1 624 0.9502483576349944
train acc 624 0.95032
e

train acc 678 0.95168
epoch  679
loss 679 0.36060983
dev f1 679 0.8258342478559526
dev acc 679 0.8286
train f1 679 0.9526362108968718
train acc 679 0.95264
epoch  680
loss 680 0.3609798
dev f1 680 0.8248743311172368
dev acc 680 0.8272
train f1 680 0.9521866122514304
train acc 680 0.9522
epoch  681
loss 681 0.36071563
dev f1 681 0.8241696227023542
dev acc 681 0.82552
train f1 681 0.9525218991240351
train acc 681 0.95252
epoch  682
loss 682 0.36067027
dev f1 682 0.8229903865821232
dev acc 682 0.82692
train f1 682 0.9525524083853417
train acc 682 0.95256
epoch  683
loss 683 0.36136487
dev f1 683 0.8334991962030162
dev acc 683 0.826
train f1 683 0.9519814637264302
train acc 683 0.95192
epoch  684
loss 684 0.36144865
dev f1 684 0.8214536589397132
dev acc 684 0.82716
train f1 684 0.9516141953056156
train acc 684 0.95168
epoch  685
loss 685 0.3684339
dev f1 685 0.8337448247046758
dev acc 685 0.82492
train f1 685 0.9451359708237532
train acc 685 0.94464
epoch  686
loss 686 0.36156452
dev f1 68

dev acc 740 0.8286
train f1 740 0.9535274356103023
train acc 740 0.95352
epoch  741
loss 741 0.35991454
dev f1 741 0.821335538556957
dev acc 741 0.82716
train f1 741 0.9533376020489834
train acc 741 0.95336
epoch  742
loss 742 0.35970843
dev f1 742 0.8271703538056066
dev acc 742 0.82688
train f1 742 0.9535692861427715
train acc 742 0.95356
epoch  743
loss 743 0.35979164
dev f1 743 0.8223818463798873
dev acc 743 0.82592
train f1 743 0.9535162813025041
train acc 743 0.95352
epoch  744
loss 744 0.36065885
dev f1 744 0.8185807733708702
dev acc 744 0.82472
train f1 744 0.9524267179240751
train acc 744 0.95248
epoch  745
loss 745 0.35968667
dev f1 745 0.8283441908796612
dev acc 745 0.8282
train f1 745 0.9535311525233944
train acc 745 0.95352
epoch  746
loss 746 0.35986403
dev f1 746 0.8218037987721972
dev acc 746 0.827
train f1 746 0.9532994517587738
train acc 746 0.95332
epoch  747
loss 747 0.37203282
dev f1 747 0.7951754577634053
dev acc 747 0.81252
train f1 747 0.9396730090470201
train ac

train acc 801 0.95452
epoch  802
loss 802 0.35888124
dev f1 802 0.8300719251014586
dev acc 802 0.83084
train f1 802 0.9543654761428628
train acc 802 0.95436
epoch  803
loss 803 0.41868296
dev f1 803 0.76603756256601
dev acc 803 0.7962
train f1 803 0.8843962821775818
train acc 803 0.89204
epoch  804
loss 804 0.3682517
dev f1 804 0.7968553185944491
dev acc 804 0.81292
train f1 804 0.9439683053040103
train acc 804 0.94456
epoch  805
loss 805 0.35888493
dev f1 805 0.8262209962680513
dev acc 805 0.82864
train f1 805 0.9544036477081834
train acc 805 0.9544
epoch  806
loss 806 0.3588748
dev f1 806 0.8278473872930179
dev acc 806 0.82908
train f1 806 0.9544072948328267
train acc 806 0.9544
epoch  807
loss 807 0.35887378
dev f1 807 0.8305341080952001
dev acc 807 0.82968
train f1 807 0.9544072948328267
train acc 807 0.9544
epoch  808
loss 808 0.35872772
dev f1 808 0.8315110516793833
dev acc 808 0.82864
train f1 808 0.9545709029832841
train acc 808 0.95456
epoch  809
loss 809 0.35877538
dev f1 809

loss 863 0.3581618
dev f1 863 0.8266365141541817
dev acc 863 0.82828
train f1 863 0.9551271796512558
train acc 863 0.95512
epoch  864
loss 864 0.35815388
dev f1 864 0.8305051224239547
dev acc 864 0.8286
train f1 864 0.9550889822035593
train acc 864 0.95508
epoch  865
loss 865 0.35854074
dev f1 865 0.8159158281491379
dev acc 865 0.82364
train f1 865 0.9547055057618438
train acc 865 0.95472
epoch  866
loss 866 0.35818306
dev f1 866 0.8287195851615476
dev acc 866 0.82824
train f1 866 0.9550889822035593
train acc 866 0.95508
epoch  867
loss 867 0.3581396
dev f1 867 0.8308324119412416
dev acc 867 0.82864
train f1 867 0.9551343570057582
train acc 867 0.95512
epoch  868
loss 868 0.35805938
dev f1 868 0.8311606722161144
dev acc 868 0.828
train f1 868 0.9552214936830321
train acc 868 0.9552
epoch  869
loss 869 0.35830745
dev f1 869 0.824679841250358
dev acc 869 0.8286
train f1 869 0.9548763901112088
train acc 869 0.95488
epoch  870
loss 870 0.35809022
dev f1 870 0.8235864689002951
dev acc 870 0

dev acc 924 0.83044
train f1 924 0.9558229720545317
train acc 924 0.9558
epoch  925
loss 925 0.35746905
dev f1 925 0.82698368372761
dev acc 925 0.82652
train f1 925 0.9558229720545317
train acc 925 0.9558
epoch  926
loss 926 0.35880014
dev f1 926 0.8355492803074872
dev acc 926 0.83228
train f1 926 0.9545182286467276
train acc 926 0.95444
epoch  927
loss 927 0.36354566
dev f1 927 0.8026952315134761
dev acc 927 0.81728
train f1 927 0.9488780566410184
train acc 927 0.94924
epoch  928
loss 928 0.35742614
dev f1 928 0.8310994096058721
dev acc 928 0.83064
train f1 928 0.9558647157591749
train acc 928 0.95584
epoch  929
loss 929 0.36537603
dev f1 929 0.8361560562528353
dev acc 929 0.82664
train f1 929 0.9481234890817581
train acc 929 0.94764
epoch  930
loss 930 0.3574405
dev f1 930 0.8271750152098966
dev acc 930 0.82956
train f1 930 0.9558229720545317
train acc 930 0.9558
epoch  931
loss 931 0.35742468
dev f1 931 0.8329149670536554
dev acc 931 0.8296
train f1 931 0.9558265040975414
train acc 

train acc 985 0.95648
epoch  986
loss 986 0.35678804
dev f1 986 0.82715647145068
dev acc 986 0.83024
train f1 986 0.9564869620860662
train acc 986 0.95648
epoch  987
loss 987 0.35679403
dev f1 987 0.829884872393527
dev acc 987 0.83096
train f1 987 0.9564487102579484
train acc 987 0.95644
epoch  988
loss 988 0.35681444
dev f1 988 0.8301354401805868
dev acc 988 0.83144
train f1 988 0.956452193385852
train acc 988 0.95644
epoch  989
loss 989 0.3568353
dev f1 989 0.826948802140338
dev acc 989 0.82924
train f1 989 0.9564104614892426
train acc 989 0.9564
epoch  990
loss 990 0.35674202
dev f1 990 0.830455071192119
dev acc 990 0.82996
train f1 990 0.9565356471670198
train acc 990 0.95652
epoch  991
loss 991 0.35676405
dev f1 991 0.8233567117925298
dev acc 991 0.82672
train f1 991 0.9565286942611477
train acc 991 0.95652
epoch  992
loss 992 0.3615943
dev f1 992 0.8043598116968489
dev acc 992 0.81548
train f1 992 0.950933075933076
train acc 992 0.9512
epoch  993
loss 993 0.3567262
dev f1 993 0.8

loss 1045 0.46033332
dev f1 1045 0.8172625873032419
dev acc 1045 0.78964
train f1 1045 0.8661355153603215
train acc 1045 0.8508
epoch  1046
loss 1046 0.35631797
dev f1 1046 0.8325324931075226
dev acc 1046 0.82992
train f1 1046 0.9569703271214908
train acc 1046 0.95696
epoch  1047
loss 1047 0.35697594
dev f1 1047 0.8158941506510908
dev acc 1047 0.82412
train f1 1047 0.9561459667093469
train acc 1047 0.95616
epoch  1048
loss 1048 0.3563719
dev f1 1048 0.8323233902810899
dev acc 1048 0.82844
train f1 1048 0.9568937939859244
train acc 1048 0.95688
epoch  1049
loss 1049 0.35630333
dev f1 1049 0.8278919790407093
dev acc 1049 0.8292
train f1 1049 0.9569703271214908
train acc 1049 0.95696
epoch  1050
loss 1050 0.35630322
dev f1 1050 0.8277197370013312
dev acc 1050 0.82916
train f1 1050 0.9569703271214908
train acc 1050 0.95696
epoch  1051
loss 1051 0.35630292
dev f1 1051 0.8276168545226842
dev acc 1051 0.82932
train f1 1051 0.9569703271214908
train acc 1051 0.95696
epoch  1052
loss 1052 0.3564

train acc 1103 0.95724
epoch  1104
loss 1104 0.3560636
dev f1 1104 0.8283905895465945
dev acc 1104 0.82756
train f1 1104 0.9572136916186821
train acc 1104 0.9572
epoch  1105
loss 1105 0.35600188
dev f1 1105 0.8286372640174638
dev acc 1105 0.83044
train f1 1105 0.9572902503399184
train acc 1105 0.95728
epoch  1106
loss 1106 0.35598126
dev f1 1106 0.8274859703661834
dev acc 1106 0.82908
train f1 1106 0.9572902503399184
train acc 1106 0.95728
epoch  1107
loss 1107 0.35595968
dev f1 1107 0.8219865153757606
dev acc 1107 0.8268
train f1 1107 0.9572868341065429
train acc 1107 0.95728
epoch  1108
loss 1108 0.35594425
dev f1 1108 0.8254252461951656
dev acc 1108 0.8284
train f1 1108 0.9573285342931414
train acc 1108 0.95732
epoch  1109
loss 1109 0.35596976
dev f1 1109 0.831351265065049
dev acc 1109 0.83096
train f1 1109 0.9572902503399184
train acc 1109 0.95728
epoch  1110
loss 1110 0.35602376
dev f1 1110 0.8317925415230336
dev acc 1110 0.82824
train f1 1110 0.9572136916186821
train acc 1110 0.9

In [61]:
import tensorflow as tf
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

class BiLSTMAttentionEncoder():
    def __init__(self, cf_parser):
        self.cf_parser = cf_parser
        self.max_seq_len = int(cf_parser["network"]['max_seq_len'])
        self.embedding_dim = int(cf_parser["network"]['embedding_dim'])
        self.feature_num = int(cf_parser["network"]['feature_num'])
        self.num_classes = int(cf_parser["network"]["num_classes"])
        self.hidden_size = int(cf_parser["network"]["hidden_size"])
        self.learning_rate = float(cf_parser['network']['learning_rate'])
        self.epochs = int(cf_parser['train']['epochs'])
        self.batch_size = int(cf_parser['train']['batch_size'])
        self.dropout_proba_val = float(cf_parser['train']['dropout_proba_val'])
        self.dev_epoch = int(cf_parser['train']['dev_epoch'])
        self.graph = tf.Graph()
    
    def attention(self, H):
            hidden_size = 2 * self.hidden_size
            self.attention_w = tf.Variable(tf.random_uniform([hidden_size],
                                                            minval = 0,
                                                            maxval = 1),
                                          name = "attention_weight")
            self.step_h = tf.reshape(tf.tanh(H), [-1, hidden_size])
            self.score = tf.matmul(self.step_h, tf.reshape(self.attention_w,
                                                          [-1, 1]))
            self.score = tf.reshape(self.score, [-1, self.max_seq_len])
            self.alpha = tf.nn.softmax(self.score)
            self.weight_sum_output = tf.matmul(
                tf.transpose(H, [0, 2, 1]),
                tf.reshape(self.alpha, [-1, self.max_seq_len, 1]))
            self.weight_sum_output = tf.tanh(tf.reshape(self.weight_sum_output,
                                               [-1, hidden_size]))
            attentioned_output = tf.nn.dropout(
                self.weight_sum_output, self.dropout_proba)
            return attentioned_output
    
    def build(self):
        with tf.name_scope("bilstm_attention"), self.graph.as_default():
            with tf.name_scope("input"):
                self.input_x = tf.placeholder(dtype = tf.int32, 
                                             shape = [None, self.max_seq_len],
                                             name = 'input_x')
                self.input_y = tf.placeholder(dtype = tf.int32,
                                             shape = [None, self.num_classes],
                                             name = 'input_y')
                self.dropout_proba = tf.placeholder(tf.float32)

            with tf.name_scope("embedding"), tf.device("/cpu:0"):
                self.embedding_matrix = tf.Variable(tf.random_uniform(
                                                        dtype = tf.float32,
                                                        shape = [self.feature_num, self.embedding_dim],
                                                        minval = 0, maxval = 1),
                                                   name = "embedding_matrix")
                self.embedded = tf.nn.embedding_lookup(self.embedding_matrix,
                                                      self.input_x)
            with tf.name_scope("biLSTM"):
                fwlstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                    tf.nn.rnn_cell.LSTMCell(num_units = self.hidden_size,
                                            state_is_tuple = True),
                    output_keep_prob = self.dropout_proba)
                bwlstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                    tf.nn.rnn_cell.LSTMCell(num_units = self.hidden_size,
                                            state_is_tuple = True),
                    output_keep_prob = self.dropout_proba)
                output, state = tf.nn.bidirectional_dynamic_rnn(
                    fwlstm_cell, bwlstm_cell, self.embedded,
                    dtype = tf.float32, 
                    scope = "biLSTM")
                self.bilstm_final_output = tf.concat([output[0][:, -1, :], 
                                                output[1][:, 0, :]], -1, 
                                               name = "bilstm_output")
                self.bilstm_output = tf.concat([output[0],
                                                tf.reverse(output[1], [1])],
                                               -1)
                self.attention_output = self.attention(self.bilstm_output)
                print('at', self.attention_output.shape)
            with tf.name_scope("output"):
                self.bilstm_output_dim = 2 * self.hidden_size
                print(self.bilstm_output_dim, self.bilstm_output.shape)
                self.output_w = tf.Variable(tf.truncated_normal(
                    [self.bilstm_output_dim, self.num_classes], stddev = 0.1))
                self.output_bias = tf.Variable(
                    tf.truncated_normal([self.num_classes], stddev = 0.1))
                self.logits = tf.nn.xw_plus_b(self.bilstm_final_output,
                                             self.output_w, self.output_bias,
                                             name = 'logit')
                self.logits = tf.nn.sigmoid(self.logits)
                self.prediction = tf.argmax(self.logits, axis = 1, name = 
                                           'prediction')
                print(self.prediction.shape)

            with tf.name_scope("loss"):
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                        logits = self.logits, labels = self.input_y)

                self.loss = tf.reduce_mean(self.loss)
                print(self.loss.shape)
                
            with tf.name_scope("target"):
                self.correct = tf.equal(self.prediction, tf.arg_max(self.input_y, 1), name = "correct")
                self.acc = tf.reduce_mean(tf.cast(self.correct, tf.float32), name = "acc")

            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train_step = self.optimizer.minimize(self.loss)

        

    def fit(self, train_x, train_y, dev_x  = None, dev_y = None, test_x = None):
        self.train_x = train_x
        self.train_y = train_y
        self.dev_x = dev_x
        self.dev_y = dev_y
        self.test_x = test_x
        self.train_m = train_x.shape[0]


    def train(self):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        with tf.Session(config = config, graph = self.graph) as self.sess:
            all_train_feed_dict = {
                self.input_x : self.train_x,
                self.input_y : self.train_y,
                self.dropout_proba : 1.0
            }
            self.sess.run(tf.global_variables_initializer())
            n_batch = int(np.ceil(self.train_m / self.batch_size))
            print(n_batch)
            for epoch in range(self.epochs):
                for batch_index in range(n_batch):
                    batch_x = self.train_x[batch_index * self.batch_size : 
                                           self.batch_size * (batch_index + 1)]
                    batch_y = self.train_y[batch_index * self.batch_size : 
                                           self.batch_size * (batch_index + 1)]
                    train_feed_dict = {
                        self.input_x : batch_x,
                        self.input_y : batch_y,
                        self.dropout_proba : self.dropout_proba_val
                    }
                    self.sess.run(self.train_step, 
                                 feed_dict = train_feed_dict)                    
                if(self.dev_x is not None and self.dev_y is not None 
                   and epoch % self.dev_epoch == 0):
                    dev_feed_dict = {
                        self.input_x : self.dev_x,
                        self.input_y : self.dev_y,
                        self.dropout_proba : 1.0
                    }
                    print('epoch ', epoch)
                    print('loss', epoch, self.sess.run(self.loss, 
                                               feed_dict = all_train_feed_dict))
                    p = self.sess.run(self.prediction,
                                     feed_dict = dev_feed_dict)
                    print('dev f1', epoch, f1_score(p, np.argmax(self.dev_y, axis = 1)))
                    print('dev acc', epoch, self.acc.eval(feed_dict = dev_feed_dict))
                    
                    train_p = self.sess.run(self.prediction,
                                     feed_dict = all_train_feed_dict)
                    print('train f1', epoch, f1_score(train_p, np.argmax(self.train_y, axis = 1)))
                    print('train acc', epoch, self.acc.eval(feed_dict = all_train_feed_dict))
ba = BiLSTMAttentionEncoder(cf)
ba.build()
ba.fit(train_x, train_y, dev_x, dev_y)
ba.train()

gg
(?, 100, 128) (?, 128)
(?, 100)
(?, 100)
Tensor("biLSTM/Reshape_3:0", shape=(?, 100, 1), dtype=float32)
(?, 128, 1)
(?, 128)
tt (?, 128)
at (?, 128)
128 (?, 100, 128)
(?,)
()
782
epoch  0
loss 0 0.6382714
dev f1 0 0.4710376987471207
dev acc 0 0.6234
train f1 0 0.4985676571364377
train acc 0 0.64292
epoch  1
loss 1 0.5227395
dev f1 1 0.7767509901166731
dev acc 1 0.74972
train f1 1 0.8047874833267242
train acc 1 0.7834
epoch  2
loss 2 0.48147577
dev f1 2 0.7977159065462417
dev acc 2 0.7832
train f1 2 0.8420135600924207
train acc 2 0.83316
epoch  3
loss 3 0.43446606
dev f1 3 0.8193995745016153
dev acc 3 0.81664
train f1 3 0.8831086690121358
train acc 3 0.88172
epoch  4
loss 4 0.41321597
dev f1 4 0.8246883302677294
dev acc 4 0.82844
train f1 4 0.8971734892787524
train acc 4 0.89872
epoch  5
loss 5 0.39711747
dev f1 5 0.8322230957011049
dev acc 5 0.8378
train f1 5 0.912631664897526
train acc 5 0.9144
epoch  6
loss 6 0.3912948
dev f1 6 0.826909183630105
dev acc 6 0.83708
train f1 6 0.9179