In [1]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.WARN)
import pickle
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import os
from tensorflow.python.client import device_lib
from collections import Counter
import time

In [2]:
f = open('../../Glove/word_embedding_glove', 'rb')
word_embedding = pickle.load(f)
f.close()

word_embedding = word_embedding[: len(word_embedding)-1]

f = open('../../Glove/vocab_glove', 'rb')
vocab = pickle.load(f)
f.close()

word2id = dict((w, i) for i,w in enumerate(vocab))
id2word = dict((i, w) for i,w in enumerate(vocab))

unknown_token = "UNKNOWN_TOKEN"

# Model Description
model_name = 'model-aw-1-multigpu-1'
model_dir = '../output/' + model_name
save_dir = os.path.join(model_dir, "save/")
log_dir = os.path.join(model_dir, "log")

if not os.path.exists(model_dir):
    os.mkdir(model_dir)
if not os.path.exists(save_dir):
    os.mkdir(save_dir)
if not os.path.exists(log_dir):
    os.mkdir(log_dir)

with open('/data/aviraj/dataset/train_data','rb') as f:
    train_data=pickle.load(f)
    
with open('/data/aviraj/dataset/val_data','rb') as f:
    val_data=pickle.load(f)
    

with open('/data/aviraj/dataset/fulldata_vocab_sense','rb') as f:
    vocab_sense=pickle.load(f)

    
sense2id = dict((s, i) for i,s in enumerate(vocab_sense))
id2sense = dict((i, s) for i,s in enumerate(vocab))

len(vocab_sense)

# Parameters
mode = 'train'
num_senses = len(vocab_sense)
batch_size = 32
vocab_size = len(vocab)
unk_vocab_size = 1
word_emb_size = len(word_embedding[0])
max_sent_size = 200
hidden_size = 200
keep_prob = 0.5
l2_lambda = 0.001
init_lr = 0.005
decay_steps = 500
decay_rate = 0.96
clip_norm = 1
clipping = True
moving_avg_deacy = 0.999
num_gpus = 12

In [3]:
def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)

            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)

        # Average over the 'tower' dimension.
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)

        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads

In [4]:
# MODEL
device_num = 0
tower_grads = []
losses = []
predictions = []

x = tf.placeholder('int32', [num_gpus, batch_size, max_sent_size], name="x")
y = tf.placeholder('int32', [num_gpus, batch_size, max_sent_size], name="y")
x_mask  = tf.placeholder('bool', [num_gpus, batch_size, max_sent_size], name='x_mask') 
sense_mask  = tf.placeholder('bool', [num_gpus, batch_size, max_sent_size], name='sense_mask')
is_train = tf.placeholder('bool', [], name='is_train')
word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')
input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))

global_step = tf.Variable(0, trainable=False, name="global_step")
learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)

with tf.variable_scope("word_embedding"):
    unk_word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))
    final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)

with tf.variable_scope(tf.get_variable_scope()):
    for gpu_idx in range(num_gpus):
        if gpu_idx>6:
            device_num = 1
        with tf.name_scope("model_{}".format(gpu_idx)) as scope, tf.device('/gpu:%d' % device_num):

            if gpu_idx > 0:
                    tf.get_variable_scope().reuse_variables()

            with tf.name_scope("word"):
                Wx = tf.nn.embedding_lookup(final_word_emb_mat, x[gpu_idx])  

            x_len = tf.reduce_sum(tf.cast(x_mask[gpu_idx], 'int32'), 1)

            with tf.variable_scope("lstm1"):
                cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)
                cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)

                d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)
                d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)

                (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')
                h1 = tf.concat([fw_h1, bw_h1], 2)

            with tf.variable_scope("lstm2"):
                cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)
                cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)

                d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)
                d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)

                (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')
                h = tf.concat([fw_h2, bw_h2], 2)

            def attention(input_x, input_mask, W_att):
                h_masked = tf.boolean_mask(input_x, input_mask)
                h_tanh = tf.tanh(h_masked)
                u = tf.matmul(h_tanh, W_att)
                a = tf.nn.softmax(u)
                c = tf.reduce_sum(tf.multiply(h_tanh, a), 0)  
                return c

            with tf.variable_scope("attention"):
                W_att = tf.get_variable("W_att", shape=[2*hidden_size, 1], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=0))
                c = tf.expand_dims(attention(h[0], x_mask[gpu_idx][0], W_att), 0)
                for i in range(1, batch_size):
                    c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[gpu_idx][i], W_att), 0)], 0)
                
                cc = tf.expand_dims(c, 1)
                c_final = tf.tile(cc, [1, max_sent_size, 1])
                h_final = tf.concat([c_final, h],2)
                flat_h_final = tf.reshape(h_final, [-1, 4*hidden_size])
           
            with tf.variable_scope("softmax_layer"):
                W = tf.get_variable("W", shape=[4*hidden_size, num_senses], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=0))
                b = tf.get_variable("b", shape=[num_senses], initializer=tf.zeros_initializer())
                drop_flat_h_final = tf.nn.dropout(flat_h_final, input_keep_prob)
                flat_logits_sense = tf.matmul(drop_flat_h_final, W) + b
                logits = tf.reshape(flat_logits_sense, [batch_size, max_sent_size, num_senses])
                predictions.append(tf.arg_max(logits, 2))
    

            float_sense_mask = tf.cast(sense_mask[gpu_idx], 'float')
 
            loss = tf.contrib.seq2seq.sequence_loss(logits, y[gpu_idx], float_sense_mask, name="loss")
            l2_loss = l2_lambda * tf.losses.get_regularization_loss()

            total_loss = loss + l2_loss
            tf.summary.scalar("loss_{}".format(gpu_idx), total_loss)

            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            optimizer = tf.train.AdamOptimizer(learning_rate)
            grads_vars = optimizer.compute_gradients(total_loss)

            clipped_grads = grads_vars
            if(clipping == True):
                clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]

            tower_grads.append(clipped_grads)
            losses.append(total_loss)

tower_grads = average_gradients(tower_grads)
losses = tf.add_n(losses)/len(losses)
apply_grad_op = optimizer.apply_gradients(tower_grads, global_step=global_step)
summaries.append(tf.summary.scalar('total_loss', losses))
summaries.append(tf.summary.scalar('learning_rate', learning_rate))

for var in tf.trainable_variables():
    summaries.append(tf.summary.histogram(var.op.name, var))

variable_averages = tf.train.ExponentialMovingAverage(moving_avg_deacy, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables())

train_op = tf.group(apply_grad_op, variables_averages_op)
saver = tf.train.Saver(tf.global_variables())
summary = tf.summary.merge(summaries)

In [5]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="2,3"
# print (device_lib.list_local_devices())
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())                          # For initializing all the variables
summary_writer = tf.summary.FileWriter(log_dir, sess.graph)          # For writing Summaries

In [6]:
def model(xx, yy, mask, smask, train_cond=True):
    num_batches = int(len(xx)/(batch_size*num_gpus))
    _losses = 0
    preds_sense = []
    true_sense = []
    
    for j in range(num_batches): 
        
        s = j * batch_size * num_gpus
        e = (j+1) * batch_size * num_gpus
        xx_re = xx[s:e].reshape([num_gpus, batch_size, -1])
        yy_re = yy[s:e].reshape([num_gpus, batch_size, -1])
        mask_re = mask[s:e].reshape([num_gpus, batch_size, -1])
        smask_re = smask[s:e].reshape([num_gpus, batch_size, -1])
 
        feed_dict = {x:xx_re, y:yy_re, x_mask:mask_re, sense_mask:smask_re, is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}
        
        
        if(train_cond==True):
            _, _loss, step, _summary = sess.run([train_op, losses, global_step, summary], feed_dict)
            summary_writer.add_summary(_summary, step)
            
            if((j+1)%100==0):
                print("Steps: {}".format(step), ", Loss: {0:.4f}".format(_loss))

        else:
            _loss, pred = sess.run([total_loss, predictions], feed_dict)
            for i in range(num_gpus):
                preds_sense.append(pred[i][smask_re[i]])
                true_sense.append(yy_re[i][smask_re[i]])
        

        _losses +=_loss

    if(train_cond==False): 
        sense_preds = []
        sense_true = []
        for preds in preds_sense:
            for ps in preds:      
                sense_preds.append(ps)
                
        for trues in true_sense:
            for ts in trues:
                sense_true.append(ts)
        return _losses/num_batches, sense_preds, sense_true 

    return _losses/num_batches, step

def eval_score(yy, pred):
    f1 = f1_score(yy, pred, average='macro')
    accu = accuracy_score(yy, pred)
    return f1*100, accu*100

In [7]:
x_id_train, mask_train, sense_mask_train, y_train = train_data
x_id_val, mask_val, sense_mask_val, y_val = val_data

In [None]:
num_epochs = 2
log_period = 1

for i in range(num_epochs):
    random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)
    x_id_train = x_id_train[random]
    y_train = y_train[random]
    mask_train = mask_train[random]    
    sense_mask_train = sense_mask_train[random]
    
    start_time = time.time()
    train_loss, step = model(x_id_train, y_train, mask_train, sense_mask_train)
    time_taken = time.time() - start_time
    print("Epoch: {}".format(i+1),", Step: {}".format(step), ", loss: {0:.4f}".format(train_loss), ", Time: {0:.4f}".format(time_taken))
    
    if((i+1)%log_period==0):
        saver.save(sess, save_path=save_dir)                         
        print("Model Saved")
        
        start_time = time.time()
        val_loss, val_pred, val_true = model(x_id_val, y_val, mask_val, sense_mask_val, train_cond=False)        
        f1_, accu_= eval_score(val_true, val_pred)
        time_taken = time.time() - start_time
        print("Val: F1 Score:{0:.4f}".format(f1_), "Accuracy:{0:.4f}".format(accu_), "Loss:{0:.4f}".format(val_loss), ", Time: {0:.4f}".format(time_taken))
    

Steps: 100 , Loss: 3.0485
Steps: 200 , Loss: 2.2137
Steps: 300 , Loss: 1.8979
Steps: 400 , Loss: 1.6761
Steps: 500 , Loss: 1.5920
Steps: 600 , Loss: 1.5361
Steps: 700 , Loss: 1.4590
Steps: 800 , Loss: 1.4345
Steps: 900 , Loss: 1.2928
Steps: 1000 , Loss: 1.4597
Steps: 1100 , Loss: 1.4113
Steps: 1200 , Loss: 1.3651
Steps: 1300 , Loss: 1.3343
Steps: 1400 , Loss: 1.2913
Steps: 1500 , Loss: 1.3164
Steps: 1600 , Loss: 1.3258
Steps: 1700 , Loss: 1.2264
Epoch: 1 , Step: 1771 , loss: 1.8143 , Time: 14138.6841
Model Saved


  'precision', 'predicted', average, warn_for)


Val: F1 Score:51.3272 Accuracy:60.5387 Loss:1.2050 , Time: 2425.1928
Steps: 1871 , Loss: 1.2054
Steps: 1971 , Loss: 1.1874
Steps: 2071 , Loss: 1.2680
Steps: 2171 , Loss: 1.2125
Steps: 2271 , Loss: 1.2071
Steps: 2371 , Loss: 1.2205
Steps: 2471 , Loss: 1.1439
Steps: 2571 , Loss: 1.0482
Steps: 2671 , Loss: 1.2246
Steps: 2771 , Loss: 1.1670
Steps: 2871 , Loss: 1.2029
Steps: 2971 , Loss: 1.0693
Steps: 3071 , Loss: 1.1728
Steps: 3171 , Loss: 1.1016
Steps: 3271 , Loss: 1.2222
Steps: 3371 , Loss: 0.9987
Steps: 3471 , Loss: 1.0802
Epoch: 2 , Step: 3542 , loss: 1.1549 , Time: 14065.3056
Model Saved
Val: F1 Score:55.4466 Accuracy:63.9468 Loss:1.0955 , Time: 2423.6312


In [None]:
num_epochs = 10
log_period = 2

for i in range(num_epochs):
    random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)
    x_id_train = x_id_train[random]
    y_train = y_train[random]
    mask_train = mask_train[random]    
    sense_mask_train = sense_mask_train[random]
    
    start_time = time.time()
    train_loss, step = model(x_id_train, y_train, mask_train, sense_mask_train)
    time_taken = time.time() - start_time
    print("Epoch: {}".format(i+1),", Step: {}".format(step), ", loss: {0:.4f}".format(train_loss), ", Time: {0:.4f}".format(time_taken))
    
    if((i+1)%log_period==0):
        saver.save(sess, save_path=save_dir)                         
        print("Model Saved")
        
        start_time = time.time()
        val_loss, val_pred, val_true = model(x_id_val, y_val, mask_val, sense_mask_val, train_cond=False)        
        f1_, accu_= eval_score(val_true, val_pred)
        time_taken = time.time() - start_time
        print("Val: F1 Score:{0:.4f}".format(f1_), "Accuracy:{0:.4f}".format(accu_), "Loss:{0:.4f}".format(val_loss), ", Time: {0:.4f}".format(time_taken))
    

Steps: 3642 , Loss: 1.1003
Steps: 3742 , Loss: 1.0612
Steps: 3842 , Loss: 1.0594
Steps: 3942 , Loss: 0.9088
Steps: 4042 , Loss: 1.0412
Steps: 4142 , Loss: 1.0980
Steps: 4242 , Loss: 1.0316
Steps: 4342 , Loss: 0.9793
Steps: 4442 , Loss: 0.9439
Steps: 4542 , Loss: 1.0453
Steps: 4642 , Loss: 0.9672
Steps: 4742 , Loss: 0.9784
Steps: 4842 , Loss: 0.9913
Steps: 4942 , Loss: 1.0337
Steps: 5042 , Loss: 0.9985
Steps: 5142 , Loss: 1.0390
Steps: 5242 , Loss: 1.1115
Epoch: 1 , Step: 5313 , loss: 1.0618 , Time: 14066.7044
Steps: 5413 , Loss: 1.0690
Steps: 5513 , Loss: 0.9906
Steps: 5613 , Loss: 1.2237
Steps: 5713 , Loss: 1.0799
Steps: 5813 , Loss: 0.9769
Steps: 5913 , Loss: 1.0619
Steps: 6013 , Loss: 1.0210
Steps: 6113 , Loss: 0.9732
Steps: 6213 , Loss: 0.9382
Steps: 6313 , Loss: 0.9956
Steps: 6413 , Loss: 1.0123
Steps: 6513 , Loss: 1.0839
Steps: 6613 , Loss: 0.9480
Steps: 6713 , Loss: 1.0051
Steps: 6813 , Loss: 1.0396
Steps: 6913 , Loss: 0.9396


In [9]:
num_epochs = 10
log_period = 2

for i in range(num_epochs):
    random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)
    x_id_train = x_id_train[random]
    y_train = y_train[random]
    mask_train = mask_train[random]    
    sense_mask_train = sense_mask_train[random]
    
    start_time = time.time()
    train_loss, step = model(x_id_train, y_train, mask_train, sense_mask_train)
    time_taken = time.time() - start_time
    print("Epoch: {}".format(i+1),", Step: {}".format(step), ", loss: {0:.4f}".format(train_loss), ", Time: {0:.4f}".format(time_taken))
    
    if((i+1)%log_period==0):
        saver.save(sess, save_path=save_dir)                         
        print("Model Saved")
        
        start_time = time.time()
        val_loss, val_pred, val_true = model(x_id_val, y_val, mask_val, sense_mask_val, train_cond=False)        
        f1_, accu_= eval_score(val_true, val_pred)
        time_taken = time.time() - start_time
        print("Val: F1 Score:{0:.4f}".format(f1_), "Accuracy:{0:.4f}".format(accu_), "Loss:{0:.4f}".format(val_loss), ", Time: {0:.4f}".format(time_taken))
    

Steps: 7184 , Loss: 0.9888
Steps: 7284 , Loss: 0.9900
Steps: 7384 , Loss: 0.8940
Steps: 7484 , Loss: 1.0504
Steps: 7584 , Loss: 1.0108
Steps: 7684 , Loss: 0.9642
Steps: 7784 , Loss: 0.8625
Steps: 7884 , Loss: 1.0234
Steps: 7984 , Loss: 0.8952
Steps: 8084 , Loss: 1.0351
Steps: 8184 , Loss: 0.9888
Steps: 8284 , Loss: 0.8881
Steps: 8384 , Loss: 1.0352
Steps: 8484 , Loss: 1.0123
Steps: 8584 , Loss: 1.0111
Steps: 8684 , Loss: 1.0296
Steps: 8784 , Loss: 0.8988
Epoch: 1 , Step: 8855 , loss: 0.9601 , Time: 14310.6745
Steps: 8955 , Loss: 0.8639
Steps: 9055 , Loss: 0.9848
Steps: 9155 , Loss: 0.9639
Steps: 9255 , Loss: 0.8563
Steps: 9355 , Loss: 0.9502
Steps: 9455 , Loss: 0.9274
Steps: 9555 , Loss: 1.0012
Steps: 9655 , Loss: 1.0142
Steps: 9755 , Loss: 0.9328
Steps: 9855 , Loss: 0.8996
Steps: 9955 , Loss: 0.9454
Steps: 10055 , Loss: 0.9533
Steps: 10155 , Loss: 0.9255
Steps: 10255 , Loss: 0.9105
Steps: 10355 , Loss: 1.0548
Steps: 10455 , Loss: 1.0310
Steps: 10555 , Loss: 0.9779
Epoch: 2 , Step: 106

  'precision', 'predicted', average, warn_for)


Val: F1 Score:61.2510 Accuracy:69.0362 Loss:0.9079 , Time: 2433.7403


In [None]:
num_epochs = 10
log_period = 2

for i in range(num_epochs):
    random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)
    x_id_train = x_id_train[random]
    y_train = y_train[random]
    mask_train = mask_train[random]    
    sense_mask_train = sense_mask_train[random]
    
    start_time = time.time()
    train_loss, step = model(x_id_train, y_train, mask_train, sense_mask_train)
    time_taken = time.time() - start_time
    print("Epoch: {}".format(i+1),", Step: {}".format(step), ", loss: {0:.4f}".format(train_loss), ", Time: {0:.4f}".format(time_taken))
    
    if((i+1)%log_period==0):
        saver.save(sess, save_path=save_dir)                         
        print("Model Saved")
        
        start_time = time.time()
        val_loss, val_pred, val_true = model(x_id_val, y_val, mask_val, sense_mask_val, train_cond=False)        
        f1_, accu_= eval_score(val_true, val_pred)
        time_taken = time.time() - start_time
        print("Val: F1 Score:{0:.4f}".format(f1_), "Accuracy:{0:.4f}".format(accu_), "Loss:{0:.4f}".format(val_loss), ", Time: {0:.4f}".format(time_taken))
    

Steps: 10726 , Loss: 0.9620
Steps: 10826 , Loss: 0.9219
Steps: 10926 , Loss: 0.9762
Steps: 11026 , Loss: 0.9036
Steps: 11126 , Loss: 0.9085
Steps: 11226 , Loss: 0.8913
Steps: 11326 , Loss: 1.0364
Steps: 11426 , Loss: 0.9325
Steps: 11526 , Loss: 1.0462
Steps: 11626 , Loss: 0.8649
Steps: 11726 , Loss: 0.8723
Steps: 11826 , Loss: 0.9361
Steps: 11926 , Loss: 0.7772
Steps: 12026 , Loss: 0.8808
Steps: 12126 , Loss: 0.8480
Steps: 12226 , Loss: 0.9181
Steps: 12326 , Loss: 0.7882
Epoch: 1 , Step: 12397 , loss: 0.9000 , Time: 14521.2565
Steps: 12497 , Loss: 0.8247


In [8]:
saver.restore(sess, save_dir)