In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import numpy as np
import tensorflow as tf
from utils import getWordEmbedding

#parser = argparse.ArgumentParser()
#parser.add_argument('w2v_model', type=str)
#args = parser.parse_args()
#
#w2v_path = args.w2v_model
w2v_path = 'word2vec_model/blogwiki_size200_alpha01_iter20.model'
traindata_path = 'data/data_train.txt'
testdata_path = 'data/data_test.txt'


In [2]:
(word2id, id2word, embedding_matrix) = getWordEmbedding(w2v_path)
wordemb_dim = embedding_matrix.shape[1]


print('number of vocab = %d' %(len(word2id)), flush=True)
print('embedding_dim = %d' %(wordemb_dim), flush=True)

print(word2id['蘋果'])
print(word2id['香蕉'])
print(np.linalg.norm(embedding_matrix[word2id['蘋果']] - embedding_matrix[word2id['香蕉']]))
print(np.linalg.norm(embedding_matrix[word2id['蘋果']] - embedding_matrix[word2id['鳳梨']]))
print(np.linalg.norm(embedding_matrix[word2id['蘋果']] - embedding_matrix[word2id['天氣']]))
print(np.linalg.norm(embedding_matrix[word2id['攝氏']] - embedding_matrix[word2id['蘋果']]))

number of vocab = 675933
embedding_dim = 200
2410
6604
29.939506664653464
29.177828479926113
53.53760289456816
51.288254315522046


In [3]:
def getData(path, w2id):
    x = []
    y = []
    fp = open(path, 'r', encoding='utf8')
    for line in fp:
        line_split = line.strip().split('\t')
        y.append(int(line_split[0]))
        x.append(line_split[1:])
    fp.close()
    y = np.array(y)
    # one hot
    n_values = np.max(y) + 1
    y = np.eye(n_values)[y]
    # word to word_id
    x = [[w2id[word] for word in _ if word in w2id] for _ in x]
    return (x, y)

train_x_id, train_y = getData(traindata_path, word2id)
test_x_id, test_y = getData(testdata_path, word2id)

print(train_x_id[0:10])
print(train_y[0:10])

[[1279, 909, 4268], [3, 1856, 148, 683], [193, 886], [1450, 205, 148, 3973, 342, 44], [178, 329, 571, 4354], [8714, 24889, 0, 6823, 880, 636], [55, 31, 22, 1087], [14956, 2, 10107], [713, 95838], [1450, 205, 5, 2651, 240, 44]]
[[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]]


In [4]:
def _getSentVec(x, embedding_matrix):
    dim = embedding_matrix.shape[1]
    v_sum = np.zeros(dim)
    for id in x:
        v_sum += embedding_matrix[id]
    return v_sum/len(x)

train_x = np.array([_getSentVec(x, embedding_matrix) for x in train_x_id])
test_x = np.array([_getSentVec(x, embedding_matrix) for x in test_x_id])
train_data_cnt = train_x.shape[0]
#print(train_x[0])

In [5]:
print(np.linalg.norm(train_x[20] - train_x[25]))
print(np.linalg.norm(train_x[20] - train_x[34]))
print(np.linalg.norm(train_x[0] - train_x[3]))
print(np.linalg.norm(train_x[0] - train_x[4]))
print(np.linalg.norm(train_x[0] - train_x[5]))
print("=========================================")
print(np.linalg.norm(train_x[20] - train_x[18]))
print(np.linalg.norm(train_x[20] - train_x[33]))
print(np.linalg.norm(train_x[20] - train_x[913]))
print(np.linalg.norm(train_x[1081] - train_x[3]))
print(np.linalg.norm(train_x[1081] - train_x[4]))

18.923957978776286
16.2601450869681
28.386265530580374
29.780533865659823
26.37557881399238
23.77293741881347
28.426846818552363
15.943822633920009
20.62106352144115
22.406443381184456


In [7]:
STDDEV = 0.1
BATCH_SIZE = 16
TRAINING_EPOCHS = 1000
LEARNING_RATE = 0.000001
H1_SIZE = 200
H2_SIZE = 200
H3_SIZE = 200

X = tf.placeholder(tf.float32, [None, wordemb_dim])
y = tf.placeholder(tf.float32, [None, 2])
dropout_keep_prob = tf.placeholder(tf.float32)

def mlp(_X, _weights, _biases, dropout_keep_prob):
    layer1 = tf.nn.dropout(tf.nn.tanh(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1'])), dropout_keep_prob)
    layer2 = tf.nn.dropout(tf.nn.tanh(tf.add(tf.matmul(layer1, _weights['h2']), _biases['b2'])), dropout_keep_prob)
    layer3 = tf.nn.dropout(tf.nn.tanh(tf.add(tf.matmul(layer2, _weights['h3']), _biases['b3'])), dropout_keep_prob)
    out = tf.nn.tanh(tf.add(tf.matmul(layer3, _weights['out']), _biases['out']))
    return out

weights = {
    'h1': tf.Variable(tf.random_normal([wordemb_dim, H1_SIZE],stddev=STDDEV)),
    'h2': tf.Variable(tf.random_normal([H1_SIZE, H2_SIZE],stddev=STDDEV)),
    'h3': tf.Variable(tf.random_normal([H2_SIZE, H3_SIZE],stddev=STDDEV)),
    'out': tf.Variable(tf.random_normal([H3_SIZE, 2],stddev=STDDEV)),                                   
}

biases = {
    'b1': tf.Variable(tf.random_normal([H1_SIZE])),
    'b2': tf.Variable(tf.random_normal([H2_SIZE])),
    'b3': tf.Variable(tf.random_normal([H3_SIZE])),
    'out': tf.Variable(tf.random_normal([2]))
}


pred = mlp(X, weights, biases, dropout_keep_prob)

# Loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred))
optimizer = tf.train.AdamOptimizer(learning_rate = LEARNING_RATE).minimize(cost)

# Accuracy
predict_label = tf.argmax(pred, 1)
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))



total_parameters = 0
#iterating over all variables
for variable in tf.trainable_variables():  
    local_parameters=1
    shape = variable.get_shape()  #getting shape of a variable
    for i in shape:
        local_parameters*=i.value  #mutiplying dimension values
    total_parameters+=local_parameters
print('total_parameters = %d' %(total_parameters))


init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
for epoch in range(TRAINING_EPOCHS):
    epoch += 1
    avg_cost = 0.
    total_batch = int(train_data_cnt / BATCH_SIZE)
    # Loop over all batches
    for i in range(total_batch):
        idx_start = i*BATCH_SIZE
        idx_end = i*BATCH_SIZE + BATCH_SIZE
        if(idx_end >= train_data_cnt):
            idx_end = train_data_cnt
        batch_xs = train_x[idx_start:idx_end, :]
        batch_ys = train_y[idx_start:idx_end, :]
        # Fit training using batch data
        sess.run(optimizer, feed_dict={X: batch_xs, y: batch_ys, dropout_keep_prob: 0.9})
        # Compute average loss
        avg_cost += sess.run(cost, feed_dict={X: batch_xs, y: batch_ys, dropout_keep_prob:1.})/total_batch
    # Display logs per epoch step
    if epoch % 10 == 0 or epoch == (TRAINING_EPOCHS):
        print ("Epoch: %03d/%03d cost: %.15f" % (epoch, TRAINING_EPOCHS, avg_cost))
        train_acc = sess.run(accuracy, feed_dict={X: train_x, y: train_y, dropout_keep_prob:1.})
        [test_acc, test_pred] = sess.run([accuracy, predict_label], feed_dict={X: test_x, y: test_y, dropout_keep_prob:1.})
        print ("Training accuracy: %.3f" % (train_acc))
        print ("Test accuracy: %.3f" % (test_acc))
        print('')
        
        # write test result
        fp = open('test_epoch{}.txt'.format(epoch), 'w', encoding='utf8')
        fp.write('target, predict, sentence\n')
        for i in range(len(test_y)):
            fp.write('{}, {}, {}\n'.format(np.argmax(test_y[i]), test_pred[i], ''.join([id2word[w] for w in test_x_id[i]])))
        fp.close()
        


print ("End of training.\n")

total_parameters = 242004
Epoch: 010/1000 cost: 0.470147437020524
Training accuracy: 0.805
Test accuracy: 0.740

Epoch: 020/1000 cost: 0.417251051288761
Training accuracy: 0.809
Test accuracy: 0.760

Epoch: 030/1000 cost: 0.382682934607545
Training accuracy: 0.815
Test accuracy: 0.760

Epoch: 040/1000 cost: 0.353547553085301
Training accuracy: 0.825
Test accuracy: 0.760

Epoch: 050/1000 cost: 0.329386675806895
Training accuracy: 0.835
Test accuracy: 0.800

Epoch: 060/1000 cost: 0.308738411493497
Training accuracy: 0.860
Test accuracy: 0.860

Epoch: 070/1000 cost: 0.290122000730201
Training accuracy: 0.894
Test accuracy: 0.870

Epoch: 080/1000 cost: 0.269885666362227
Training accuracy: 0.917
Test accuracy: 0.880

Epoch: 090/1000 cost: 0.248735563599900
Training accuracy: 0.931
Test accuracy: 0.900

Epoch: 100/1000 cost: 0.228889254674520
Training accuracy: 0.950
Test accuracy: 0.910

Epoch: 110/1000 cost: 0.213220237664980
Training accuracy: 0.962
Test accuracy: 0.930

Epoch: 120/1000 c

Epoch: 950/1000 cost: 0.129856348445971
Training accuracy: 0.998
Test accuracy: 0.990

Epoch: 960/1000 cost: 0.129795667039205
Training accuracy: 0.999
Test accuracy: 0.990

Epoch: 970/1000 cost: 0.129758075287897
Training accuracy: 0.999
Test accuracy: 0.990

Epoch: 980/1000 cost: 0.129697406332787
Training accuracy: 0.999
Test accuracy: 0.990

Epoch: 990/1000 cost: 0.129656375679251
Training accuracy: 0.999
Test accuracy: 0.990

Epoch: 1000/1000 cost: 0.129614419316592
Training accuracy: 0.999
Test accuracy: 0.990

End of training.

