In [38]:
import sys, os, _pickle as pickle
import tensorflow as tf
import numpy as np
import nltk
from itertools import chain

In [2]:
# Filesystem layout: raw data lives under `data`, everything that is saved
# during training lives under the checkpoint root.
data_dir = 'data'
ckpt_dir = 'checkpoint'
# Checkpoint sub-directories are spelled relative to the checkpoint root.
word_embd_dir = ckpt_dir + '/word_embd_100'
model_dir = ckpt_dir + '/model'

In [3]:
# --- Embedding widths (one embedding table per input channel) ---
word_embd_dim = 100
pos_embd_dim = 25
dep_embd_dim = 25

# --- Vocabulary sizes (number of rows in each embedding table) ---
word_vocab_size = 400001
# 9 coarse POS tags + the catch-all 'OTH' tag (id 9).
pos_vocab_size = 10
# 20 dependency labels + the catch-all 'OTH' label (id 20). The table must
# have 21 rows, otherwise ids 10..20 fall outside the embedding matrix.
dep_vocab_size = 21

# --- Model / training settings ---
relation_classes = 19   # width of the output layer (number of logits)
state_size = 100        # LSTM hidden size per channel
batch_size = 10
channels = 3            # word, POS and dependency channels
lambda_l2 = 0.0001      # weight of the L2 penalty added to the loss

In [4]:
with tf.name_scope("input"):
    # Shape shared by every per-position tensor: fixed batch size, second
    # dimension left unspecified.
    padded_shape = [batch_size, None]

    # One length per example; passed to dynamic_rnn as `sequence_length`.
    sequence_length = tf.placeholder(dtype=tf.int32, shape=[batch_size], name="sequence_length")

    # Integer ids for the three input channels.
    word_ids = tf.placeholder(dtype=tf.int32, shape=padded_shape, name="word_ids")
    pos_ids = tf.placeholder(dtype=tf.int32, shape=padded_shape, name="pos_ids")
    dep_ids = tf.placeholder(dtype=tf.int32, shape=padded_shape, name="dep_ids")

    # Labels fed to the cross-entropy loss.
    # NOTE(review): presumably one-hot rows of width `relation_classes` — confirm.
    y = tf.placeholder(dtype=tf.int32, shape=padded_shape, name="y")

In [5]:
with tf.name_scope("word_embedding"):
    embedding_shape = [word_vocab_size, word_embd_dim]

    # The table starts as all zeros; real vectors are pushed in by running
    # `embedding_init` with `embedding_placeholder` fed, or by restoring
    # the variable through `word_embedding_saver`.
    W = tf.Variable(tf.constant(0.0, shape=embedding_shape), name="W")
    embedding_placeholder = tf.placeholder(dtype=tf.float32, shape=embedding_shape)
    embedding_init = W.assign(embedding_placeholder)

    # Look up one embedding row per word id.
    embedded_word = tf.nn.embedding_lookup(params=W, ids=word_ids)

    # Saver restricted to the word-embedding table only.
    word_embedding_saver = tf.train.Saver(var_list={"word_embedding/W": W})

In [6]:
with tf.name_scope("pos_embedding"):
    # POS-tag embedding table, randomly initialised with tf.random_uniform
    # defaults.
    pos_table_init = tf.random_uniform(shape=[pos_vocab_size, pos_embd_dim])
    W = tf.Variable(initial_value=pos_table_init, name="W")

    # Look up one embedding row per POS id.
    embedded_pos = tf.nn.embedding_lookup(params=W, ids=pos_ids)

    # Saver restricted to the POS embedding table only.
    pos_embedding_saver = tf.train.Saver(var_list={"pos_embedding/W": W})

In [7]:
with tf.name_scope("dep_embedding"):
    # Dependency-label embedding table, randomly initialised with
    # tf.random_uniform defaults.
    dep_table_init = tf.random_uniform(shape=[dep_vocab_size, dep_embd_dim])
    W = tf.Variable(initial_value=dep_table_init, name="W")

    # Look up one embedding row per dependency id.
    embedded_dep = tf.nn.embedding_lookup(params=W, ids=dep_ids)

    # Saver restricted to the dependency embedding table only.
    dep_embedding_saver = tf.train.Saver(var_list={"dep_embedding/W": W})

In [8]:
# All-zero initial LSTM state, stacked along the first axis so that slice
# `i` belongs to channel `i`.
zero_state_shape = [channels, batch_size, state_size]
hidden_states = tf.zeros(shape=zero_state_shape, name='hidden_state')
cell_states = tf.zeros(shape=zero_state_shape, name='cell_state')

In [9]:
# Build one initial LSTM state per channel from the stacked zero tensors.
# NOTE(review): LSTMStateTuple's fields are (c, h), so the hidden tensor is
# passed as `c` and the cell tensor as `h`. Both are all zeros, so the
# values are unaffected — swap the arguments if the states ever become
# non-zero.
init_states = []
for channel in range(channels):
    channel_state = tf.contrib.rnn.LSTMStateTuple(hidden_states[channel], cell_states[channel])
    init_states.append(channel_state)

In [10]:
with tf.variable_scope("word_lstm"):
    # LSTM over the word-embedding channel, starting from channel 0's zero state.
    cell = tf.contrib.rnn.BasicLSTMCell(num_units=state_size)
    state_series, current_state = tf.nn.dynamic_rnn(
        cell=cell,
        inputs=embedded_word,
        sequence_length=sequence_length,
        initial_state=init_states[0],
    )
    # Max over axis 1 of the per-step outputs: one vector per example.
    state_series_word = tf.reduce_max(input_tensor=state_series, axis=1)

In [11]:
with tf.variable_scope("pos_lstm"):
    # LSTM over the POS-embedding channel, starting from channel 1's zero state.
    cell = tf.contrib.rnn.BasicLSTMCell(num_units=state_size)
    state_series, current_state = tf.nn.dynamic_rnn(
        cell=cell,
        inputs=embedded_pos,
        sequence_length=sequence_length,
        initial_state=init_states[1],
    )
    # Max over axis 1 of the per-step outputs: one vector per example.
    state_series_pos = tf.reduce_max(input_tensor=state_series, axis=1)

In [12]:
with tf.variable_scope("dep_lstm"):
    # LSTM over the dependency-embedding channel, starting from channel 2's zero state.
    cell = tf.contrib.rnn.BasicLSTMCell(num_units=state_size)
    state_series, current_state = tf.nn.dynamic_rnn(
        cell=cell,
        inputs=embedded_dep,
        sequence_length=sequence_length,
        initial_state=init_states[2],
    )
    # Max over axis 1 of the per-step outputs: one vector per example.
    state_series_dep = tf.reduce_max(input_tensor=state_series, axis=1)

In [13]:
# Join the three pooled channel vectors along axis 1. This rebinds
# `state_series`, which previously held the last LSTM's per-step outputs.
pooled_channels = [state_series_word, state_series_pos, state_series_dep]
state_series = tf.concat(values=pooled_channels, axis=1)

In [14]:
with tf.name_scope("hidden_layer"):
    # Output projection from the concatenated channel features to class logits.
    # tf.truncated_normal takes (shape, mean, stddev): the previous positional
    # call (-0.1, 0.1) produced weights centred on -0.1 instead of a symmetric
    # small-magnitude init. Name the arguments and centre the init on zero.
    W = tf.Variable(
        tf.truncated_normal([channels * state_size, relation_classes], mean=0.0, stddev=0.1),
        name="W",
    )
    b = tf.Variable(tf.zeros([relation_classes]), name="b")

    # One row of `relation_classes` logits per example.
    logits = tf.matmul(state_series, W) + b
    # Index of the highest-scoring class for each example.
    predictions = tf.argmax(logits, 1)

In [15]:
# Step counter incremented by the optimizer on every update. It is bookkeeping,
# not a model parameter, so it is excluded from tf.trainable_variables()
# (and therefore from gradient computation and L2 regularisation).
global_step = tf.Variable(0, name="global_step", trainable=False)

In [16]:
# Collect the trainable variables that should receive the L2 penalty.
tv_all = tf.trainable_variables()

# Variables excluded from regularisation, matched on their full graph name.
# The POS table is created as `W` inside the "pos_embedding" name scope, so its
# name is "pos_embedding/W:0"; the previous entry "pos_embedding/Variable:0"
# never matched and the table was regularised by accident.
# NOTE(review): "dep_embedding/W:0" is not excluded — confirm whether the
# dependency embeddings are meant to be regularised.
non_reg = ["word_embedding/W:0", "pos_embedding/W:0", "global_step:0"]

tv_regu = [t for t in tv_all if t.name not in non_reg]

In [17]:
with tf.name_scope("loss"):
    # L2 penalty: lambda_l2 times the sum of tf.nn.l2_loss over every
    # variable selected for regularisation.
    per_variable_l2 = [tf.nn.l2_loss(v) for v in tv_regu]
    l2_loss = lambda_l2 * tf.reduce_sum(per_variable_l2)

    # Mean softmax cross-entropy over the batch.
    per_example_xent = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(per_example_xent)

    # Objective that is actually minimised.
    total_loss = loss + l2_loss

In [18]:
# Adam with learning rate 0.001. Note that `optimizer` holds the result of
# `minimize(...)` (the update to run each step), not the optimizer object.
adam = tf.train.AdamOptimizer(learning_rate=0.001)
optimizer = adam.minimize(loss=total_loss, global_step=global_step)

In [19]:
# Open the session used by the rest of the notebook. It is never closed in the
# visible cells — NOTE(review): call sess.close() when finished.
sess = tf.Session()

In [20]:
# Initialise every global variable. The word-embedding table is all zeros
# after this step.
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [21]:
# latest_embd = tf.train.latest_checkpoint(word_embd_dir)
# word_embedding_saver.restore(sess, latest_embd)

In [22]:
# Load the word vocabulary. The context manager guarantees the file is closed
# even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [23]:
# f = open('data/word_embedding', 'rb')
# word_embedding = pickle.load(f)
# f.close()

In [24]:
# Two-way lookup between words and their position in `vocab`.
word2id = {word: index for index, word in enumerate(vocab)}
id2word = dict(enumerate(vocab))

In [25]:
# Reserve the last row of the word-embedding table for out-of-vocabulary words.
# Valid ids are 0 .. word_vocab_size - 1; the previous forward mapping used
# `word_vocab_size`, which is out of range and disagreed with the reverse
# mapping below.
unknown_token = "UNKNOWN_TOKEN"
unknown_id = word_vocab_size - 1
word2id[unknown_token] = unknown_id
id2word[unknown_id] = unknown_token

In [236]:
# Load the six parallel path lists (words, dependency labels and POS tags,
# each in a p1 and a p2 variant). The context manager closes the file even if
# unpickling or unpacking raises.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('train_paths', 'rb') as f:
    word_p1, word_p2, dep_p1, dep_p2, pos_p1, pos_p2 = pickle.load(f)

In [77]:
# Batches per pass over the 8000 examples processed by this notebook.
total_examples = 8000
num_batches = int(total_examples / batch_size)
num_batches

800

In [78]:
# Read the POS tag vocabulary, one tag per line. The file is opened in a
# context manager so the handle is closed deterministically instead of being
# left to the garbage collector.
with open('data/pos_tags.txt') as pos_tags_file:
    pos_tags_vocab = [line.strip() for line in pos_tags_file]

In [144]:
# Display the POS tag vocabulary read from data/pos_tags.txt.
pos_tags_vocab

['CC', 'CD', 'DT', 'IN', 'JJ', 'NN', 'PRP', 'RB', 'VB']

In [143]:
# Read the dependency-label vocabulary, one label per line. The file is opened
# in a context manager so the handle is closed deterministically.
with open('data/dependency_types.txt') as dep_types_file:
    dep_vocab = [line.strip() for line in dep_types_file]

In [147]:
# Display the dependency-label vocabulary together with its size.
dep_vocab, len(dep_vocab)

(['root',
  'nmod',
  'nsubj',
  'dobj',
  'nsubjpass',
  'compound',
  'conj',
  'acl',
  'advcl',
  'ccomp',
  'amod',
  'acl:relcl',
  'xcomp',
  'dep',
  'appos',
  'nmod:poss',
  'advmod',
  'parataxis',
  'csubj',
  'iobj'],
 20)

In [79]:
# Two-way lookup between coarse POS tags and their position in `pos_tags_vocab`.
pos_tag2id = {tag: index for index, tag in enumerate(pos_tags_vocab)}
id2pos_tag = {index: tag for index, tag in enumerate(pos_tags_vocab)}

In [199]:
# Two-way lookup between dependency labels and their position in `dep_vocab`.
id2dep = dict(enumerate(dep_vocab))
dep2id = {label: index for index, label in enumerate(dep_vocab)}

In [214]:
# Inspect dep_p2. This dumps the entire list; a slice such as dep_p2[:10]
# would keep the output readable.
dep_p2

['dobj',
 'root',
 'root',
 'root',
 'root',
 'root',
 'root',
 ['nmod', 'root'],
 ['compound', 'nmod', 'root'],
 ['nmod', 'conj'],
 ['nmod', 'acl', 'nsubj'],
 ['nmod', 'root'],
 ['dobj', 'acl', 'dobj'],
 ['nmod', 'root'],
 ['amod', 'nmod', 'root'],
 ['nmod', 'acl', 'root'],
 ['nmod', 'appos', 'dobj', 'root'],
 ['conj', 'dobj', 'root'],
 ['nmod'],
 ['nmod', 'nmod'],
 ['nmod', 'root'],
 ['nmod', 'dobj'],
 ['dobj', 'root'],
 ['dobj', 'advcl', 'xcomp', 'root'],
 ['nmod', 'nmod'],
 ['nmod', 'root'],
 ['nmod', 'nmod', 'root'],
 ['nmod', 'dobj', 'root'],
 ['amod', 'nmod', 'acl', 'dobj'],
 ['nmod', 'nsubj'],
 ['compound', 'nmod', 'root'],
 ['nmod', 'root'],
 ['nmod'],
 ['nmod', 'advcl'],
 ['compound', 'nmod', 'nmod'],
 ['nmod', 'xcomp'],
 ['nmod', 'root'],
 ['nmod', 'root'],
 ['nmod', 'nmod', 'root'],
 ['nmod', 'nmod', 'nmod', 'dobj', 'root'],
 ['nsubj', 'ccomp', 'acl:relcl', 'root'],
 ['nmod', 'root'],
 ['nmod', 'root'],
 ['nmod', 'root'],
 ['compound', 'conj', 'nmod', 'root'],
 ['dobj', 'ro

In [80]:
# Register the catch-all POS tag under id 9 in both lookup directions.
pos_tag2id.update({'OTH': 9})
id2pos_tag.update({9: 'OTH'})

In [148]:
# Register the catch-all dependency label under id 20 in both lookup directions.
# NOTE(review): id 20 is only a valid embedding row if dep_vocab_size >= 21.
dep2id.update({'OTH': 20})
id2dep.update({20: 'OTH'})

In [81]:
JJ_pos_tags = ['JJ', 'JJR', 'JJS']
NN_pos_tags = ['NN', 'NNS', 'NNP', 'NNPS']
RB_pos_tags = ['RB', 'RBR', 'RBS']
PRP_pos_tags = ['PRP', 'PRP$']
VB_pos_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
_pos_tags = ['CC', 'CD', 'DT', 'IN']

In [82]:
 def pos_tag(x):
     """Return the integer id of the coarse POS tag that `x` collapses to.

     Tags listed in one of the JJ/NN/RB/PRP/VB groups map to the id of that
     group's coarse tag; tags in `_pos_tags` map to their own id; every
     other tag maps to 9 (the id registered for 'OTH').
     """
     # Groups are checked in the same order as the coarse tags appear here.
     coarse_groups = (
         ('JJ', JJ_pos_tags),
         ('NN', NN_pos_tags),
         ('RB', RB_pos_tags),
         ('PRP', PRP_pos_tags),
         ('VB', VB_pos_tags),
     )
     for coarse_tag, members in coarse_groups:
         if x in members:
             return pos_tag2id[coarse_tag]
     if x in _pos_tags:
         return pos_tag2id[x]
     return 9

In [83]:
# Display the tag -> id mapping, including the 'OTH' entry.
pos_tag2id

{'CC': 0,
 'CD': 1,
 'DT': 2,
 'IN': 3,
 'JJ': 4,
 'NN': 5,
 'OTH': 9,
 'PRP': 6,
 'RB': 7,
 'VB': 8}

In [180]:
# NOTE(review): presumably a scratch value for manual checks — confirm it is
# still needed, otherwise delete this cell.
x = 'NN'

In [183]:
# Echo the scratch value `x`.
x

'NN'

In [185]:
# Peek at the first POS tag of the first entry in pos_p1.
pos_p1[0][0]

'NN'

(5, ['NN'])

In [130]:
# Start the combined dependency-path list with the first 8000 entries of dep_p1.
dep = [dep_p1[index] for index in range(8000)]

In [131]:
# Append the first 8000 entries of dep_p2 to the combined list.
dep.extend(dep_p2[index] for index in range(8000))

In [132]:
# Sanity check: 8000 entries from dep_p1 plus 8000 from dep_p2.
len(dep)

16000

In [133]:
# Count every label across all collected dependency paths. Despite the name,
# `word_freq` holds dependency-label frequencies.
all_dep_labels = chain.from_iterable(dep)
word_freq = nltk.FreqDist(all_dep_labels)

In [141]:
# Show the 20 most frequent dependency labels with their counts.
word_freq.most_common(20)

[('root', 9272),
 ('nmod', 8340),
 ('nsubj', 4337),
 ('dobj', 3590),
 ('nsubjpass', 1497),
 ('compound', 889),
 ('conj', 703),
 ('acl', 637),
 ('advcl', 553),
 ('ccomp', 451),
 ('amod', 445),
 ('acl:relcl', 412),
 ('xcomp', 318),
 ('dep', 178),
 ('appos', 161),
 ('nmod:poss', 139),
 ('advmod', 54),
 ('parataxis', 48),
 ('csubj', 28),
 ('iobj', 17)]

In [154]:
# Inspect word_p1. This dumps the entire list; a slice such as word_p1[:10]
# would keep the output readable.
word_p1

[['configuration'],
 ['child', 'wrapped'],
 ['author', 'uses'],
 ['ridge', 'uprises'],
 ['student', 'association'],
 ['complex'],
 ['inflammation', 'caused'],
 ['People', 'moving'],
 ['lawsonite', 'contained'],
 ['solvent', 'mL', 'pipetted'],
 ['essays'],
 ['composer', 'sunk'],
 ['citation'],
 ['burst', 'caused'],
 ['networks', 'moved'],
 ['call'],
 ['virtuoso', 'finds'],
 ['factory', 'products', 'included'],
 ['tree', 'blossom'],
 ['battalion'],
 ['knowledge', 'gained'],
 ['stable'],
 ['singer', 'caused'],
 ['essays', 'books', 'remain'],
 ['brace'],
 ['dress', 'made'],
 ['Suicide', 'one'],
 ['article', 'gives'],
 ['meeting'],
 ['timer'],
 ['offer', 'made'],
 ['headaches', 'pains', 'had'],
 ['country', 'father'],
 ['stack', 'loading'],
 ['plumbing'],
 ['electricity', 'produce'],
 ['plan', 'ran'],
 ['provinces', 'divided'],
 ['stress', 'one'],
 ['Newspapers', 'swap'],
 ['women'],
 ['transmitter', 'discovered'],
 ['chain'],
 ['student', 'released'],
 ['Calluses', 'caused'],
 ['Adults', '

In [194]:
# Normalise the first 8000 examples in place:
#   * words are lower-cased and replaced by `unknown_token` when they are not
#     in `word2id`;
#   * dependency labels not in `dep2id` are replaced by 'OTH'.
# The loaded lists are overwritten, so the raw values are no longer available
# after this cell has run.
for i in range(8000):
    for word_paths in (word_p1, word_p2):
        tokens = word_paths[i]
        for position, raw_word in enumerate(tokens):
            lowered = raw_word.lower()
            tokens[position] = lowered if lowered in word2id else unknown_token
    for dep_paths in (dep_p1, dep_p2):
        labels = dep_paths[i]
        for position, label in enumerate(labels):
            labels[position] = label if label in dep2id else 'OTH'

In [173]:
def _words_to_ids(paths):
    """Map every word of every path to its id via `word2id` and wrap the result in an array."""
    return np.asarray([[word2id[token] for token in path] for path in paths])


# Word-id versions of both word path lists.
word_p1_ids = _words_to_ids(word_p1)
word_p2_ids = _words_to_ids(word_p2)

In [193]:
# Coarse POS-tag ids for both POS path lists, computed with pos_tag().
pos_p1_ids, pos_p2_ids = (
    np.asarray([[pos_tag(tag) for tag in path] for path in paths])
    for paths in (pos_p1, pos_p2)
)

In [None]:
# Dependency-label ids for both dependency path lists. The second line
# previously recomputed dep_p1_ids (copy-paste slip), leaving dep_p2_ids
# undefined.
dep_p1_ids = np.asarray([[dep2id[w] for w in d] for d in dep_p1])
dep_p2_ids = np.asarray([[dep2id[w] for w in d] for d in dep_p2])
