In [1]:
# !wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv

In [2]:
import nltk
import tensorflow as tf
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from tqdm import tqdm
import collections
from unidecode import unidecode
from sklearn.model_selection import train_test_split
wnlmt = nltk.WordNetLemmatizer().lemmatize


In [3]:
def build_dataset(words, n_words):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def str_idx(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus),maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i][:maxlen][::-1]):
            val = dic[k] if k in dic else UNK
            X[i,-1 - no]= val
    return X

def wn_pos(word_tag): #词性标注
    if word_tag.startswith('J'):
        return wordnet.ADJ
    elif word_tag.startswith('R'):
        return wordnet.ADV
    elif word_tag.startswith('V'):
        return wordnet.VERB
    else:
        return wordnet.NOUN

def lemmatized(tokenized): #词形还原
    word_pos_tags = nltk.pos_tag(tokenized)
    lemmatized = []
    for each in word_pos_tags:
        lemmatized.append(wnlmt(each[0], wn_pos(each[1])))
    return lemmatized

def cleaning(string): #预处理
    stopwd = set(stopwords.words('english'))
    string = re.sub('[^A-Za-z\- ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    tokenized = nltk.word_tokenize(string.lower())
    tokenized1 = [x for x in tokenized if not x in stopwd]
    tokenized2 = lemmatized(tokenized1)
    string.join(tokenized2)
    return string

In [4]:
df = pd.read_csv('train.tsv', delimiter='\t').dropna()
df.head()

Unnamed: 0,id,tid1,tid2,text1,text2,is_duplicate
0,0,0,1,How is the life of a math student ? Could you ...,Which level of prepration be enough for the ex...,0
1,1,2,3,How do I control my horny emotion ?,how do you control your horniness ?,1
2,2,4,5,what causes stool color to change to yellow ?,What can cause stool to come out as little ball ?,0
3,3,6,7,what can one do after MBBS ?,What do i do after my mbb ?,1
4,4,8,9,where can I find a power outlet for my laptop ...,"Would a second airport in Sydney , Australia b...",0


In [5]:
left, right, label = df['text1'].tolist(), df['text2'].tolist(), df['is_duplicate'].tolist()

In [6]:
np.unique(label, return_counts = True)

(array([0, 1]), array([27554, 16320], dtype=int64))

In [7]:
for i in tqdm(range(len(left))):
    left[i] = cleaning(left[i])
    right[i] = cleaning(right[i])

100%|███████████████████████████████████████████████████████████████████████████| 43874/43874 [01:29<00:00, 489.76it/s]


In [8]:
concat = ' '.join(left + right).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 35431
Most common words [('the', 41151), ('be', 31228), ('I', 24078), ('a', 23084), ('to', 22216), ('do', 21413)]
Sample data [18, 16, 4, 70, 12, 7, 223, 183, 549, 20] ['How', 'is', 'the', 'life', 'of', 'a', 'math', 'student', 'Could', 'you']


In [9]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, learning_rate, dropout):
        
        def cells(size, reuse=False):
            cell = tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse)
            return tf.contrib.rnn.DropoutWrapper(cell,output_keep_prob=dropout)
        
        def birnn(inputs, scope):
            with tf.variable_scope(scope, reuse = tf.AUTO_REUSE):
                for n in range(num_layers):
                    (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw = cells(size_layer // 2),
                        cell_bw = cells(size_layer // 2),
                        inputs = inputs,
                        dtype = tf.float32,
                        scope = 'bidirectional_rnn_%d'%(n))
                    inputs = tf.concat((out_fw, out_bw), 2)
                return inputs[:,-1]
        
        self.X_left = tf.placeholder(tf.int32, [None, None])
        self.X_right = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None])
        self.batch_size = tf.shape(self.X_left)[0]
        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded_left = tf.nn.embedding_lookup(encoder_embeddings, self.X_left)
        embedded_right = tf.nn.embedding_lookup(encoder_embeddings, self.X_right)
        
        def contrastive_loss(y,d):
            tmp= y * tf.square(d)
            tmp2 = (1-y) * tf.square(tf.maximum((1 - d),0))
            return tf.reduce_sum(tmp +tmp2)/tf.cast(self.batch_size,tf.float32)/2
        
        self.output_left = birnn(embedded_left, 'left')
        self.output_right = birnn(embedded_right, 'right')
        self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.subtract(self.output_left,self.output_right)),
                                              1,keep_dims=True))
        self.distance = tf.div(self.distance, tf.add(tf.sqrt(tf.reduce_sum(tf.square(self.output_left),
                                                                           1,keep_dims=True)),
                                                     tf.sqrt(tf.reduce_sum(tf.square(self.output_right),
                                                                           1,keep_dims=True))))
        self.distance = tf.reshape(self.distance, [-1])
        self.cost = contrastive_loss(self.Y,self.distance)
        
        self.temp_sim = tf.subtract(tf.ones_like(self.distance),
                                    tf.rint(self.distance))
        correct_predictions = tf.equal(self.temp_sim, self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)

In [10]:
size_layer = 256
num_layers = 2
embedded_size = 128
learning_rate = 1e-3
maxlen = 50
batch_size = 128
dropout = 0.8

In [11]:
from sklearn.model_selection import train_test_split

vectors_left = str_idx(left, dictionary, maxlen)
vectors_right = str_idx(right, dictionary, maxlen)
train_X_left, test_X_left, train_X_right, test_X_right, train_Y, test_Y = train_test_split(vectors_left,
                                                                                           vectors_right,
                                                                                           label,
                                                                                           test_size = 0.2)

In [12]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer,num_layers,embedded_size,len(dictionary),learning_rate,dropout)
sess.run(tf.global_variables_initializer())

W0823 14:13:40.013050  1404 deprecation.py:323] From <ipython-input-9-679a746d3c87>:6: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W0823 14:13:51.052431  1404 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0823 14:13:51.077724  1404 deprecation.py:323] From <ipython-input-9-679a746d3c87>:17: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.Bi

In [13]:
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(range(0, len(train_X_left), batch_size), desc='train minibatch loop')
    for i in pbar:
        batch_x_left = train_X_left[i:min(i+batch_size,train_X_left.shape[0])]
        batch_x_right = train_X_right[i:min(i+batch_size,train_X_left.shape[0])]
        batch_y = train_Y[i:min(i+batch_size,train_X_left.shape[0])]
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                           feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y})
        assert not np.isnan(loss)
        train_loss += loss
        train_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    pbar = tqdm(range(0, len(test_X_left), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x_left = test_X_left[i:min(i+batch_size,train_X_left.shape[0])]
        batch_x_right = test_X_right[i:min(i+batch_size,train_X_left.shape[0])]
        batch_y = test_Y[i:min(i+batch_size,train_X_left.shape[0])]
        acc, loss = sess.run([model.accuracy, model.cost], 
                           feed_dict = {model.X_left : batch_x_left, 
                                        model.X_right: batch_x_right,
                                        model.Y : batch_y})
        test_loss += loss
        test_acc += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
    
    train_loss /= (len(train_X_left) / batch_size)
    train_acc /= (len(train_X_left) / batch_size)
    test_loss /= (len(test_X_left) / batch_size)
    test_acc /= (len(test_X_left) / batch_size)
    
    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
    
    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'%(EPOCH,train_loss,
                                                                                          train_acc,test_loss,
                                                                                          test_acc))

train minibatch loop: 100%|███████████████████████████████| 275/275 [03:20<00:00,  1.39it/s, accuracy=0.63, cost=0.121]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.79it/s, accuracy=0.746, cost=0.0919]


epoch: 0, pass acc: 0.000000, current acc: 0.668097
time taken: 214.2397837638855
epoch: 0, training loss: 0.111672, training acc: 0.650548, valid loss: 0.109145, valid acc: 0.668097



train minibatch loop: 100%|██████████████████████████████| 275/275 [03:30<00:00,  1.48it/s, accuracy=0.519, cost=0.113]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.59it/s, accuracy=0.775, cost=0.0882]


epoch: 0, pass acc: 0.668097, current acc: 0.679562
time taken: 224.59613633155823
epoch: 0, training loss: 0.106515, training acc: 0.674104, valid loss: 0.106699, valid acc: 0.679562



train minibatch loop: 100%|██████████████████████████████| 275/275 [03:18<00:00,  1.64it/s, accuracy=0.63, cost=0.0997]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.71it/s, accuracy=0.789, cost=0.0901]


epoch: 0, pass acc: 0.679562, current acc: 0.690593
time taken: 212.37023782730103
epoch: 0, training loss: 0.102890, training acc: 0.687643, valid loss: 0.105111, valid acc: 0.690593



train minibatch loop: 100%|█████████████████████████████| 275/275 [03:18<00:00,  1.68it/s, accuracy=0.667, cost=0.0903]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.75it/s, accuracy=0.761, cost=0.0846]


epoch: 0, pass acc: 0.690593, current acc: 0.699983
time taken: 212.57806158065796
epoch: 0, training loss: 0.098942, training acc: 0.707693, valid loss: 0.102661, valid acc: 0.699983



train minibatch loop: 100%|█████████████████████████████| 275/275 [03:17<00:00,  1.69it/s, accuracy=0.815, cost=0.0789]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.89it/s, accuracy=0.803, cost=0.0806]


time taken: 210.40756630897522
epoch: 0, training loss: 0.095467, training acc: 0.724189, valid loss: 0.101625, valid acc: 0.698206



train minibatch loop: 100%|█████████████████████████████| 275/275 [03:17<00:00,  1.69it/s, accuracy=0.852, cost=0.0754]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.87it/s, accuracy=0.817, cost=0.0776]


epoch: 0, pass acc: 0.699983, current acc: 0.702172
time taken: 210.89612555503845
epoch: 0, training loss: 0.092274, training acc: 0.739566, valid loss: 0.100399, valid acc: 0.702172



train minibatch loop: 100%|█████████████████████████████| 275/275 [03:19<00:00,  1.39it/s, accuracy=0.815, cost=0.0674]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.39it/s, accuracy=0.831, cost=0.0786]


epoch: 0, pass acc: 0.702172, current acc: 0.708760
time taken: 213.8218686580658
epoch: 0, training loss: 0.089420, training acc: 0.750856, valid loss: 0.100102, valid acc: 0.708760



train minibatch loop: 100%|█████████████████████████████| 275/275 [03:18<00:00,  1.69it/s, accuracy=0.889, cost=0.0646]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.82it/s, accuracy=0.803, cost=0.0769]


epoch: 0, pass acc: 0.708760, current acc: 0.710514
time taken: 212.0547821521759
epoch: 0, training loss: 0.086522, training acc: 0.760243, valid loss: 0.099528, valid acc: 0.710514



train minibatch loop: 100%|█████████████████████████████| 275/275 [03:19<00:00,  1.67it/s, accuracy=0.889, cost=0.0526]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.09it/s, accuracy=0.803, cost=0.0773]


epoch: 0, pass acc: 0.710514, current acc: 0.710742
time taken: 213.16681265830994
epoch: 0, training loss: 0.083244, training acc: 0.775942, valid loss: 0.100288, valid acc: 0.710742



train minibatch loop: 100%|█████████████████████████████| 275/275 [03:20<00:00,  1.68it/s, accuracy=0.926, cost=0.0569]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.74it/s, accuracy=0.803, cost=0.0782]


time taken: 213.38555002212524
epoch: 0, training loss: 0.080713, training acc: 0.785764, valid loss: 0.101027, valid acc: 0.699118



train minibatch loop: 100%|██████████████████████████████| 275/275 [03:19<00:00,  1.67it/s, accuracy=0.926, cost=0.057]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.76it/s, accuracy=0.761, cost=0.0789]


time taken: 213.549631357193
epoch: 0, training loss: 0.079407, training acc: 0.790237, valid loss: 0.101414, valid acc: 0.694741



train minibatch loop: 100%|█████████████████████████████| 275/275 [03:18<00:00,  1.69it/s, accuracy=0.926, cost=0.0537]
test minibatch loop: 100%|████████████████████████████████| 69/69 [00:13<00:00,  5.83it/s, accuracy=0.761, cost=0.0797]


time taken: 212.27347779273987
epoch: 0, training loss: 0.076916, training acc: 0.800379, valid loss: 0.102748, valid acc: 0.683459

break epoch:0



In [14]:
#测试输出
import pandas as pd
tf = pd.read_csv('test.tsv', delimiter='\t').dropna()
tf.head()

Unnamed: 0,id,tid1,tid2,text1,text2
0,363849,96631,12526,which be good movie in history ?,which are best movie in history ?
1,363850,494389,494390,Why be climate scientists say that the situati...,Will mosquitoes be bad in DC this year because...
2,363851,77966,37913,How can I lose weight in a month without do ex...,How can I loose weight naturally without do ex...
3,363852,48345,43382,How be our universe before the Big Bang ? Was ...,what actually existed before the Big Bang ?
4,363853,494391,494392,"what is a "" warm "" "" sim card ? """,how do I spoil a SIM card ?


In [15]:
left0, right0 = tf['text1'].tolist(), tf['text2'].tolist()

In [16]:
left = str_idx(left0, dictionary, maxlen)
right = str_idx(right0, dictionary, maxlen)
array0, array1 = sess.run([model.temp_sim,1-model.distance], feed_dict = {model.X_left : left, 
                                        model.X_right: right})
list = array0.tolist()
name = ['is_duplicate']
test = pd.DataFrame(columns=name,data=list)
test.to_csv('testcsv.csv',encoding='utf-8')