# Tim Molleman & David Ruhe: Quora Question Pairs

## Imports

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch import autograd
from sklearn.preprocessing import minmax_scale
import numpy as np
import torch.nn.functional as F
import zipfile
import urllib
import re
import csv
import gensim
import pandas as pd

In [None]:
# Use GPU? When True, the model, its initial LSTM states and every batch are
# moved to the GPU with `.cuda()`; when False everything stays on the CPU.
cuda = False

## Preprocessing

In [None]:
# Load word vectors (binary word2vec format) from the local Data/ directory.
# Later cells use `word_vecs` for membership tests (`word in word_vecs`) and
# to look up the embedding of each vocabulary word.
word_vecs = gensim.models.KeyedVectors.load_word2vec_format('Data/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Function for feature extraction.

# Positions (in the per-pair feature row) that are min-max scaled over the
# whole data set: the word/char counts and differences, and the two word
# intersection features. The average-word-length and 0/1 flag features are
# left unscaled.
_SCALED_FEATURE_COLUMNS = (0, 1, 2, 3, 4, 5, 13, 14)


def _pair_features(s1, s2):
    """Return the 15 raw (unscaled) hand-crafted features of one sentence pair."""
    # Normalise once so counting, splitting and comparing all see the same text.
    s1, s2 = str(s1), str(s2)
    tokens1, tokens2 = s1.split(), s2.split()

    words_s1, words_s2 = len(tokens1), len(tokens2)
    chars_s1 = len(s1.replace(' ', ''))
    chars_s2 = len(s2.replace(' ', ''))
    avg_word_length1 = chars_s1 / words_s1 if words_s1 > 0 else 0
    avg_word_length2 = chars_s2 / words_s2 if words_s2 > 0 else 0

    first_same = last_same = both_same = 0
    if tokens1 and tokens2:
        first_same = 1 if tokens1[0] == tokens2[0] else 0
        last_same = 1 if tokens1[-1] == tokens2[-1] else 0
        # Compare the per-pair flags (not the accumulator lists, which made
        # this feature constantly 0).
        both_same = 1 if first_same == 1 and last_same == 1 else 0

    sentence_same = 1 if s1 == s2 else 0

    shared_words = len(set(tokens1).intersection(tokens2))
    total_words = words_s1 + words_s2
    shared_ratio = (shared_words * 2) / total_words if total_words > 0 else 0

    return (
        words_s1, words_s2, words_s1 - words_s2,
        chars_s1, chars_s2, chars_s1 - chars_s2,
        avg_word_length1, avg_word_length2, avg_word_length1 - avg_word_length2,
        first_same, last_same, both_same, sentence_same,
        shared_words, shared_ratio,
    )


def get_extra_features(S1, S2):
    """Compute hand-crafted similarity features for aligned sentence pairs.

    Args:
        S1: sequence of first sentences (each item is converted with ``str``).
        S2: sequence of second sentences, aligned with ``S1``.

    Returns:
        A list with one 15-element list per pair, in this order: word count
        of s1, word count of s2, word-count difference, character count of
        s1 and s2 (spaces excluded), character-count difference, average
        word length of s1 and s2, their difference, first-word-equal flag,
        last-word-equal flag, both-equal flag, identical-sentence flag,
        number of distinct shared words, and shared-word ratio
        ``2 * shared / (words_s1 + words_s2)``.
        The count, difference and intersection features are min-max scaled
        over the given pairs with ``minmax_scale``; the average-word-length
        and flag features are returned unscaled. An empty input yields ``[]``.
    """
    rows = [_pair_features(s1, s2) for s1, s2 in zip(S1, S2)]
    if not rows:
        # minmax_scale cannot handle an empty column; nothing to scale anyway.
        return []

    # Transpose to columns so each feature can be scaled over the data set.
    columns = [list(column) for column in zip(*rows)]

    # Minmax scaling
    for ix in _SCALED_FEATURE_COLUMNS:
        columns[ix] = minmax_scale(columns[ix])

    # Transpose back to one feature list per sentence pair.
    return [list(row) for row in zip(*columns)]

In [None]:
# This is the train file from https://www.kaggle.com/c/quora-question-pairs
# newline='' leaves line-break handling to the csv module, which it needs in
# order to parse quoted fields that contain embedded newlines correctly.
with open('Data/train_official.csv', newline='') as trf:
    rows = csv.reader(trf)
    next(rows) # skip header (as a parsed record, not as a raw line).
    # Transpose: train_data[k] is the tuple of all values of column k.
    train_data = list(zip(*rows))

In [None]:
# newline='' leaves line-break handling to the csv module, which it needs in
# order to parse quoted fields that contain embedded newlines correctly.
with open('Data/test_data.csv', newline='') as tf:
    rows = csv.reader(tf)
    next(rows) # skip header (as a parsed record, not as a raw line).
    # Transpose: test_data[k] is the tuple of all values of column k.
    test_data = list(zip(*rows))

In [None]:
# Sanity check: number of rows loaded per split (length of one column tuple).
print(len(train_data[1]))
print(len(test_data[1]))

In [None]:
# Pick the sentence, label and id columns out of the column-wise tuples.
train_s1, train_s2 = train_data[3], train_data[4]
training_labels = [int(label) for label in train_data[5]] # Convert to ints.

test_ids, test_s1, test_s2 = test_data[0], test_data[1], test_data[2]

In [None]:
# Cleaning function. Borrowed from: https://www.kaggle.com/currie32/the-importance-of-cleaning-text/notebook
def clean_text(text):
    """Lower-case a question and normalise it into space-separated tokens.

    The input is converted with ``str`` first, then an ordered list of regex
    substitutions is applied: unsupported characters become spaces,
    contractions are expanded, punctuation and operators are dropped or
    spaced out, a few abbreviations are rewritten, and finally runs of
    whitespace are collapsed to a single space. Leading or trailing single
    spaces are not stripped.
    """
    # Order matters: later rules rely on the spacing produced by earlier ones.
    substitutions = (
        # NOTE(review): inside the class, `+-=` is a character *range*
        # ('+' through '='), so ':', ';' and '<' also survive this step.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): `\0` is the NUL character in a regex, so this only
        # matches NUL followed by "s"; possibly "0s" was intended - confirm.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [None]:
# Cleaning: run every question of both splits through clean_text.
train_s1 = list(map(clean_text, train_s1))
train_s2 = list(map(clean_text, train_s2))
test_s1 = list(map(clean_text, test_s1))
test_s2 = list(map(clean_text, test_s2))

### Establish word_to_ix dictionary

In [None]:
from collections import Counter

# Embedding width is read off an arbitrary vector of the loaded model.
EMBEDDING_DIM = word_vecs['the'].shape[0]

# Every whitespace-separated token of every cleaned question, train and test.
all_words = [
    word
    for sentence in (train_s1 + train_s2 + test_s1 + test_s2)
    for word in sentence.split()
]
counts = Counter(all_words)
# Distinct words, most frequent first.
corpus = [word for word, _ in counts.most_common()]

VOCAB_SIZE = 47500

print(len(corpus))

# The vocabulary cannot be larger than the number of distinct words.
assert VOCAB_SIZE <= len(corpus)

def get_vocabulary_and_embeddings(corpus, vocab_size=None, embedding_dim=None, vectors=None):
    """Build the word -> index vocabulary and its embedding matrix.

    Indices 0, 1 and 2 are reserved for '<PAD>', '<EOS>' and '<UNKNOWN>' and
    keep all-zero embedding rows. Words are then taken from ``corpus`` in
    order; a word is added only if it has a vector in ``vectors`` and is not
    in the vocabulary yet.

    Args:
        corpus: iterable of candidate words, most important first.
        vocab_size: maximum vocabulary size including the three reserved
            tokens; defaults to the module-level ``VOCAB_SIZE``.
        embedding_dim: width of one embedding row; defaults to the
            module-level ``EMBEDDING_DIM``.
        vectors: mapping-like object supporting ``word in vectors`` and
            ``vectors[word]``; defaults to the module-level ``word_vecs``.

    Returns:
        ``(vocabulary, emb_matrix)`` where ``emb_matrix`` has exactly
        ``len(vocabulary)`` rows. If the corpus runs out before
        ``vocab_size`` words were found, the matrix is truncated to the rows
        actually filled (previously the function fell off the end and
        returned ``None``).

    Raises:
        ValueError: if ``vocab_size`` is too small for the reserved tokens.
    """
    if vocab_size is None:
        vocab_size = VOCAB_SIZE
    if embedding_dim is None:
        embedding_dim = EMBEDDING_DIM
    if vectors is None:
        vectors = word_vecs

    print ("Loading vocabulary")
    vocabulary = {'<PAD>': 0, '<EOS>': 1, '<UNKNOWN>': 2}
    if vocab_size < len(vocabulary):
        raise ValueError(
            "vocab_size must be at least %d to hold the reserved tokens" % len(vocabulary))

    emb_matrix = np.zeros([vocab_size, embedding_dim])

    n = len(vocabulary)  # next free index
    for word in corpus:
        if n >= vocab_size:
            break
        if word in vocabulary or word not in vectors:
            continue
        vocabulary[word] = n
        emb_matrix[n] = vectors[word]
        n += 1

    print ("Loaded vocabulary and embeddings.")
    # Drop unused trailing rows so the matrix always matches the vocabulary.
    return vocabulary, emb_matrix[:n]
            
# Build the word -> index map with its embedding matrix, and the reverse
# index -> word lookup.
word_to_ix, emb_matrix = get_vocabulary_and_embeddings(corpus)
ix_to_word = {ix:word for word, ix in word_to_ix.items()}

# Every vocabulary entry must have exactly one embedding row.
assert len(word_to_ix) == emb_matrix.shape[0]

### Extract features from sentence pairs.

In [None]:
# Hand-crafted features for every train and test pair, in the original
# (not yet shuffled) row order. Each split is scaled independently because
# get_extra_features min-max scales over the pairs it is given.
extra_features = get_extra_features(train_s1, train_s2)
extra_features_test = get_extra_features(test_s1, test_s2)

# One feature row per test pair.
assert len(test_s1) == len(extra_features_test)

In [None]:
# Shuffle data and hold out a validation split.
from random import shuffle
VALID_SIZE = 20000

ixs = list(range(len(training_labels)))

shuffle(ixs)

# The first VALID_SIZE shuffled indices become the validation set, all
# remaining indices the (shuffled) training set.
valid_ixs = ixs[:VALID_SIZE]
train_ixs = ixs[VALID_SIZE:]

valid_s1 = [train_s1[i] for i in valid_ixs]
valid_s2 = [train_s2[i] for i in valid_ixs]
valid_lab = [training_labels[i] for i in valid_ixs]
valid_features = [extra_features[i] for i in valid_ixs]

train_s1_shuf = [train_s1[i] for i in train_ixs]
train_s2_shuf = [train_s2[i] for i in train_ixs]
train_lab_shuf = [training_labels[i] for i in train_ixs]
train_features_shuf = [extra_features[i] for i in train_ixs]

# From here on the training names refer to the shuffled, reduced split;
# its features live in train_features_shuf (extra_features is unchanged).
train_s1 = train_s1_shuf
train_s2 = train_s2_shuf
training_labels = train_lab_shuf

print(len(training_labels))
print(len(valid_lab))

## Sentence Preparation

In [None]:
# Padder.
def pad_seq(seq, max_length):
    """Return a copy of ``seq`` right-padded with 0 to ``max_length`` items.

    0 is the '<PAD>' index of the vocabulary. The input list is left
    untouched; a new list is returned.

    Args:
        seq: list of token indices, at most ``max_length`` long.
        max_length: length of the returned list.

    Raises:
        ValueError: if ``seq`` is longer than ``max_length`` (the check used
            to be hard-coded to 40 regardless of ``max_length``).
    """
    if len(seq) > max_length:
        raise ValueError(
            "sequence of length %d does not fit max_length %d" % (len(seq), max_length))
    return list(seq) + [0] * (max_length - len(seq))

# Convert sentence to ix.
def prepare_sequence(seq, to_ix, max_length=40):
    """Convert a sentence into a list of vocabulary indices ending in '<EOS>'.

    Args:
        seq: sentence string; it is split on whitespace.
        to_ix: word -> index mapping containing '<EOS>' and, for sentences
            with out-of-vocabulary words, '<UNKNOWN>'.
        max_length: maximum length of the result including the trailing
            '<EOS>'; longer sentences are truncated to ``max_length - 1``
            words. Defaults to 40, the value previously hard-coded.

    Returns:
        A list of at most ``max_length`` indices whose last item is
        ``to_ix['<EOS>']``.

    Raises:
        ValueError: if ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must leave room for the <EOS> token")

    idxs = [to_ix[w] if w in to_ix else to_ix['<UNKNOWN>'] for w in seq.split()]
    # Reserve the final slot for the end-of-sentence marker.
    return idxs[:max_length - 1] + [to_ix['<EOS>']]


In [None]:
# Batch Generator.
def generator(batch_size, s1, s2, labels, features, test_mode=False):
    """Yield consecutive mini-batches of padded index tensors.

    Sentences are converted with ``prepare_sequence`` (using the module-level
    ``word_to_ix``) and padded to 40 indices with ``pad_seq``.

    Args:
        batch_size: number of pairs per batch; the last batch may be smaller.
        s1, s2: aligned lists of sentence strings.
        labels: list of 0/1 labels; ignored when ``test_mode`` is True.
        features: list of per-pair feature rows aligned with ``s1``.
        test_mode: when False, labels are validated and yielded.

    Yields:
        ``(input_s1, input_s2, targets, extra_features)`` when ``test_mode``
        is False, otherwise ``(input_s1, input_s2, extra_features)``; every
        item is wrapped in ``Variable``. ``targets`` is a float tensor with
        one column.
    """
    # Evaluated once; deliberately the same `== False` comparison as before.
    with_targets = (test_mode == False)

    if with_targets:
        assert len(s1) == len(s2) == len(labels)

    for start in range(0, len(s1), batch_size):
        stop = start + batch_size

        ids_s1 = [prepare_sequence(sentence, word_to_ix) for sentence in s1[start:stop]]
        ids_s2 = [prepare_sequence(sentence, word_to_ix) for sentence in s2[start:stop]]

        padded_s1 = [pad_seq(ids, 40) for ids in ids_s1]
        padded_s2 = [pad_seq(ids, 40) for ids in ids_s2]

        batch_feats = torch.FloatTensor(features[start:stop])

        # One row per sentence, one column per (padded) token position.
        batch_s1 = torch.LongTensor(padded_s1).view(len(ids_s1), -1)
        batch_s2 = torch.LongTensor(padded_s2).view(len(ids_s2), -1)

        if not with_targets:
            yield Variable(batch_s1), Variable(batch_s2), Variable(batch_feats)
            continue

        batch_labels = labels[start:stop]
        targets = torch.FloatTensor(batch_labels).view(len(batch_labels), 1) # 1 = tagset size.

        assert len(targets) == len(batch_s1) == len(batch_s2)

        yield Variable(batch_s1), Variable(batch_s2), Variable(targets), Variable(batch_feats)

In [None]:
# Sanity check: print the first batch of two training pairs.
# The features must come from train_features_shuf: train_s1/train_s2 were
# replaced by the shuffled training split, while extra_features still holds
# the rows of the original, unshuffled data set and would not line up.
for input_s1, input_s2, targets, extra_feats in generator(2, train_s1, train_s2, training_labels, train_features_shuf):
    print(train_s1[:2])
    print(train_s2[:2])
    print(input_s1)
    print(input_s2)
    print(targets)
    print(extra_feats)
    break

## Hyperparams

In [None]:
EMBEDDING_DIM = 300  # word-vector width; re-declared here (the vocabulary cell read it from word_vecs)
HIDDEN_DIM = 150  # LSTM hidden-state size
DENSE_DIM = 150  # width of the dense layer applied to the flattened LSTM outputs
FEATS_DIM = 64  # width of the linear projection of the hand-crafted features
BATCH_SIZE = 256
TAGSET_SIZE = 1  # one output logit per sentence pair
VOCAB_SIZE = len(word_to_ix)  # re-derived from the vocabulary that was actually built
LEARNING_RATE = 1e-2  # base rate for Adam and for adjust_learning_rate
N_EPOCH = 45
DROP_RATE = 0.25  # passed to nn.LSTM(dropout=...) inside SIAMLSTM
EXTRA_FEATURES = len(extra_features[0])  # number of hand-crafted features per pair
MAX_LEN = 40  # padded sentence length assumed by the model; the batch generator pads to a literal 40

In [None]:
# Learning rate adjuster. This turned out not to be much of a help, and oftentimes we wouldn't use it.

def adjust_learning_rate(optimizer, epoch, base_lr=None, decay=0.9, step=10):
    """Apply a step-wise exponential learning-rate decay to ``optimizer``.

    The rate is ``base_lr * decay ** (epoch // step)``, i.e. with the
    defaults it is multiplied by 0.9 once every 10 epochs.

    Args:
        optimizer: object exposing ``param_groups``, a list of dicts whose
            'lr' entry is overwritten in place.
        epoch: zero-based epoch number.
        base_lr: initial learning rate; defaults to the module-level
            ``LEARNING_RATE``.
        decay: multiplicative factor applied every ``step`` epochs.
        step: number of epochs between two decays.

    Returns:
        The same ``optimizer`` object, for call-site convenience.
    """
    if base_lr is None:
        base_lr = LEARNING_RATE
    lr = base_lr * (decay ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

In [None]:
# Cooijmans(2017) hidden state batchnormalizer.
from bnlstm import SeparatedBatchNorm1d

# Model

In [None]:
class SIAMLSTM(nn.Module):
    """Siamese LSTM classifier for sentence pairs.

    Both sentences go through the same embedding table and the same LSTM.
    The per-timestep LSTM outputs of the two sentences are concatenated,
    flattened and mapped to a dense vector, which is concatenated with a
    linear projection of the hand-crafted features and reduced to
    ``tagset_size`` raw logits (no sigmoid is applied here).

    The module reads the module-level ``MAX_LEN`` (padded sentence length)
    and ``cuda`` (device flag).
    """
    def __init__(self, embedding_dim, vocab_size, hidden_dim, dense_dim, feats_size, tagset_size, batch_size, drop_rate, num_feats):
        """Create all layers.

        Args:
            embedding_dim: width of one word embedding.
            vocab_size: number of rows of the embedding table.
            hidden_dim: LSTM hidden-state size.
            dense_dim: output width of the layer over the flattened LSTM outputs.
            feats_size: output width of the hand-crafted-feature projection.
            tagset_size: number of output logits.
            batch_size: batch size used by ``init_hidden`` for the initial states.
            drop_rate: value passed as ``dropout`` to ``nn.LSTM``.
            num_feats: number of hand-crafted features per pair.
        """
        super(SIAMLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.dense_dim = dense_dim
        
        # NOTE(review): this is a single-layer LSTM; per the PyTorch docs
        # `dropout` only acts between stacked layers, so drop_rate presumably
        # has no effect here - confirm.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, dropout = drop_rate)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Input: the outputs of both sentences at every timestep, flattened.
        self.linmap = nn.Linear(2 * hidden_dim * MAX_LEN, dense_dim)
        self.linfeats = nn.Linear(num_feats, feats_size)
        
        self.fc = nn.Linear(dense_dim + feats_size, tagset_size) 
        
        # The dropout probability is fixed at 0.5 and independent of drop_rate.
        self.drop = nn.Dropout(0.5)
        # bn1 is applied to a (batch, 2*MAX_LEN, hidden_dim) tensor, so the
        # 2*MAX_LEN timestep axis is its feature/channel axis.
        self.bn1 = nn.BatchNorm1d(2*MAX_LEN)
        self.bn2 = nn.BatchNorm1d(dense_dim)
        self.bn3 = nn.BatchNorm1d(dense_dim + feats_size)
        self.hidbn = SeparatedBatchNorm1d(hidden_dim, MAX_LEN)
        
        self.hidden_s1 = self.init_hidden()
        self.hidden_s2 = self.init_hidden()
        
    def init_hidden(self):
        """Return a pair of all-zero initial LSTM states for ``self.batch_size``.

        Each tensor has shape (1, batch_size, hidden_dim) and is moved to the
        GPU when the module-level ``cuda`` flag is set.
        """
        # NOTE(review): nn.LSTM documents its state tuple as (h_0, c_0); the
        # tuple below is named (c0, h0). Both are zeros, so the result is the
        # same either way.
        c0 = autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        h0 = autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        
        if cuda:
            c0 = c0.cuda()
            h0 = h0.cuda()
        
        return (c0, h0)
        
    def forward(self, input_s1, input_s2, extra_feats):
        """Compute the logits for a batch of sentence pairs.

        Args:
            input_s1, input_s2: index tensors, one row per sentence; assumed
                to be (batch, MAX_LEN) as produced by the batch generator -
                the Linear/BatchNorm sizes above depend on it.
            extra_feats: float tensor of hand-crafted features, one row per pair.

        Returns:
            The output of the final linear layer (raw logits).
        """
                
        embedding_s1 = self.embedding(input_s1)
        embedding_s2 = self.embedding(input_s2)
        
        # The LSTM was built without batch_first, so move the sequence axis
        # to the front.
        s1 = embedding_s1.permute(1,0,2)
        s2 = embedding_s2.permute(1,0,2)
        
        # Forward propagate RNN       
        out1, hidden_s1 = self.lstm(s1, self.hidden_s1)
        out2, hidden_s2 = self.lstm(s2, self.hidden_s2)

        # NOTE(review): these are generator expressions, which are lazy;
        # nothing in this class iterates them, so hidbn is never actually
        # applied here. The training/validation/submission loops in this file
        # overwrite both attributes with init_hidden() before each call.
        self.hidden_s1 = (self.hidbn(hidden_layer) for hidden_layer in hidden_s1)
        self.hidden_s2 = (self.hidbn(hidden_layer) for hidden_layer in hidden_s2)
        
        # Back to batch-first layout.
        out1 = out1.permute(1,0,2).contiguous()
        out2 = out2.permute(1,0,2).contiguous()
        
        # Stack the timesteps of both sentences along dim 1.
        out = torch.cat((out1,out2), dim=1)
        
        out = self.drop(out)
        out = self.bn1(out)
        
        # Flatten everything but the batch axis before the dense layer.
        out = F.relu(self.linmap(out.view(len(out), -1)))
        
        out = self.drop(out)
        out = self.bn2(out)
                
        out_feats = self.linfeats(extra_feats)
        
        # Join the sentence representation with the projected features.
        out = torch.cat((out, out_feats), dim=1)
        
        out = self.drop(out)
        out = self.bn3(out)
        
        out = self.fc(out)
                
        return out

model = SIAMLSTM(EMBEDDING_DIM, VOCAB_SIZE, HIDDEN_DIM, DENSE_DIM, FEATS_DIM, TAGSET_SIZE, BATCH_SIZE, DROP_RATE, EXTRA_FEATURES)

# Embeddings can't be trained: load the pre-trained vectors and freeze them.
pre_trained_embs = torch.from_numpy(emb_matrix).float()
model.embedding.weight.data = pre_trained_embs
model.embedding.weight.requires_grad = False
# Only the trainable parameters are handed to the optimizer. A list (rather
# than a one-shot `filter` iterator) can be reused, e.g. when the optimizer
# line is executed again.
parameters = [p for p in model.parameters() if p.requires_grad]

if cuda:
    model = model.cuda()

# Loss and Optimizer
# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(parameters, lr=LEARNING_RATE)

## Runtime

In [None]:
# Per-epoch history of mean batch loss / accuracy.
train_loss = []
val_loss = []
train_acc = []
val_acc = []
# Thresholds a validation epoch has to beat before a checkpoint is written.
lowest_loss = 200
best_score = 0.6

for epoch in range(N_EPOCH):
    
    print("Starting epoch %s" % epoch)
            
    optimizer = adjust_learning_rate(optimizer, epoch) # Optional.
    
    train_batch_loss = []
    train_batch_acc = []
    
    # Training epoch: dropout active, batch-norm uses batch statistics.
    model.train()
    
    for input_s1, input_s2, labels, extra_feats in generator(BATCH_SIZE, train_s1, train_s2, training_labels, train_features_shuf):
        
        if cuda:
            input_s1, input_s2, labels, extra_feats = input_s1.cuda(), input_s2.cuda(), labels.cuda(), extra_feats.cuda()
            
        assert len(input_s1) == len(input_s2) == len(labels)

        model.zero_grad()
        
        # The last batch may be smaller, so size the initial states per batch.
        model.batch_size = len(labels)
        
        model.hidden_s1 = model.init_hidden()
        model.hidden_s2 = model.init_hidden()   
        
        outputs = model(input_s1, input_s2, extra_feats)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # Threshold the probabilities at 0.5 to get hard 0/1 predictions.
        predicted = F.sigmoid(outputs)
        predicted[predicted <= 0.5] = 0
        predicted[predicted > 0.5] = 1
        
        correct = predicted == labels
        
        batch_acc = len(correct[correct == 1]) / len(labels)
                
        train_batch_acc.append(batch_acc)
        # NOTE(review): `loss.data[0]` is the scalar-access idiom of old
        # PyTorch releases; confirm against the installed version.
        train_batch_loss.append(loss.data[0])
    
    train_loss.append(np.mean(train_batch_loss))
    train_acc.append(np.mean(train_batch_acc))
    
    val_batch_loss = []
    val_batch_acc = []
    
    # Validation Epoch.
    # Evaluate in eval mode (no dropout, batch-norm running statistics), the
    # same mode make_submission uses for the test predictions. Without this
    # the validation scores are noisy and validation batches would also
    # update the batch-norm statistics.
    model.eval()
    
    for val_s1, val_s2, val_lab, extra_feats_val in generator(BATCH_SIZE, valid_s1, valid_s2, valid_lab, valid_features):
        
        if cuda:
            val_s1, val_s2, val_lab, extra_feats_val = val_s1.cuda(), val_s2.cuda(), val_lab.cuda(), extra_feats_val.cuda()

        assert len(val_s1) == len(val_s2) == len(val_lab)
        
        model.batch_size = len(val_lab)
        model.hidden_s1 = model.init_hidden()
        model.hidden_s2 = model.init_hidden()   

        val_outputs = model(val_s1, val_s2, extra_feats_val)
        
        batch_val_loss = criterion(val_outputs, val_lab)
        
        predicted = F.sigmoid(val_outputs)
        
        predicted[predicted <= 0.5] = 0
        predicted[predicted > 0.5] = 1
        
        correct = predicted == val_lab
        
        batch_val_acc = len(correct[correct == 1]) / len(val_lab)
                        
        val_batch_acc.append(batch_val_acc)
        val_batch_loss.append(batch_val_loss.data[0])    
    
    
    epoch_val_loss = np.mean(val_batch_loss)
    epoch_val_acc = np.mean(val_batch_acc)
    
    val_loss.append(epoch_val_loss)
    val_acc.append(epoch_val_acc)
    
    # Checkpoint names now match their criterion (they used to be swapped:
    # the best-accuracy weights were written to 'submission_lowloss.pt').
    if epoch_val_acc > best_score:
        print("New best score! %s" % epoch_val_acc)
        torch.save(model.state_dict(), 'submission_highacc.pt')
        best_score = epoch_val_acc
    
    if epoch_val_loss < lowest_loss:
        print("New lowest loss! %s" % epoch_val_loss)
        torch.save(model.state_dict(), 'submission_lowloss.pt')
        lowest_loss = epoch_val_loss
    
    # "Testing" in this message refers to the held-out validation split.
    print('[Epoch: %3d/%3d] Training Loss: %.3f, Testing Loss: %.3f, Training Acc: %.3f, Testing Acc: %.3f'
        % (epoch, N_EPOCH - 1, train_loss[epoch], val_loss[epoch], train_acc[epoch], val_acc[epoch]))
    

# Submission

In [None]:
def make_submission(model, path, test_s1, test_s2, extra_features_test, test_ids, filename):
    """Predict the test pairs with saved weights and write a submission CSV.

    Args:
        model: SIAMLSTM instance whose weights are replaced by those in ``path``.
        path: file with a ``state_dict`` saved by ``torch.save``.
        test_s1, test_s2: aligned lists of cleaned test sentences.
        extra_features_test: hand-crafted feature rows aligned with the sentences.
        test_ids: ids written to the 'test_id' column, aligned with the sentences.
        filename: output path without extension; '.csv' is appended.

    Side effects:
        Loads the weights into ``model``, leaves it in eval mode, and writes
        ``filename + '.csv'`` with the columns 'test_id' and 'is_duplicate'
        (hard 0/1 predictions, probability threshold 0.5).
    """
    if cuda:
        state = torch.load(path)
    else:
        # Map every storage onto the CPU so the checkpoint also loads on a
        # CPU-only run.
        state = torch.load(path, map_location=lambda storage, loc: storage)
    model.load_state_dict(state)
    model.eval()

    predictions = []

    for t1, t2, extra_t in generator(BATCH_SIZE, test_s1, test_s2, None, extra_features_test, test_mode=True):

        if cuda:
            t1, t2, extra_t = t1.cuda(), t2.cuda(), extra_t.cuda()

        # The last batch may be smaller, so size the initial states per batch.
        model.batch_size = len(t1)
        model.hidden_s1 = model.init_hidden()
        model.hidden_s2 = model.init_hidden()

        probabilities = F.sigmoid(model(t1, t2, extra_t))

        # Flatten the (batch, 1) output before converting: calling int() on
        # one-element array rows is deprecated in recent NumPy releases.
        flat = probabilities.data.cpu().numpy().reshape(-1)
        predictions.extend((flat > 0.5).astype(int).tolist())

    assert len(predictions) == len(test_ids)

    submission = pd.DataFrame()
    submission['test_id'] = test_ids
    submission['is_duplicate'] = predictions
    submission.to_csv(filename + '.csv', index=False)
    

In [None]:
# Load the 'submission_lowloss.pt' checkpoint written during training and
# write the test predictions to 'submission_lowloss.csv'.
make_submission(model, 'submission_lowloss.pt', test_s1, test_s2, extra_features_test, test_ids, 'submission_lowloss')