In [0]:
import pandas as pd
from scipy.sparse import hstack
import torch
from sklearn import *
import sklearn
import re
from torchtext import data
import numpy as np
import torch.nn as nn

In [9]:
with open('q1_train_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q1_train = [x.strip() for x in content] 

with open('q2_train_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q2_train = [x.strip() for x in content] 

with open('q1_test_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q1_test = [x.strip() for x in content] 

with open('q2_test_cleaned.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
q2_test = [x.strip() for x in content] 

with open('test_labels.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
labels_test = [x.strip() for x in content] 

with open('train_labels.txt') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
labels_train = [x.strip() for x in content] 

len(q1_train), len(q2_train), len(q1_test), len(q2_test), len(labels_train), len(labels_test)

(291088, 291088, 32344, 32344, 291088, 32344)

In [10]:
train_df = pd.DataFrame(np.stack((q1_train, q2_train, labels_train), axis=-1), columns =['question1', 'question2', 'is_duplicate'])
train_df.head(10)

Unnamed: 0,question1,question2,is_duplicate
0,how do i get home tutor,how can i trust a home tutor,0
1,what be the difference between have be have be...,when should i use have be have be and have be,0
2,if my atm card be block for online transaction...,my credit card be use for fraud transaction i ...,0
3,how do i add usb 3.0 port in a laptop without ...,can i use a usb 3.0 device in a usb 2.0 port,0
4,what be the best advantage of use Quora,what be the benefit to Quora,1
5,where can i download the economist pdf,where can i download pdf of Gillian Glynn go girl,0
6,why do some people get everything,why be that some people get what they want ver...,1
7,be an all out nuclear war survivable,would all out nuclear war destroy all life on ...,1
8,what be the advantage of ashless transaction,what could happen to ashless transaction after...,0
9,what be the main problem of India,what be the main problem face by India,1


In [11]:
test_df = pd.DataFrame(np.stack((q1_test, q2_test, labels_test), axis=-1), columns =['question1', 'question2', 'is_duplicate'])
test_df.head(10)

Unnamed: 0,question1,question2,is_duplicate
0,what be it like to travel in afghanistan,what be it like for a foreigner to travel in a...,0
1,do rapper really make a much money a they clai...,can i make money from lyric video on youtube,0
2,do mary give a natural vaginal birth to Jesus ...,how be the labour of mary and her birth give t...,0
3,how can you determine the lewis structure for o2,how be the lewis structure for propane determine,0
4,where do plex get poster art thumbnail for tv ...,who design the ux and ui of plex medium center,0
5,be it possible to know if somebody else tumble...,can anyone see a list of my secondary blo i wr...,0
6,if you could have only money fame or love in y...,how do one define success be it the way one Ci...,0
7,how do you say also in Japanese be there more ...,how do you say ocean in Japanese be there more...,0
8,how good bad will it be for India if trump win...,if Donald trump win the presidential election ...,1
9,why can not i see view on my instagram video,how can i see who view my instagram,1


In [0]:
train_df.to_csv('train_df.csv', index=False)
test_df.to_csv('test_df.csv', index=False)

In [0]:
#define fields
question1 = data.Field(tokenize='spacy')
question2 = data.Field(tokenize='spacy')
label = data.LabelField(dtype=torch.float)

fields = [('question1', question1), ('question2', question2), ('is_duplicate', label)]

train_data, test_data = data.TabularDataset.splits(path ='.', train= 'train_df.csv', test= 'test_df.csv', format = 'csv', fields = fields, skip_header = True)


In [16]:
#build vocabulary for training dataset using dictionary from torch text 
MAX_VOCAB_SIZE = 60000
question1.build_vocab(train_data.question1, train_data.question2, max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)
question2.build_vocab(train_data.question1, train_data.question2, max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)
label.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)

.vector_cache/glove.6B.zip: 862MB [06:26, 2.23MB/s]                           
 99%|█████████▉| 397926/400000 [00:14<00:00, 27843.88it/s]

In [17]:
#length of vocabulary for training dataset (words + 1(unk) + 1(pad))
len(question1.vocab), len(question2.vocab)

(58560, 58560)

In [18]:
#10 first words of the vocabuary for training dataset
question1.vocab.itos[:10], question2.vocab.itos[:10]

(['<unk>', '<pad>', 'be', 'the', 'what', 'do', 'a', 'how', 'i', 'to'],
 ['<unk>', '<pad>', 'be', 'the', 'what', 'do', 'a', 'how', 'i', 'to'])

In [19]:
#the most common words of the vocabulary for training dataset 
question1.vocab.freqs.most_common(10), question2.vocab.freqs.most_common(10)

([('be', 367533),
  ('the', 271817),
  ('what', 234474),
  ('do', 174303),
  ('a', 168740),
  ('how', 158566),
  ('i', 156106),
  ('to', 148296),
  ('in', 141992),
  ('of', 115433)],
 [('be', 367533),
  ('the', 271817),
  ('what', 234474),
  ('do', 174303),
  ('a', 168740),
  ('how', 158566),
  ('i', 156106),
  ('to', 148296),
  ('in', 141992),
  ('of', 115433)])

In [0]:
#build vocabulary for test dataset using dictionary from torch text 
MAX_VOCAB_SIZE = 60000
question1.build_vocab(test_data.question1, test_data.question2, max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)
question2.build_vocab(test_data.question1, test_data.question2, max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)
label.build_vocab(test_data, max_size = MAX_VOCAB_SIZE)

In [21]:
#length of vocabulary for test dataset (words + 1(unk) + 1(pad))
len(question2.vocab), len(question1.vocab)

(21130, 21130)

In [22]:
#10 first words of the vocabuary for test dataset
question2.vocab.itos[:10], question1.vocab.itos[:10]

(['<unk>', '<pad>', 'be', 'the', 'what', 'do', 'a', 'how', 'i', 'to'],
 ['<unk>', '<pad>', 'be', 'the', 'what', 'do', 'a', 'how', 'i', 'to'])

In [23]:
#the most common words of the vocabulary for test dataset 
question2.vocab.freqs.most_common(10), question1.vocab.freqs.most_common(10)

([('be', 40506),
  ('the', 29996),
  ('what', 25585),
  ('do', 19402),
  ('a', 18949),
  ('how', 17900),
  ('i', 17331),
  ('to', 16598),
  ('in', 15700),
  ('of', 12726)],
 [('be', 40506),
  ('the', 29996),
  ('what', 25585),
  ('do', 19402),
  ('a', 18949),
  ('how', 17900),
  ('i', 17331),
  ('to', 16598),
  ('in', 15700),
  ('of', 12726)])

In [0]:
#create batches from train and test data
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 64

train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    sort = False, #don't sort test/validation data
    batch_size = BATCH_SIZE,
    device=device,
    shuffle=False)

In [25]:
#check it works
for batch in train_iterator:
    print(batch)
    break
batch.question2

 99%|█████████▉| 397926/400000 [00:29<00:00, 27843.88it/s]


[torchtext.data.batch.Batch of size 64]
	[.question1]:[torch.cuda.LongTensor of size 25x64 (GPU 0)]
	[.question2]:[torch.cuda.LongTensor of size 27x64 (GPU 0)]
	[.is_duplicate]:[torch.cuda.FloatTensor of size 64 (GPU 0)]


tensor([[   7,   40,   18,  ...,    4,   16,    7],
        [  13,   30,  577,  ...,    2,    5, 1966],
        [   8,    8,  210,  ...,   10,   91,   35],
        ...,
        [   1,    1,    8,  ...,    1,  158,    1],
        [   1,    1,    5,  ...,    1, 1437,    1],
        [   1,    1,  344,  ...,    1,    1,    1]], device='cuda:0')

In [26]:
#check it works
for batch in test_iterator:
    print(batch)
    break
batch.question1


[torchtext.data.batch.Batch of size 64]
	[.question1]:[torch.cuda.LongTensor of size 27x64 (GPU 0)]
	[.question2]:[torch.cuda.LongTensor of size 31x64 (GPU 0)]
	[.is_duplicate]:[torch.cuda.FloatTensor of size 64 (GPU 0)]


tensor([[   4,    5,    5,  ...,   39,   16,    4],
        [   2, 3701, 3488,  ...,    5,    5,    2],
        [  17,  142,  155,  ..., 7495,   41,    3],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')

In [27]:
batch.question1.shape, batch.question2.shape

(torch.Size([27, 64]), torch.Size([31, 64]))

In [0]:
#model
class RNN(nn.Module):
    def __init__(self, vocab_dim, text, hidden_dim=180, output_dim=2, dropout=0.2):
        super().__init__()
        embedding_dim = 100
        vocab = text.vocab
        self.embedding = nn.Embedding(vocab_dim, embedding_dim)
        self.embedding.weight.data.copy_(vocab.vectors)
        self.rnn = torch.nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, dropout=dropout, bidirectional=True)
        self.hidden_dim = hidden_dim        
        self.input_dim = 5 * 2 * self.hidden_dim
        self.classifier = nn.Sequential(
            nn.Linear(self.input_dim, int(self.input_dim/2)),
            nn.Linear(int(self.input_dim/2), output_dim))        
        
    def forward(self, question1, question2):

        #question = [sent len, batch size]        
        embedded1 = self.embedding(question1) 
        embedded2 = self.embedding(question2)
    
        #embedded = [sent len, batch size, emb dim]        
        output1, hidden1 = self.rnn(embedded1)   
        output2, hidden2 = self.rnn(embedded2)
        
        #output = [sent len, batch size, hid dim]
        features = torch.cat((output1[-1:,:,:],
                              torch.abs(output1[-1:,:,:] - output2[-1:,:,:]),
                              output2[-1:,:,:],
                              output1[-1:,:,:]*output2[-1:,:,:],
                              (output1[-1:,:,:]+output2[-1:,:,:])/2), 2)
        
        #output = [batch size, 5 * 2 * hidden dim]
        y = self.classifier(features)    
        return y.squeeze(0)

In [0]:
#define gpu
device = torch.device('cuda')

In [155]:
#model to gpu
model = RNN(vocab_dim=len(question1.vocab), text=question1)
model.to(device)
#model.load_state_dict(torch.load('model.pth'))

RNN(
  (embedding): Embedding(21130, 100)
  (rnn): LSTM(100, 180, num_layers=2, dropout=0.2, bidirectional=True)
  (classifier): Sequential(
    (0): Linear(in_features=1800, out_features=900, bias=True)
    (1): Linear(in_features=900, out_features=2, bias=True)
  )
)

In [156]:
#check dimensions of output
output = model(batch.question1.to(device), batch.question2.to(device))
output.shape

torch.Size([64, 2])

In [157]:
from sklearn import metrics
import warnings

warnings.filterwarnings("ignore")

epochs = 10
#criterion = torch.nn.BCEWithLogitsLoss()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1*10**(-3))
softmax = torch.nn.Softmax()

for epoch in range(1, epochs+1):
  loss_values = []
  auc_values = []
  for batch in train_iterator:
    output = model(batch.question1.to(device), batch.question2.to(device))
    loss = criterion(output, batch.is_duplicate.long().to(device))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step() 
    loss_values.append(loss.item()) 
  print(epoch, np.mean(loss_values))
  for batch in test_iterator: 
    output = model(batch.question1.to(device), batch.question2.to(device))
    #output = torch.sigmoid(output)
    #output = output > 0.5
    output = torch.argmax(softmax(output), dim=1)
    #accuracy = (output == batch.is_duplicate.to(device)).float().sum()
    auc = metrics.roc_auc_score(output.cpu().detach().numpy(), batch.is_duplicate.cpu().detach().numpy())
    auc_values.append(auc) 
  #print(epoch, torch.mean(torch.stack(auc_values)).item())
  print(epoch, np.mean(auc_values))

1 0.5000383206961522
1 0.7705182225967359
2 0.40913461094689807
2 0.7963615116132099
3 0.35817391190667236
3 0.8025286910410337
4 0.3147891534897332
4 0.8062315251684032
5 0.2752241421106832
5 0.8057576422071361
6 0.2402168107991901
6 0.8034861022921398
7 0.20881836901826895
7 0.803354312562561
8 0.18029428348125376
8 0.7988636112566776
9 0.1575569680628442
9 0.8052161598931018
10 0.1382584296491191
10 0.7995738080693123


In [0]:
#torch.save(model.state_dict(),"model.pth")