## Code

### Please find the 3 correct and 3 incorrect predictions at the bottom of this notebook. Thanks!

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter

import pickle as pkl
import random
import pdb
import io
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import pdb
import matplotlib
import matplotlib.pyplot as plt
import time

BATCH_SIZE = 128

In [2]:
device = torch.device("cuda:0")
print(device)

cuda:0


### Helper functions for each step in the pipeline

In [4]:
FastText = []
with open('wiki-news-300d-1M.vec', "r") as ft:
    for i, line in enumerate(ft):
        if i == 0:
            continue
        FastText.append(line)


In [5]:

import numpy as np

def build_embedding(data):    
    word2id = {"<pad>": 0, "<unk>": 1}
    id2word = {0: "<pad>", 1: "<unk>"}
    
    embeddings = [np.zeros(300),np.random.normal(0, 0.01, 300)]
    
    for i, line in enumerate(data):
        parsed = line.split()
        word = parsed[0]
        array = np.array([float(x) for x in parsed[1:]])
    
        word2id[word] = i+2
        id2word[i+2] = word
        embeddings.append(array)
        
    
    return word2id, id2word, embeddings
 
token2id, id2token, word_vectors = build_embedding(FastText)
BATCH_SIZE = 64
PAD_IDX = 0
UNK_IDX = 1


In [7]:
def convert_labels_to_integers(data_label):
    for i in range(len(data_label)):
        if data_label[i] == "contradiction":
            data_label[i] = 0
        elif data_label[i] == "entailment":
            data_label[i] = 1
        elif data_label[i] == "neutral":
            data_label[i] = 2
    return data_label

In [8]:
def verify_order(sent1_data, sent2_data, data_label):
    i = 2
    print(sent1_data[i])
    print(sent2_data[i])
    print(data_label[i])

In [9]:
# Word tokenize each entry in a list of sentences
def tokenize(sentence_list):
    return [word_tokenize(str(sentence_list[i])) for i in range(len(sentence_list))]

In [10]:
# "one-hot encode": convert each token to id in vocabulary vector (token2id)
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data 

### Creating vocabulary & embedding matrix from FastText

In [11]:
# word_vectors, token2id, id2token = build_vocab()

In [12]:
_weights = np.array(word_vectors)
_WEIGHTS = _weights
_WEIGHTS.shape

(999996, 300)

### Function to pre-process data for TwoSentenceModel
#### Shuffle, word tokenize, one-hot index into vocabulary

In [13]:
def data_pipeline(sent1s, sent2s, labels, verify=True):
    labels = convert_labels_to_integers(labels)
    # seed = random.randint(1, 100)
    # print("Random seed for shuffling: {}".format(seed))
    # random.Random(seed).shuffle(sent1s)
    # random.Random(seed).shuffle(sent2s)
    # random.Random(seed).shuffle(labels)
    
    print("\nVerifying that the data and label match after shuffling")
    print(sent1s[2])
    print(sent2s[2])
    if verify:
        verify_order(sent1s, sent2s, labels)
        verify_order(sent1s, sent2s, labels)
          
    print("\nTokenizing sentence 1 list...")    
    sent1s_tokenized = tokenize(sent1s)
    print("done!")
    print("\nTokenizing sentence 2 list... ")  
    sent2s_tokenized = tokenize(sent2s)
    print("done!")
    
    print("\nOne-hot encoding words for sentence 1 list...")  
    sent1s_indices = token2index_dataset(sent1s_tokenized)
    print("done!")
    print("\nOne-hot encoding words for sentence 2 list...")  
    sent2s_indices = token2index_dataset(sent2s_tokenized)
    print("done!")
    
    return (sent1s_indices, sent2s_indices, labels)

### DataLoader

In [14]:
# MAX_SENTENCE_LENGTH = 30

import numpy as np
import torch
from torch.utils.data import Dataset

class TwoSentencesDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    """
    
    def __init__(self, sent1_data_list, sent2_data_list, target_list):
        """
        @param sent1_data_list: list of sentence1's (index matches sentence2's and target_list below)
        @param sent2_data_list: list of sentence2's
        @param target_list: list of correct labels

        """
        self.sent1_data_list = sent1_data_list
        self.sent2_data_list = sent2_data_list
        self.target_list = target_list
        assert (len(self.sent1_data_list) == len(self.target_list) and len(self.sent2_data_list) == len(self.target_list))

    def __len__(self):
        return len(self.sent1_data_list)
        
    def __getitem__(self, key):
        ###
        ### Returns [[sentence, 1, tokens], [sentence, 2, tokens]]
        ###
        """
        Triggered when you call dataset[i]
        """
        sent1_tokens_idx = self.sent1_data_list[key][:MAX_SENTENCE_LENGTH]
        sent2_tokens_idx = self.sent2_data_list[key][:MAX_SENTENCE_LENGTH]
        combined_tokens_idx = [sent1_tokens_idx, sent2_tokens_idx]
        label = self.target_list[key]
        return [combined_tokens_idx, len(sent1_tokens_idx), len(sent2_tokens_idx), label]

def twosentences_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    sent1_data_list = []
    sent2_data_list = []
    sent1_length_list = []
    sent2_length_list = []
    label_list = []
    padded_vec = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[3])
        sent1_length_list.append(datum[1])
        sent2_length_list.append(datum[2])
    # padding
    for datum in batch:
        padded_vec.append(np.pad(np.concatenate((np.array(datum[0][0]),np.array(datum[0][1]))), pad_width=((0,2*MAX_SENTENCE_LENGTH-datum[1]-datum[2])), 
                                mode="constant", constant_values=0))
        
    return [torch.from_numpy(np.array(padded_vec)), 
            torch.LongTensor(sent1_length_list), torch.LongTensor(sent2_length_list), torch.LongTensor(label_list)]

### Train dataset creation

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
file_path = './QQP/questions.csv'
dataframe = pd.read_csv(file_path)
sentence1_list = dataframe['question1'].tolist()
sentence2_list = dataframe['question2'].tolist()
target_label_list = dataframe['is_duplicate'].tolist()

print("initial dataset size", len(sentence1_list))

# First split: train+val and test (10% of the original data for test set)
sentence1_train, sent1_test, sentence2_train, sent2_test, target_train, test_label = train_test_split(
    sentence1_list, sentence2_list, target_label_list, test_size=0.1, random_state=42)

# Second split: train and validation (10% of the original data for validation set, which is 1/9th of the remaining 90%)
sent1_data, sent1_val, sent2_data, sent2_val, data_label, val_label = train_test_split(
    sentence1_train, sentence2_train, target_train, test_size=0.1/0.9, random_state=42)

# Output the sizes of each set
print(f"Training set size: {len(sent1_data)}")
print(f"Validation set size: {len(sent1_val)}")
print(f"Test set size: {len(sent1_test)}")
print(type(val_label[40]))
print(sent2_val[40])
print(sent1_val[40])

initial dataset size 404351
Training set size: 323480
Validation set size: 40435
Test set size: 40436
<class 'int'>
What's the best customer service experience you've ever had?
What was your worst customer service experience you've ever had?


In [16]:
file_path = 'QQP_groundtruth_test.txt'

# Write the list to the file
with open(file_path, 'w') as file:
    for item in test_label:
        file.write(f"{item}\n")
print(f"List written to {file_path}")

List written to QQP_groundtruth_test.txt


In [17]:
sent1_train_indices, sent2_train_indices, train_label = data_pipeline(sent1_data, sent2_data, data_label)
train_dataset = TwoSentencesDataset(sent1_train_indices, sent2_train_indices, train_label)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE, 
                                           collate_fn=twosentences_collate_func,
                                           #shuffle=True
                                          )
print("Finished creating train_loader.")


Verifying that the data and label match after shuffling
What accomplishments did Hillary Clinton achieve during her time as Secretary of State?
What are Hillary Clinton's most recognized accomplishments while Secretary of State?
What accomplishments did Hillary Clinton achieve during her time as Secretary of State?
What are Hillary Clinton's most recognized accomplishments while Secretary of State?
1
What accomplishments did Hillary Clinton achieve during her time as Secretary of State?
What are Hillary Clinton's most recognized accomplishments while Secretary of State?
1

Tokenizing sentence 1 list...
done!

Tokenizing sentence 2 list... 
done!

One-hot encoding words for sentence 1 list...
done!

One-hot encoding words for sentence 2 list...
done!
Finished creating train_loader.


In [18]:
MAX_SENTENCE_LENGTH = max(max([len(sent) for sent in sent1_train_indices]), max([len(sent) for sent in sent2_train_indices]))
MAX_SENTENCE_LENGTH

272

### Val dataset creation

In [19]:
sent1_val_indices, sent2_val_indices, val_label = data_pipeline(sent1_val, sent2_val, val_label)
val_dataset = TwoSentencesDataset(sent1_val_indices, sent2_val_indices, val_label)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE, 
                                           collate_fn=twosentences_collate_func,
                                           #shuffle=True
                                          )


Verifying that the data and label match after shuffling
Are most people visual learners?
Why are some intelligent people not the fastest learners?
Are most people visual learners?
Why are some intelligent people not the fastest learners?
0
Are most people visual learners?
Why are some intelligent people not the fastest learners?
0

Tokenizing sentence 1 list...
done!

Tokenizing sentence 2 list... 
done!

One-hot encoding words for sentence 1 list...
done!

One-hot encoding words for sentence 2 list...
done!


In [20]:
sent1_test_indices, sent2_test_indices, test_label = data_pipeline(sent1_test, sent2_test, test_label)
test_dataset = TwoSentencesDataset(sent1_test_indices, sent2_test_indices, test_label)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE, 
                                           collate_fn=twosentences_collate_func,
                                           #shuffle=True
                                          )


Verifying that the data and label match after shuffling
What hotel in Jabalpur would be safe for unmarried couples, without the harassment of police, hotel staff, and moral police?
What hotel in Allahabad would be safe for unmarried couples, without the harassment of police, hotel staff, and moral police?
What hotel in Jabalpur would be safe for unmarried couples, without the harassment of police, hotel staff, and moral police?
What hotel in Allahabad would be safe for unmarried couples, without the harassment of police, hotel staff, and moral police?
0
What hotel in Jabalpur would be safe for unmarried couples, without the harassment of police, hotel staff, and moral police?
What hotel in Allahabad would be safe for unmarried couples, without the harassment of police, hotel staff, and moral police?
0

Tokenizing sentence 1 list...
done!

Tokenizing sentence 2 list... 
done!

One-hot encoding words for sentence 1 list...
done!

One-hot encoding words for sentence 2 list...
done!


### The Model

In [22]:
def save_embeddings_to_text_file(embeddings, output_file_path):
    with open(output_file_path, 'a') as file:
        for embedding in embeddings:
            for i,emb in enumerate(embedding):
                if i != len(embedding) - 1:
                    file.write(f'{emb} ')
                else:
                    file.write(f'{emb}\n')

In [24]:
class TwoSentenceModel(nn.Module):
    
    def __init__(self, hidden_size, num_layers, num_classes, emb_size = 300):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super(TwoSentenceModel, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        weight = torch.FloatTensor(_WEIGHTS)
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.rnn = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.linear1 = nn.Linear(2*hidden_size, 100)
        self.linear2 = nn.Linear(100, num_classes)
        self.dropout = nn.Dropout(0.2)

    def init_hidden(self, batch_size):
        return torch.randn(2, batch_size, self.hidden_size).to(device)


    def forward(self, x, sent1_lengths, sent2_lengths):
        # reset hidden state
        batch_size = x.size()[0]
        
        ordered_slengths = sent1_lengths + sent2_lengths
        
    
        reverse_sorted_lengths, reverse_sorted_indices = torch.sort(ordered_slengths, descending=True)
        # print((reverse_sorted_lengths).device)
        reverse_sorted_lengths = reverse_sorted_lengths.to(x.device)
        reverse_sorted_lengths = reverse_sorted_lengths.cpu().numpy()
        ordered_sents = x
        reverse_sorted_data = ordered_sents[reverse_sorted_indices].to(device)
        # get embedding
        embed = self.embedding(reverse_sorted_data)
        
        

        # pack padded sequence
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, reverse_sorted_lengths, batch_first=True)
        
        # print(embed.data.size())
        # print(embed.batch_sizes)
            
        self.hidden = self.init_hidden(batch_size)
        # fprop though RNN
        rnn_out, self.hidden = self.rnn(embed, self.hidden)
        
        ### MATCHING BACK
        
        change_back_indices = reverse_sorted_indices.argsort()
        # print(change_back_indices)
        self.hidden = self.hidden[:, change_back_indices]
              
        ### GRU stuff
        hidden_sents = torch.cat([self.hidden[0, :, :], self.hidden[1, :, :]], dim=1)
        save_embeddings_to_text_file(hidden_sents, './RNN_NDD/rnn_model_7_test_embeddings.txt')
        #need to change the file name to create embeddings after every epoch
        linear1 = self.linear1(hidden_sents)
        
        
        linear1 = F.relu(linear1.contiguous().view(-1, linear1.size(-1))).view(linear1.shape)   
        linear1 = self.dropout(linear1)
        logits = self.linear2(linear1)
        return logits

In [25]:
def gen_embed(loader, model):
    """
    Helper function that tests the model's performance on a dataset
    """
    model.eval()
    for (data, sent1_lengths, sent2_lengths, labels) in loader:
        data_batch, sent1_length_batch, sent2_length_batch, label_batch = data.to(device), sent1_lengths.to(device), sent2_lengths.to(device), labels.to(device)
        model(data_batch, sent1_length_batch, sent2_length_batch)
        



In [28]:
start = time.time()
model = TwoSentenceModel(emb_size = 300, hidden_size=300, num_layers=1, num_classes=2).to(device)
model.load_state_dict(torch.load("./RNN_NDD/rnn_model_7.pth"))
#change the model path to create embeddings after every epoch
gen_embed(test_loader, model)
end = time.time()
total_time = end - start
print("total time taken is ",total_time) 

total time taken is  447.2643554210663


Delete test embeddings file before creating test predictions. It appends test embeddings again.

In [29]:
model_test = TwoSentenceModel(emb_size = 300, hidden_size=300, num_layers=1, num_classes=2).to(device)
model.load_state_dict(torch.load("./RNN_NDD/BestModel/rnn_model.pth"))


<All keys matched successfully>

In [30]:
_, labels = test_model(test_loader, model_test)

In [31]:
print(len(labels))

40436


In [32]:
nested_list = labels
flat_list = [item for sublist in nested_list for item in sublist]
print(flat_list)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [33]:
file_path = './RNN_NDD/qqp_test_preds_corrected.txt'

# Write the list to the file
with open(file_path, 'w') as file:
    for item in flat_list:
        file.write(f"{item}\n")

print(f"List written to {file_path}")

List written to qqp_test_preds_corrected.txt
