# NLP Assignment1: Bag-of Words based Natural Language Inference
- Rui Jiang
- September, 2019
- Intro: train a Bag-Of-Words encoder to tackle the the Stanford Natural Language Inference (SNLI) and Multi-Genre Natural Lan- guage Inference (MNLI) task.

In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl

## 1. load dataset

In [51]:
!head -1 'data/snli_train.tsv'

sentence1	sentence2	label


In [52]:
!head -1 'data/mnli_train.tsv'

sentence1	sentence2	label	genre


In [53]:
snli_train = pd.read_table('data/snli_train.tsv')
snli_val = pd.read_table('data/snli_val.tsv')
mnli_train = pd.read_table('data/mnli_train.tsv')
mnli_val = pd.read_table('data/mnli_val.tsv')
print("SNLI Training Examples: "+str(len(snli_train)))
print("SNLI Validation Examples: "+str(len(snli_val)))
print("MNLI Training Examples: "+str(len(mnli_train)))
print("MNLI Validation Examples: "+str(len(mnli_val)))

SNLI Training Examples: 100000
SNLI Validation Examples: 1000
MNLI Training Examples: 20000
MNLI Validation Examples: 5000


In [54]:
# from sklearn.model_selection import train_test_split
# snli_train, snli_test = train_test_split(snli_train, test_size=0.2) #80000,20000
# mnli_train, mnli_test = train_test_split(mnli_train, test_size=0.2) #16000,4000
# # print("SNLI Testing Examples: "+str(len(snli_test)))
# # print("MNLI Testing Examples: "+str(len(mnli_test)))

In [55]:
# Random sample from train dataset
import random
print(snli_train.iloc[random.randint(0, len(snli_train) - 1)])

sentence1    Several people are behind a fence , watching a...
sentence2                             The people are outside .
label                                               entailment
Name: 34798, dtype: object


## 2. tokenizing dataset

- 1. use [spacy.io](https://spacy.io/)

In [56]:
import spacy
import string

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# lowercase and remove punctuation
def spacy_tokenize(sent):
  tokens = tokenizer(sent)
  return [token.text.lower() for token in tokens if (token.text not in punctuations)]

def encode_label(x):
    if x == 'contradiction':
        return 0
    if x == 'entailment':
        return 1
    if x == 'neutral':
        return 2
    
def tokenize_dataset(dataset,istrain=False):
    """
    @param dataset: dataframe. 
    @param istrain: whether it's training dataset or not
    """
    dataset['sentence1'] = dataset.apply(lambda row:spacy_tokenize(row['sentence1']),axis=1)
    dataset['sentence2'] = dataset.apply(lambda row:spacy_tokenize(row['sentence2']),axis=1)
    dataset['label'] = dataset.apply(lambda row:encode_label(row['label']),axis=1)
    #get all_tokens from training set
    if istrain:
        l = (dataset['sentence1'].tolist()+dataset['sentence2'].tolist())
        all_tokens = [e for l1 in l for e in l1]
        return dataset,all_tokens
    else:
        return dataset

In [57]:
#tokenize train/val/test datasets
#val set tokens
print ("Tokenizing val data")
snli_val1 = tokenize_dataset(snli_val,istrain=False)
snli_val1.to_pickle("data/snli_val.p")

# train set tokens
print ("Tokenizing train data")
snli_train1, all_train_tokens = tokenize_dataset(snli_train,istrain=True)
snli_train1.to_pickle("data/snli_train.p")
pkl.dump(all_train_tokens, open("data/all_train_tokens.p", "wb"))

Tokenizing val data
Tokenizing train data


In [38]:
l = (snli_train['sentence1'].tolist()+snli_train['sentence2'].tolist())
all_train_tokens = [e for l1 in l for e in l1]
len(all_train_tokens)
pkl.dump(all_train_tokens, open("data/all_train_tokens.p", "wb"))

In [58]:
#for future use, load from pickle
snli_val = pkl.load(open("data/snli_val.p", "rb"))
snli_train = pkl.load(open("data/snli_train.p", "rb"))
all_train_tokens = pkl.load(open("data/all_train_tokens.p", "rb"))

In [36]:
snli_train.shape

(100000, 3)

In [59]:
# double checking
print ("SNLI Train dataset size is {}".format(len(snli_train)))
print ("SNLI Val dataset size is {}".format(len(snli_val)))
print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))
print ("Total number of *unique* tokens in train dataset is {}".format(len(set(all_train_tokens))))

SNLI Train dataset size is 100000
SNLI Val dataset size is 1000
Total number of tokens in train dataset is 2037507
Total number of *unique* tokens in train dataset is 19642


In [61]:
all_train_tokens[:5]

['a', 'young', 'girl', 'in', 'a']

In [44]:
snli_train.head()

Unnamed: 0,sentence1,sentence2,label
0,"[a, young, girl, in, a, pink, shirt, sitting, ...","[a, young, girl, watching, the, sunset, over, ...",2
1,"[a, woman, is, smiling, while, the, man, next,...","[two, people, are, next, to, each, other]",1
2,"[across, the, river, you, can, see, a, large, ...","[the, large, building, is, full, of, apartment...",2
3,"[a, man, in, white, shorts, and, a, black, shi...","[a, man, is, riding, a, jetski, on, the, ocean]",0
4,"[four, black, dogs, run, together, on, bright,...","[four, dogs, are, preparing, to, be, launched,...",0


## 3. Build Vocabulary
Now, we are going to create the vocabulary of most common 20,000 tokens in the training set. Remember that we will add special tokens `<unk>`(unknown) and `<pad>` to the vocabulary.

In [319]:
from collections import Counter

max_vocab_size = 7000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens,max_vocab_size):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id, id2token = build_vocab(all_train_tokens,max_vocab_size)

In [320]:
# Lets check the dictionary by loading random token from it
random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]
print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 2561 ; token proved
Token proved; token id 2561


In [321]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

snli_train_sent1_indices = token2index_dataset(snli_train['sentence1'])
snli_train_sent2_indices = token2index_dataset(snli_train['sentence2'])
snli_val_sent1_indices = token2index_dataset(snli_val['sentence1'])
snli_val_sent2_indices = token2index_dataset(snli_val['sentence2'])


# double checking
print ("Train dataset size is {}".format(len(snli_train_sent1_indices)))
print ("Val dataset size is {}".format(len(snli_val_sent2_indices)))

Train dataset size is 100000
Val dataset size is 1000


In [322]:
# visualize a random tokenized training example
rand_idx = random.randint(0, len(snli_train) - 1)
print(snli_train['sentence1'][rand_idx])
print(snli_train_sent1_indices[rand_idx])
print(snli_train['sentence2'][rand_idx])
print(snli_train_sent2_indices[rand_idx])

['woman', 'in', 'long', 'blue', 'jacket', 'talks', 'on', 'cellphone', 'in', 'front', 'of', 'large', 'window']
[538, 7, 151, 1135, 1, 3622, 17, 1, 7, 770, 3, 272, 1935]
['there', 'is', 'a', 'female', 'wearing', 'blue', 'jacket', 'busy', 'on', 'phone', 'standing', 'near', 'window']
[29, 11, 6, 1835, 2341, 1135, 1, 2502, 17, 1565, 1245, 530, 1935]


## 4. PyTorch DataLoader
- take in (x,y) give out minibatches
- set parameters: mini-batch size, max sentence length(number of words),如果不够用`<pad>`填充

In [323]:
MAX_SENTENCE_LENGTH = max([len(word) for word in snli_train_sent1_indices+snli_train_sent2_indices])
MAX_SENTENCE_LENGTH

78

In [324]:
# MAX_SENTENCE_LENGTH = 200
class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, sent1_list, sent2_list, label_list):

        self.sent1_list = sent1_list
        self.sent2_list = sent2_list
        self.label_list = label_list
        assert (len(self.sent1_list) == len(self.label_list))
        assert (len(self.sent2_list) == len(self.label_list))

    def __len__(self):
        return len(self.label_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx1 = self.sent1_list[key][:MAX_SENTENCE_LENGTH]
        token_idx2 = self.sent2_list[key][:MAX_SENTENCE_LENGTH]
        label = self.label_list[key]
        return [token_idx1, token_idx2, len(token_idx1), len(token_idx2), label]

In [325]:
train_dataset = SNLIDataset(snli_train_sent1_indices, snli_train_sent2_indices,snli_train['label'].tolist())

In [326]:
for i in range(5):
    print(train_dataset[i][0])
    print(train_dataset[i][1])
    print(train_dataset[i][2])
    print(train_dataset[i][3])
    print(train_dataset[i][4])

[6, 374, 930, 7, 6, 4628, 3671, 1457, 17, 6, 1, 5098, 6, 800, 3, 271]
[6, 374, 930, 1179, 2, 3731, 104, 2, 271]
16
9
2
[6, 538, 11, 5919, 163, 2, 152, 288, 4, 79, 11, 2415, 17, 6, 1135, 3919, 20, 6, 2205, 17, 10]
[96, 56, 16, 288, 4, 197, 75]
21
7
1
[432, 2, 610, 13, 45, 105, 6, 272, 390]
[2, 272, 390, 11, 424, 3, 2519, 5, 1]
9
9
2
[6, 152, 7, 241, 1, 5, 6, 436, 3671, 11, 1, 17, 2, 2188]
[6, 152, 11, 2983, 6, 1, 17, 2, 2188]
14
9
0
[316, 436, 2493, 331, 575, 17, 2112, 912, 1538]
[316, 2493, 16, 3852, 4, 23, 3779, 97, 1337]
9
9
0


We need a **collate function** so that when we have it in batches, all the sentences have the same length. We decide to keep a `MAX_SENTENCE_LENGTH` and if the sentence has fewer tokens, append the rest with zero and if the sentence has more tokens, chop it all at `MAX_SENTENCE_LENGTH`

In [327]:
def SNLI_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length: tell dataloader how to pad our sentences
    """
    sent1_list = []
    sent2_list = []
    label_list = []
    length1_list = []
    length2_list = []

    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[4])
        length1_list.append(datum[2])
        length2_list.append(datum[3])
    # padding
    for datum in batch:
        padded_vec1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[2])), 
                                mode="constant", constant_values=0)
        sent1_list.append(padded_vec1) # list of np.array
        padded_vec2 = np.pad(np.array(datum[1]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[3])), 
                                mode="constant", constant_values=0)
        sent2_list.append(padded_vec2)
        
    return [torch.from_numpy(np.array(sent1_list)),torch.from_numpy(np.array(sent2_list)),torch.LongTensor(length1_list), torch.LongTensor(length2_list),torch.LongTensor(label_list)]

In [328]:
BATCH_SIZE = 32

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=SNLI_collate_func,
                                           shuffle=True)
val_dataset = SNLIDataset(snli_val_sent1_indices, snli_val_sent2_indices, snli_val['label'].tolist())
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=SNLI_collate_func,
                                           shuffle=True)

In [329]:
### checking the first batch of data loader
for i, (sent1,sent2,length1,length2, labels) in enumerate(train_loader):
#     print(sent1)
    print(sent1.shape)
#     print(sent2)
    print(sent2.shape)
    print((torch.cat((sent1, sent2), 1)).shape)
    print(length1)
    print(length2)
    print(labels)
    break

torch.Size([32, 78])
torch.Size([32, 78])
torch.Size([32, 156])
tensor([11, 23, 15,  5, 19, 14, 22, 17, 10, 16, 15, 14, 14,  9, 10, 14, 11, 15,
         7,  8,  9,  7, 14, 17, 17, 12,  7, 11,  8, 16, 14, 10])
tensor([10,  8,  5,  2, 14, 16,  9, 13,  4, 16,  5,  8, 13,  8,  5,  7,  7,  7,
         7,  7,  8,  7, 14,  4, 11,  7, 11,  7,  6,  7, 13,  5])
tensor([2, 2, 0, 1, 1, 2, 0, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 2, 2, 0, 1, 2, 0, 2,
        0, 0, 2, 1, 2, 1, 0, 1])


## 5. Build Neural Network
- A fully connected neural network with two hidden layers

__my note:__  
- torch.nn.Embedding: which takes two arguments: the vocabulary size (|V|), and the dimensionality of the embeddings (D).
- PyTorch embeddings are stored as matrix: $|V|\times|D|$(emb_dim)
- after embedding, give out: max_sentence_length(78) $\times$ emb_dim
- sum the 78 words in every dimension, gives out 100(emb_dim) dimension
- divided by the actual number of words in a sentence(i.e. disregard those padding words)

In [354]:
class NNModel(nn.Module):
    """
    NN classification model
    """
    def __init__(self, vocab_size, emb_dim, hidden_num1,hidden_num2):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(NNModel, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.nn_layer = nn.Sequential(
                        nn.Linear(2*emb_dim,hidden_num1),
                        nn.ReLU(),
                        nn.Linear(hidden_num1,hidden_num2),
                        nn.ReLU(),
                        nn.Linear(hidden_num2,3))
    
    def forward(self, sent1, sent2, length1, length2):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out1 = self.embed(sent1)
        out2 = self.embed(sent2)
        out1 = torch.sum(out1, dim=1)
        out1 /= length1.view(length1.size()[0],1).expand_as(out1).float()
        out2 = torch.sum(out2, dim=1)
        out2 /= length1.view(length2.size()[0],1).expand_as(out2).float()
        
        # return logits
        out = self.nn_layer(torch.cat((out1, out2), 1))
#         out = self.nn_layer(torch.mul(out1, out2))
        return out

In [355]:
emb_dim = 30
model = NNModel(len(id2token),emb_dim,30,30)

In [356]:
model.embed.weight.shape

torch.Size([7002, 30])

In [357]:
# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  

learning_rate = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [358]:
for x in model.parameters():
    print(x.shape)

torch.Size([7002, 30])
torch.Size([30, 60])
torch.Size([30])
torch.Size([30, 30])
torch.Size([30])
torch.Size([3, 30])
torch.Size([3])


In [359]:
num_epochs = 5 # number epoch to train

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for sent1,sent2,length1,length2, labels in loader:
        outputs = F.softmax(model(sent1,sent2,length1,length2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

for epoch in range(num_epochs):
    for i, (sent1,sent2,length1,length2, labels) in enumerate(train_loader):
        model.train()
        
        optimizer.zero_grad()
        outputs = model(sent1, sent2, length1, length2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step() #step() method: updates the parameters,can be called once the gradients are computed using e.g. backward().
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Loss: {}'.format( 
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc, loss))

Epoch: [1/5], Step: [101/3125], Validation Acc: 42.8, Loss: 1.0661572217941284
Epoch: [1/5], Step: [201/3125], Validation Acc: 44.6, Loss: 1.0641974210739136
Epoch: [1/5], Step: [301/3125], Validation Acc: 47.4, Loss: 0.9687519669532776
Epoch: [1/5], Step: [401/3125], Validation Acc: 50.3, Loss: 1.0060420036315918
Epoch: [1/5], Step: [501/3125], Validation Acc: 50.2, Loss: 0.7982286810874939
Epoch: [1/5], Step: [601/3125], Validation Acc: 52.9, Loss: 0.8705828785896301
Epoch: [1/5], Step: [701/3125], Validation Acc: 54.8, Loss: 1.0392117500305176
Epoch: [1/5], Step: [801/3125], Validation Acc: 55.0, Loss: 1.1201326847076416
Epoch: [1/5], Step: [901/3125], Validation Acc: 55.9, Loss: 0.8922139406204224
Epoch: [1/5], Step: [1001/3125], Validation Acc: 55.2, Loss: 0.9035502672195435
Epoch: [1/5], Step: [1101/3125], Validation Acc: 56.3, Loss: 0.8844563364982605
Epoch: [1/5], Step: [1201/3125], Validation Acc: 55.7, Loss: 0.8751363754272461
Epoch: [1/5], Step: [1301/3125], Validation Acc: 

KeyboardInterrupt: 

## Hyperparameters tuning
- Varying the size of the vocabulary and embedding dimensions
- Experiment with different ways of interacting the two encoded sentences
(concatenation, element-wise multiplication, etc) (This is required)

In [None]:
max_vocab_size_list = [8000,10000,12000]
token2id, id2token = build_vocab(all_train_tokens,max_vocab_size)
snli_train_sent1_indices = token2index_dataset(snli_train['sentence1'])
snli_train_sent2_indices = token2index_dataset(snli_train['sentence2'])
snli_val_sent1_indices = token2index_dataset(snli_val['sentence1'])
snli_val_sent2_indices = token2index_dataset(snli_val['sentence2'])

# MNLI

### 1. tokenize

In [242]:
!head -1 'data/mnli_train.tsv'

sentence1	sentence2	label	genre


In [240]:
!wc -l "data/mnli_train.tsv"

   20001 data/mnli_train.tsv


In [244]:
mnli_train.genre.unique()

array(['telephone', 'fiction', 'slate', 'government', 'travel'],
      dtype=object)

In [277]:
mnli_train.genre.value_counts()

telephone     4270
slate         4026
travel        3985
government    3883
fiction       3836
Name: genre, dtype: int64

In [278]:
mnli_val.genre.value_counts()

government    1016
telephone     1005
slate         1002
fiction        995
travel         982
Name: genre, dtype: int64

In [247]:
#tokenize train/val datasets
#val set tokens
print ("Tokenizing val data")
mnli_val1 = tokenize_dataset(mnli_val,istrain=False)
mnli_val1.to_pickle("data/mnli_val.p")

# train set tokens
print ("Tokenizing train data")
mnli_train1, all_train_tokens = tokenize_dataset(mnli_train,istrain=True)
mnli_train1.to_pickle("data/mnli_train.p")
pkl.dump(all_train_tokens, open("data/all_train_tokens_mnli.p", "wb"))

Tokenizing val data
Tokenizing train data


In [248]:
mnli_val = pkl.load(open("data/mnli_val.p", "rb"))
mnli_train = pkl.load(open("data/mnli_train.p", "rb"))
mnli_train.head()

Unnamed: 0,sentence1,sentence2,label,genre
0,"[and, now, that, was, in, fifty, one, that, 's...","[it, was, already, a, problem, forty, years, a...",2,telephone
1,"[jon, could, smell, baked, bread, on, the, air...","[jon, smelt, food, in, the, air, and, was, hun...",2,fiction
2,"[it, will, be, like, italian, basketball, with...","[this, type, of, italian, basketball, is, noth...",0,telephone
3,"[well, i, think, that, 's, about, uh, that, 's...","[sorry, but, we, are, not, done, just, yet]",0,telephone
4,"[good, job, tenure, that, is, --, because, in,...","[dr., quinn, medicine, woman, was, worked, on,...",1,slate


In [295]:
mnli_train[mnli_train['genre']=='fiction'].reset_index(drop=True)

Unnamed: 0,sentence1,sentence2,label,genre
0,"[jon, could, smell, baked, bread, on, the, air...","[jon, smelt, food, in, the, air, and, was, hun...",2,fiction
1,"[jon, turned, to, adrin, vrenna, and, san'doro]","[jon, walked, away, without, acknowledging, th...",0,fiction
2,"[slowly, tommy, spoke]","[tommy, did, not, talk, quickly]",1,fiction
3,"[he, far, underestimated, the, boy]","[he, underestimated, how, good, of, a, cook, t...",2,fiction
4,"[still, connected, up, to, the, polygraph, mac...","[i, did, n't, care, what, the, polygraph, showed]",0,fiction
...,...,...,...,...
3831,"[sharp, spikes, lined, the, pit, aiming, both,...","[the, pit, was, lined, with, smooth, dirt]",0,fiction
3832,"[he, motioned, to, tommy, to, sit, down, oppos...","[he, told, tommy, to, remain, standing]",0,fiction
3833,"[he, fell, back, his, leg, twisting, at, a, wr...","[he, was, not, being, careful]",2,fiction
3834,"[he, was, good, better, than, jon, let, on, sa...","[the, kal, said, that, he, was, better, than, ...",1,fiction


In [296]:
for i in mnli_train.genre.unique():
    mnli_train[mnli_train['genre']==i].reset_index(drop=True).to_pickle('data/MNLI/mnli_train_'+i+'.p')

In [297]:
for i in mnli_val.genre.unique():
    mnli_val[mnli_val['genre']==i].reset_index(drop=True).to_pickle('data/MNLI/mnli_val_'+i+'.p')

-  load mnli tokenized data

In [337]:
mnli_train_telephone = pkl.load(open("data/MNLI/mnli_train_telephone.p", "rb"))
mnli_train_fiction = pkl.load(open("data/MNLI/mnli_train_fiction.p", "rb"))
mnli_train_slate = pkl.load(open("data/MNLI/mnli_train_slate.p", "rb"))
mnli_train_government = pkl.load(open("data/MNLI/mnli_train_government.p", "rb"))
mnli_train_travel = pkl.load(open("data/MNLI/mnli_train_travel.p", "rb"))
mnli_val_telephone = pkl.load(open("data/MNLI/mnli_val_telephone.p", "rb"))
mnli_val_fiction = pkl.load(open("data/MNLI/mnli_val_fiction.p", "rb"))
mnli_val_slate = pkl.load(open("data/MNLI/mnli_val_slate.p", "rb"))
mnli_val_government = pkl.load(open("data/MNLI/mnli_val_government.p", "rb"))
mnli_val_travel = pkl.load(open("data/MNLI/mnli_val_travel.p", "rb"))

- tokens to id

In [338]:
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

mnli_train_telephone_sent1_indices = token2index_dataset(mnli_train_telephone['sentence1'])
mnli_train_telephone_sent2_indices = token2index_dataset(mnli_train_telephone['sentence2'])
mnli_val_telephone_sent1_indices = token2index_dataset(mnli_val_telephone['sentence1'])
mnli_val_telephone_sent2_indices = token2index_dataset(mnli_val_telephone['sentence2'])

In [339]:
# visualize a random tokenized training example
rand_idx = random.randint(0, len(mnli_train_telephone) - 1)
print(mnli_train_telephone['sentence1'][rand_idx])
print(mnli_train_telephone_sent1_indices[rand_idx])
print(mnli_train_telephone['sentence2'][rand_idx])
print(mnli_train_telephone_sent2_indices[rand_idx])

['hop', 'skip', 'and', 'a', 'jump', 'there', 'you', 'go', 'well', 'that', "'s", 'good', 'well', 'are', 'you', 'uh', 'do', 'you', 'prefer', 'the', 'kind', 'of', 'weather', 'that', 'you', "'re", 'getting', 'in', 'dallas', 'over', 'your', 'years', 'in', 'new', 'hampshire', 'or', 'do', 'you', 'miss', 'the', 'winters', 'or']
[1, 1, 5, 6, 4040, 29, 13, 103, 64, 9, 12, 82, 64, 16, 13, 33, 25, 13, 1419, 2, 186, 3, 593, 9, 13, 77, 326, 7, 1018, 104, 84, 116, 7, 88, 5539, 37, 25, 13, 747, 2, 6344, 37]
['why', 'have', 'you', 'never', 'been', 'to', 'new', 'hampshire']
[242, 22, 13, 108, 61, 4, 88, 5539]


In [340]:
MAX_SENTENCE_LENGTH = max(max(mnli_train_telephone['sentence1'].apply(len)),max(mnli_train_telephone['sentence2'].apply(len)))
MAX_SENTENCE_LENGTH

242

In [341]:
# MAX_SENTENCE_LENGTH = 200
class MNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, sent1_list, sent2_list, label_list):

        self.sent1_list = sent1_list
        self.sent2_list = sent2_list
        self.label_list = label_list
        assert (len(self.sent1_list) == len(self.label_list))
        assert (len(self.sent2_list) == len(self.label_list))

    def __len__(self):
        return len(self.label_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx1 = self.sent1_list[key][:MAX_SENTENCE_LENGTH]
        token_idx2 = self.sent2_list[key][:MAX_SENTENCE_LENGTH]
        label = self.label_list[key]
        return [token_idx1, token_idx2, len(token_idx1), len(token_idx2), label]


In [342]:
mnli_telephone_train_dataset = MNLIDataset(mnli_train_telephone_sent1_indices, mnli_train_telephone_sent2_indices,mnli_train_telephone['label'].tolist())

In [343]:
def MNLI_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length: tell dataloader how to pad our sentences
    """
    sent1_list = []
    sent2_list = []
    label_list = []
    length1_list = []
    length2_list = []

    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[4])
        length1_list.append(datum[2])
        length2_list.append(datum[3])
    # padding
    for datum in batch:
        padded_vec1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[2])), 
                                mode="constant", constant_values=0)
        sent1_list.append(padded_vec1) # list of np.array
        padded_vec2 = np.pad(np.array(datum[1]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[3])), 
                                mode="constant", constant_values=0)
        sent2_list.append(padded_vec2)
        
    return [torch.from_numpy(np.array(sent1_list)),torch.from_numpy(np.array(sent2_list)),torch.LongTensor(length1_list), torch.LongTensor(length2_list),torch.LongTensor(label_list)]



In [344]:
BATCH_SIZE = 32
mnli_telephone_train_loader = torch.utils.data.DataLoader(dataset=mnli_telephone_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=MNLI_collate_func,
                                           shuffle=True)
mnli_telephone_val_dataset = SNLIDataset(snli_val_sent1_indices, snli_val_sent2_indices, snli_val['label'].tolist())
mnli_telephone_val_loader = torch.utils.data.DataLoader(dataset=mnli_telephone_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=MNLI_collate_func,
                                           shuffle=True)

In [345]:
### checking the first batch of data loader
for i, (sent1,sent2,length1,length2, labels) in enumerate(mnli_telephone_train_loader):
#     print(sent1)
    print(sent1.shape)
#     print(sent2)
    print(sent2.shape)
    print((torch.cat((sent1, sent2), 1)).shape)
    print(length1)
    print(length2)
    print(labels)
    break

torch.Size([32, 242])
torch.Size([32, 242])
torch.Size([32, 484])
tensor([16, 54,  8, 36, 27, 22, 28, 28, 22, 27, 11,  9, 13, 44, 94, 47, 16, 31,
        19, 37, 32, 52, 39, 30, 26, 49, 35, 27, 14,  7,  9, 50])
tensor([13,  6,  3, 27, 11,  5,  6, 14, 15, 13, 11,  6, 12, 14,  6, 11,  7, 18,
         8, 11, 13,  7, 15,  8,  9,  5, 12, 13, 10,  3, 13,  5])
tensor([0, 1, 0, 1, 2, 2, 2, 1, 1, 1, 2, 1, 0, 0, 1, 1, 1, 1, 2, 0, 0, 1, 1, 0,
        0, 2, 2, 2, 0, 0, 2, 0])


## 3.3 Finetuneing on MNLI

In [385]:
# load the best NN model
best = pkl.load(open("snli_best_model.pkl", "rb"))
best['info']

{'vocabulary_size': 7000,
 'embedding_dimension': 30,
 'hidden_dimension1': 30,
 'hidden_dimension2': 30,
 'sentences_interaction': 'concatenate',
 'optimizer': 'Adam',
 'learning_rate': 0.01,
 'lr_desc_per_ep': 0.0009,
 'weight_decay': 1e-06,
 'drop_out': 0.05,
 'max_sentence_length': 50,
 'best_val_acc': 68.7,
 '@epoch': 8,
 'num_epochs': 10,
 '@step': 101,
 'total_steps': 3125,
 '@train_loss': 0.483,
 'time_per_epoch(min)': 5.336699211597443}

In [382]:
best_model = best['model']
best_model.embed.weight.shape

torch.Size([7002, 30])

In [383]:
# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
learning_rate = 0.0005
optimizer = torch.optim.Adam(best_model.parameters(), lr=learning_rate)

In [384]:
num_epochs = 10 # number epoch to train

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for sent1,sent2,length1,length2, labels in loader:
        outputs = F.softmax(model(sent1,length1,sent2,length2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


for epoch in range(num_epochs):
    for i, (sent1,sent2,length1,length2, labels) in enumerate(mnli_telephone_train_loader):
        best_model.train()
        optimizer.zero_grad()
        outputs = best_model(sent1, length1, sent2, length2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(mnli_telephone_val_loader, best_model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Loss: {}'.format( 
                       epoch+1, num_epochs, i+1, len(mnli_telephone_train_loader), val_acc, loss))
                       


Epoch: [1/10], Step: [101/134], Validation Acc: 37.6, Loss: 1.8245795965194702
Epoch: [2/10], Step: [101/134], Validation Acc: 38.0, Loss: 1.348197340965271
Epoch: [3/10], Step: [101/134], Validation Acc: 37.4, Loss: 1.1927039623260498
Epoch: [4/10], Step: [101/134], Validation Acc: 37.7, Loss: 1.2358551025390625
Epoch: [5/10], Step: [101/134], Validation Acc: 37.1, Loss: 1.0580377578735352
Epoch: [6/10], Step: [101/134], Validation Acc: 36.4, Loss: 1.0513455867767334
Epoch: [7/10], Step: [101/134], Validation Acc: 36.7, Loss: 1.0686239004135132
Epoch: [8/10], Step: [101/134], Validation Acc: 37.0, Loss: 1.0709761381149292
Epoch: [9/10], Step: [101/134], Validation Acc: 36.4, Loss: 1.111294150352478
Epoch: [10/10], Step: [101/134], Validation Acc: 36.9, Loss: 0.9979006052017212


In [376]:
# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  

learning_rate = 0.005
optimizer = torch.optim.Adam(best_model.parameters(), lr=0.01, weight_decay=0.000001)

In [377]:
"""
python snli_save_best_model.py --model-file ./snil_best_model.pkl --h1-dim 30 --h2-dim 30 --max-vocab-size 7000 --ebd-dim 30 --sent-interaction concatenate --optimizer Adam --lr 0.01 --lr-desc-per-ep 0.0009 --epochs 10 --w-dec 0.000001 --dropout 0.05 --batch-size 32"""
num_epochs = 5 # number epoch to train

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for sent1,sent2,length1,length2, labels in loader:
        outputs = F.softmax(model(sent1,length1,sent2,length2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

for epoch in range(num_epochs):
    for i, (sent1,sent2,length1,length2, labels) in enumerate(train_loader):
        best_model.train()
        
        optimizer.zero_grad()
        outputs = best_model(sent1, length1, sent2, length2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, best_model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Loss: {}'.format( 
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc, loss))

Epoch: [1/5], Step: [101/3125], Validation Acc: 35.1, Loss: 1.116905689239502
Epoch: [1/5], Step: [201/3125], Validation Acc: 33.6, Loss: 1.1030097007751465
Epoch: [1/5], Step: [301/3125], Validation Acc: 35.2, Loss: 1.114428997039795
Epoch: [1/5], Step: [401/3125], Validation Acc: 36.1, Loss: 1.1081569194793701
Epoch: [1/5], Step: [501/3125], Validation Acc: 38.9, Loss: 1.0869009494781494
Epoch: [1/5], Step: [601/3125], Validation Acc: 38.5, Loss: 1.105239987373352
Epoch: [1/5], Step: [701/3125], Validation Acc: 38.5, Loss: 1.0369211435317993
Epoch: [1/5], Step: [801/3125], Validation Acc: 44.0, Loss: 1.0427151918411255
Epoch: [1/5], Step: [901/3125], Validation Acc: 44.6, Loss: 1.0304932594299316
Epoch: [1/5], Step: [1001/3125], Validation Acc: 49.8, Loss: 1.1065162420272827
Epoch: [1/5], Step: [1101/3125], Validation Acc: 54.8, Loss: 1.0114941596984863
Epoch: [1/5], Step: [1201/3125], Validation Acc: 56.0, Loss: 0.9452487230300903
Epoch: [1/5], Step: [1301/3125], Validation Acc: 54.

Epoch: [4/5], Step: [1201/3125], Validation Acc: 64.5, Loss: 0.5564201474189758
Epoch: [4/5], Step: [1301/3125], Validation Acc: 62.9, Loss: 0.6677322387695312
Epoch: [4/5], Step: [1401/3125], Validation Acc: 63.2, Loss: 0.7339513897895813
Epoch: [4/5], Step: [1501/3125], Validation Acc: 62.0, Loss: 0.8194581866264343
Epoch: [4/5], Step: [1601/3125], Validation Acc: 62.3, Loss: 0.7582756280899048
Epoch: [4/5], Step: [1701/3125], Validation Acc: 61.7, Loss: 0.6723196506500244
Epoch: [4/5], Step: [1801/3125], Validation Acc: 63.2, Loss: 0.6262174248695374
Epoch: [4/5], Step: [1901/3125], Validation Acc: 63.8, Loss: 0.6238638162612915
Epoch: [4/5], Step: [2001/3125], Validation Acc: 63.9, Loss: 0.6553499102592468
Epoch: [4/5], Step: [2101/3125], Validation Acc: 62.9, Loss: 0.7499513030052185
Epoch: [4/5], Step: [2201/3125], Validation Acc: 64.4, Loss: 0.7213110327720642
Epoch: [4/5], Step: [2301/3125], Validation Acc: 64.2, Loss: 0.7678834795951843
Epoch: [4/5], Step: [2401/3125], Validat

# 3.4

In [8]:
with open('data/wiki-news-300d-1M.vec') as f:
    for i, line in enumerate(f):
        if i > 1:
            break
        if i > 0:
            print(len(line.split()))
        print(line.split())

['999994', '300']
301
[',', '0.1073', '0.0089', '0.0006', '0.0055', '-0.0646', '-0.0600', '0.0450', '-0.0133', '-0.0357', '0.0430', '-0.0356', '-0.0032', '0.0073', '-0.0001', '0.0258', '-0.0166', '0.0075', '0.0686', '0.0392', '0.0753', '0.0115', '-0.0087', '0.0421', '0.0265', '-0.0601', '0.2420', '0.0199', '-0.0739', '-0.0031', '-0.0263', '-0.0062', '0.0168', '-0.0357', '-0.0249', '0.0190', '-0.0184', '-0.0537', '0.1420', '0.0600', '0.0226', '-0.0038', '-0.0675', '-0.0036', '-0.0080', '0.0570', '0.0208', '0.0223', '-0.0256', '-0.0153', '0.0022', '-0.0482', '0.0131', '-0.6016', '-0.0088', '0.0106', '0.0229', '0.0336', '0.0071', '0.0887', '0.0237', '-0.0290', '-0.0405', '-0.0125', '0.0147', '0.0475', '0.0647', '0.0474', '0.0199', '0.0408', '0.0322', '0.0036', '0.0350', '-0.0723', '-0.0305', '0.0184', '-0.0026', '0.0240', '-0.0160', '-0.0308', '0.0434', '0.0147', '-0.0457', '-0.0267', '-0.1703', '-0.0099', '0.0417', '0.0235', '-0.0260', '-0.1519', '-0.0116', '-0.0306', '-0.0413', '0.0330'

In [26]:
import io

def load_pretrained_ebd(fname,max_vocab_size):
    with open('data/wiki-news-300d-1M.vec') as f:
        word_ebd = np.zeros((max_vocab_size+2, 300))
        token2id = {}
        id2token = {}
        
        for i, line in enumerate(f):
            if i ==0:
                continue
            if i >= max_vocab_size: 
                break
            l = line.split()
            word_ebd[i+1, :] = np.asarray(l[1:]) #0,1 reserved for pad and unk
            token2id[l[0]] = i+1
            id2token[i+1] = l[0]
    return word_ebd, token2id, id2token

MAX_VOCAB_SIZE = 50000
word_ebd, token2id,id2token = load_pretrained_ebd('data/wiki-news-300d-1M.vec',MAX_VOCAB_SIZE)
word_ebd[1] = np.random.rand(300,) # wait to be trained
token2id['PAD']=0
token2id['UNK']=1
id2token[0]='PAD'
id2token[1]='UNK'

(300,)

In [27]:
#############################
######## test ##############
sent1 = torch.tensor([32,1,2,32,0,1])
mask = (sent1==1)
pretrained_sent1 = sent1.clone()
pretrained_sent1[mask] = 0 #assign unk vector to 0
fixed_embed = nn.Embedding.from_pretrained(torch.FloatTensor(word_ebd),freeze=True) #freeze those that are not unk
trainable_embed = nn.Embedding(1, 300)

print('sent1\n',sent1)



print('pretrained_sent1:\n',pretrained_sent1)
out1 = fixed_embed(pretrained_sent1) #embed pretrained 
print('out1:fixed_embed\n',out1)

sent1 -= 1
print('sent1-32\n',sent1)
sent1[~mask] = 0  #把不为32的都标记为0，i.e. mark all that are unk as 1
print('sent1~mask\n',sent1)


trainable_embedded_sent1 = trainable_embed(sent1)
out1[mask] = trainable_embedded_sent1[mask]
print('trainable_embedded_sent1:\n',trainable_embedded_sent1)
print('out1\n',out1)

sent1
 tensor([32,  1,  2, 32,  0,  1])
pretrained_sent1:
 tensor([32,  0,  2, 32,  0,  0])
out1:fixed_embed
 tensor([[-0.1202,  0.0700,  0.1030,  ...,  0.3267, -0.0606,  0.0025],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1073,  0.0089,  0.0006,  ...,  0.0050,  0.1173, -0.0400],
        [-0.1202,  0.0700,  0.1030,  ...,  0.3267, -0.0606,  0.0025],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])
sent1-32
 tensor([31,  0,  1, 31, -1,  0])
sent1~mask
 tensor([0, 0, 0, 0, 0, 0])
trainable_embedded_sent1:
 tensor([[-0.9479, -1.2583, -0.3336,  ...,  0.8607, -0.1911,  0.6727],
        [-0.9479, -1.2583, -0.3336,  ...,  0.8607, -0.1911,  0.6727],
        [-0.9479, -1.2583, -0.3336,  ...,  0.8607, -0.1911,  0.6727],
        [-0.9479, -1.2583, -0.3336,  ...,  0.8607, -0.1911,  0.6727],
        [-0.9479, -1.2583, -0.3336,  ...,  0.8607, -0.1911,  0.6727],
        [-