In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('hw2_data/snli_train.tsv',delimiter='\t')
val = pd.read_csv("hw2_data/snli_val.tsv",delimiter='\t')
test = pd.read_csv("hw2_data/mnli_val.tsv",delimiter='\t')

In [3]:
train_data = train[['sentence1','sentence2']]
train_targets = np.array(train['label'])

val_data = val[['sentence1','sentence2']]
val_targets = np.array(val['label'])

test_data = test[['sentence1','sentence2','genre']]
test_targets = np.array(test['label'])


In [4]:
len(val_targets)

1000

In [5]:
train_targets = np.where(train_targets=='entailment',1,train_targets)
train_targets = np.where(train_targets=='neutral',0,train_targets)
train_targets = np.where(train_targets=='contradiction',2,train_targets)


val_targets = np.where(val_targets=='entailment',1,val_targets)
val_targets = np.where(val_targets=='neutral',0,val_targets)
val_targets = np.where(val_targets=='contradiction',2,val_targets)

test_targets = np.where(test_targets=='entailment',1,test_targets)
test_targets = np.where(test_targets=='neutral',0,test_targets)
test_targets = np.where(test_targets=='contradiction',2,test_targets)



In [6]:
len(val_targets)

1000

In [7]:
ft_home = './'
words_to_load = 50000

# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1


with open(ft_home + 'wiki-news-300d-1M.vec') as f:
    loaded_embeddings_ft = np.zeros((words_to_load+2, 300))
    words_ft = {}
    idx2words_ft = {}
    ordered_words_ft = ['<pad>','<unk>']
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        loaded_embeddings_ft[i+2, :] = np.asarray(s[1:])
        words_ft[s[0]] = i+2
        idx2words_ft[i+2] = s[0]
        ordered_words_ft.append(s[0])
    words_ft['<pad>'] = PAD_IDX 
    words_ft['<unk>'] = UNK_IDX   
    

In [8]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [words_ft[token] if token in words_ft else UNK_IDX for token in tokens.split(' ')]
        indices_data.append(index_list)
    return indices_data

train_data_sentence1_indices = token2index_dataset(train_data['sentence1'])
train_data_sentence2_indices = token2index_dataset(train_data['sentence2'])

val_data_sentence1_indices = token2index_dataset(val_data['sentence1'])
val_data_sentence2_indices = token2index_dataset(val_data['sentence2'])

# double checking
print ("Train dataset of sentence1 size is {}".format(len(train_data_sentence1_indices)))
print ("Train dataset of sentence2 size is {}".format(len(train_data_sentence2_indices)))


Train dataset of sentence1 size is 100000
Train dataset of sentence2 size is 100000


In [9]:
#MAX_SENTENCE_LENGTH = 80
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list_1,data_list_2, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list_1 = data_list_1
        self.data_list_2 = data_list_2
        self.target_list = target_list
        assert (len(self.data_list_1) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """

        token_idx_1 = self.data_list_1[key]
        token_idx_2 = self.data_list_2[key]
        label = self.target_list[key]
        return [token_idx_1,token_idx_2, len(token_idx_1),len(token_idx_2), label]

def newsgroup_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list_1 = []
    data_list_2 = []
    label_list = []
    length_list_1 = []
    length_list_2 = []

    for datum in batch:
        label_list.append(datum[4])
        length_list_1.append(datum[2])
        #print(datum[2])
        length_list_2.append(datum[3])
        #print(datum[3])

    # padding
    for datum in batch:
        
        max_s1 = max (length_list_1) 
        max_s2 = max (length_list_2) 


        padded_vec_1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,max_s1-datum[2])), 
                                mode="constant", constant_values=0)
        data_list_1.append(padded_vec_1)
        
        #print(data_list_1[0])
        
        padded_vec_2 = np.pad(np.array(datum[1]), 
                                pad_width=((0,max_s2-datum[3])), 
                                mode="constant", constant_values=0)
        data_list_2.append(padded_vec_2)
        #print(data_list_2[0])
        
    return [torch.from_numpy(np.array(data_list_1)),torch.from_numpy(np.array(data_list_2)),torch.from_numpy(np.array(length_list_1)), torch.LongTensor(length_list_2),torch.LongTensor(label_list)]

BATCH_SIZE = 100
train_dataset = NewsGroupDataset(train_data_sentence1_indices,train_data_sentence2_indices, train_targets)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset = NewsGroupDataset(val_data_sentence1_indices,val_data_sentence2_indices, val_targets)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)



In [10]:

# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F


class SNLI_CNN(nn.Module):
    def __init__(self,embeddings, emb_size, hidden_size, num_layers, num_classes, vocab_size=50000,kernel_size=2,FC_hiden=50):

        super(SNLI_CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.num_classes = num_classes
        self.emb_size = emb_size
        
        self.embed = nn.Embedding.from_pretrained(torch.from_numpy(embeddings).float(), freeze=False, sparse=False)
        
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size, padding=1)

                
        self.linear = nn.Sequential(
            nn.Linear(hidden_size*2,FC_hiden),
            nn.ReLU(),
            nn.Linear(FC_hiden,num_classes),
            nn.ReLU())     
        
    def forward(self, data_1,data_2,length_1,length_2):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        
        batch_size_1, seq_len_1 = data_1.size()
        batch_size_2, seq_len_2 = data_2.size()

        S1 = self.embed(data_1)
        m1 = (data_1 == 1)
        m1 = m1.unsqueeze(2).repeat(1, 1, self.emb_size).float()
        S1 = m1 * S1 + (1-m1) * S1.clone().detach()
        
 

        S2 = self.embed(data_2)
        m2 = (data_2 == 1)
        m2 = m2.unsqueeze(2).repeat(1, 1, self.emb_size).float()
        S2 = m2 * S2 + (1-m2) * S2.clone().detach()

        
        S1 = self.conv1(S1.transpose(1,2)).transpose(1,2)
        S1 = F.relu(S1.contiguous().view(-1, S1.size(-1))).view(BATCH_SIZE, S1.size(1), S1.size(-1))


        S1 = self.conv2(S1.transpose(1,2)).transpose(1,2)
        S1 = F.relu(S1.contiguous().view(-1, S1.size(-1))).view(BATCH_SIZE, S1.size(1), S1.size(-1))
        
        S1 = torch.max(S1, dim=1)

        S2 = self.conv1(S2.transpose(1,2)).transpose(1,2)
        S2 = F.relu(S2.contiguous().view(-1, S2.size(-1))).view(BATCH_SIZE, S2.size(1), S2.size(-1))

        S2 = self.conv2(S2.transpose(1,2)).transpose(1,2)
        S2 = F.relu(S2.contiguous().view(-1, S2.size(-1))).view(BATCH_SIZE, S2.size(1), S2.size(-1))
        S2 = torch.max(S2, dim=1)
        
        out = torch.cat([S1[0], S2[0]], 1)
        #print(out.size())
        
        logits = self.linear(out)

        return logits
    
    

In [11]:

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data_1,data_2, lengths_1,lengths_2, labels in loader:
        data_batch1,data_batch2, length_batch1,length_batch2, label_batch = data_1.cuda(),data_2.cuda(), lengths_1.cuda(),lengths_2.cuda(), labels.cuda()
        outputs = F.softmax(model(data_batch1,data_batch2, length_batch1,length_batch2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
       
        #print(outputs)
        total += label_batch.size(0)
        correct += predicted.eq(label_batch.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [12]:

learning_rate = 0.005
num_epochs = 10 # number epoch to train


model = SNLI_CNN(loaded_embeddings_ft,emb_size=300, hidden_size=200, num_layers=2, num_classes=3,vocab_size=50000).cuda()
# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for i, (data_1,data_2, lengths_1,lengths_2, labels) in enumerate(train_loader):
        model.train()
        data_batch1,data_batch2,length_batch1,length_batch2,label_batch = data_1.cuda(),data_2.cuda(), lengths_1.cuda(),lengths_2.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(data_batch1,data_batch2, length_batch1,length_batch2)

        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 32 iterations
        if i > 0 and i % (BATCH_SIZE-1) == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                        epoch+1, num_epochs, i+1, len(train_loader), val_acc))




Epoch: [1/10], Step: [100/1000], Validation Acc: 44.0
Epoch: [1/10], Step: [199/1000], Validation Acc: 46.7
Epoch: [1/10], Step: [298/1000], Validation Acc: 43.9
Epoch: [1/10], Step: [397/1000], Validation Acc: 47.1
Epoch: [1/10], Step: [496/1000], Validation Acc: 54.7
Epoch: [1/10], Step: [595/1000], Validation Acc: 51.9
Epoch: [1/10], Step: [694/1000], Validation Acc: 55.8
Epoch: [1/10], Step: [793/1000], Validation Acc: 58.0
Epoch: [1/10], Step: [892/1000], Validation Acc: 60.7
Epoch: [1/10], Step: [991/1000], Validation Acc: 61.6
Epoch: [2/10], Step: [100/1000], Validation Acc: 60.3
Epoch: [2/10], Step: [199/1000], Validation Acc: 62.1
Epoch: [2/10], Step: [298/1000], Validation Acc: 62.5
Epoch: [2/10], Step: [397/1000], Validation Acc: 62.3
Epoch: [2/10], Step: [496/1000], Validation Acc: 63.2
Epoch: [2/10], Step: [595/1000], Validation Acc: 62.4
Epoch: [2/10], Step: [694/1000], Validation Acc: 65.9
Epoch: [2/10], Step: [793/1000], Validation Acc: 64.6
Epoch: [2/10], Step: [892/10

In [13]:
def outputErrors(loader, model,number=3):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    correct_sen = []
    wrong_sen = []
    
    actual_pre_cor = []
    actual_pre_wor = []
    flag=0
    
    for data_1,data_2, lengths_1,lengths_2, labels in loader:
        data_batch1,data_batch2, length_batch1,length_batch2, label_batch = data_1.cuda(),data_2.cuda(), lengths_1.cuda(),lengths_2.cuda(), labels.cuda()
        outputs = F.softmax(model(data_batch1,data_batch2, length_batch1,length_batch2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        for i in range(len(predicted)):
            if len(correct_sen) >=number and len(wrong_sen) >=number:
                flag=1
                break
            else:
                if predicted[i]==label_batch[i]:
                    #print(data_batch1[i],data_batch2[i])
                    correct_sen.append((data_batch1[i],data_batch2[i]))
                    actual_pre_cor.append((predicted[i],label_batch[i]))
                else:
                    wrong_sen.append((data_batch1[i],data_batch2[i]))
                    actual_pre_wor.append((predicted[i],label_batch[i]))

        if flag==1:
            break                    
    return actual_pre_cor,actual_pre_wor,correct_sen,wrong_sen

actual_pre_cor,actual_pre_wor,cor,wrong = outputErrors(val_loader, model)

---
### Print the Corrected predicitions

The first one is the prediction while the second one is the actual label

In [14]:
count=0
for c in cor:
    print('\n')
    print("0 for neutral; 1 for entailment; 2 for contradiction")

    print (actual_pre_cor[count])
    print('the Premise')
    print((" ").join([ordered_words_ft[i] for i in np.array(c[0]) if i !=0]))
    print('the Hypothesis')
    print((" ").join([ordered_words_ft[i] for i in np.array(c[1]) if i !=0]))

    count+=1
    if count>3:
        break



0 for neutral; 1 for entailment; 2 for contradiction
(tensor([1], device='cuda:0'), tensor(1, device='cuda:0'))
the Premise
A man wearing a suit with a name tag stands reading in front of a microphone as a screen behind him displays a presentation .
the Hypothesis
A man with a microphone is standing in front of a screen .


0 for neutral; 1 for entailment; 2 for contradiction
(tensor([1], device='cuda:0'), tensor(1, device='cuda:0'))
the Premise
The room full of youths reacts emotionally as they <unk> .
the Hypothesis
people <unk> emotional watching something .


0 for neutral; 1 for entailment; 2 for contradiction
(tensor([2], device='cuda:0'), tensor(2, device='cuda:0'))
the Premise
A man selling donuts to a customer during a world exhibition event held in the city of Angeles
the Hypothesis
A woman drinks her coffee in a small cafe .


0 for neutral; 1 for entailment; 2 for contradiction
(tensor([2], device='cuda:0'), tensor(2, device='cuda:0'))
the Premise
The man and woman are bo

---
### Print the Wrong predicitions


The first one is the prediction while the second one is the actual label

In [15]:
count=0
for w in wrong:
    print('\n')
    print("0 for neutral; 1 for entailment; 2 for contradiction")
    print (actual_pre_wor[count])
    
    print('the Premise')
    print((" ").join([ordered_words_ft[i] for i in np.array(w[0]) if i !=0]))
    print('the Hypothesis')
    print((" ").join([ordered_words_ft[i] for i in np.array(w[1]) if i !=0]))

    count+=1
    if count>3:
        break



0 for neutral; 1 for entailment; 2 for contradiction
(tensor([0], device='cuda:0'), tensor(1, device='cuda:0'))
the Premise
Two male curling players are on ice sweeping the path in front of polished rock , a small crowd watches .
the Hypothesis
Several male curling players sweeping their path on ice in front of a rock as a small crowd watches .


0 for neutral; 1 for entailment; 2 for contradiction
(tensor([1], device='cuda:0'), tensor(0, device='cuda:0'))
the Premise
A black girl and white girl walking hand and hand in a busy area of a city with a public train in the background .
the Hypothesis
Some kids take public transit .


0 for neutral; 1 for entailment; 2 for contradiction
(tensor([1], device='cuda:0'), tensor(2, device='cuda:0'))
the Premise
These two people in hats are standing on rocky terrain , with their arms around each other .
the Hypothesis
Two people on rocky terrain waving at each other with both arms .


---
### Output the accuracy results on MultiNLI dataset


In [18]:

res=[]
for g in test_data['genre'].unique():
    test_data_sentence1_indices = token2index_dataset(test_data[test_data['genre']==g]['sentence1'])
    test_data_sentence2_indices = token2index_dataset(test_data[test_data['genre']==g]['sentence2'])
    test_targets_g = test_targets[test_data['genre']==g]
    
    test_dataset = NewsGroupDataset(test_data_sentence1_indices,test_data_sentence2_indices, test_targets_g)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                               batch_size=BATCH_SIZE,
                                               collate_fn=newsgroup_collate_func,
                                               shuffle=True)
    res.append((g, test_model(test_loader, model)))
    

print("CNN")
print(res)


CNN
[('fiction', 43.91959798994975), ('telephone', 44.875621890547265), ('slate', 40.31936127744511), ('government', 46.8503937007874), ('travel', 43.58452138492871)]
