In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('hw2_data/snli_train.tsv',delimiter='\t')
val = pd.read_csv("hw2_data/snli_val.tsv",delimiter='\t')


train_m = pd.read_csv('hw2_data/mnli_train.tsv',delimiter='\t')
val_m = pd.read_csv("hw2_data/mnli_val.tsv",delimiter='\t')

In [3]:
val_m['genre'].unique()

array(['fiction', 'telephone', 'slate', 'government', 'travel'], dtype=object)

In [4]:
train_data = train[['sentence1','sentence2']]
train_targets = np.array(train['label'])

val_data = val[['sentence1','sentence2']]
val_targets = np.array(val['label'])


train_data_m = train_m[['sentence1','sentence2','genre']]
train_targets_m = np.array(train_m['label'])

val_data_m = val_m[['sentence1','sentence2','genre']]
val_targets_m = np.array(val_m['label'])

In [5]:
train_targets = np.where(train_targets=='entailment',1,train_targets)
train_targets = np.where(train_targets=='neutral',0,train_targets)
train_targets = np.where(train_targets=='contradiction',2,train_targets)


val_targets = np.where(val_targets=='entailment',1,val_targets)
val_targets = np.where(val_targets=='neutral',0,val_targets)
val_targets = np.where(val_targets=='contradiction',2,val_targets)

train_targets_m = np.where(train_targets_m=='entailment',1,train_targets_m)
train_targets_m = np.where(train_targets_m=='neutral',0,train_targets_m)
train_targets_m = np.where(train_targets_m=='contradiction',2,train_targets_m)


val_targets_m = np.where(val_targets_m=='entailment',1,val_targets_m)
val_targets_m = np.where(val_targets_m=='neutral',0,val_targets_m)
val_targets_m = np.where(val_targets_m=='contradiction',2,val_targets_m)


In [6]:
ft_home = './'
words_to_load = 50000

# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1


with open(ft_home + 'wiki-news-300d-1M.vec') as f:
    loaded_embeddings_ft = np.zeros((words_to_load+2, 300))
    words_ft = {}
    idx2words_ft = {}
    ordered_words_ft = ['<pad>','<unk>']
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        loaded_embeddings_ft[i+2, :] = np.asarray(s[1:])
        words_ft[s[0]] = i+2
        idx2words_ft[i+2] = s[0]
        ordered_words_ft.append(s[0])
    words_ft['<pad>'] = PAD_IDX 
    words_ft['<unk>'] = UNK_IDX   
    

In [7]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [words_ft[token] if token in words_ft else UNK_IDX for token in tokens.split(' ')]
        indices_data.append(index_list)
    return indices_data

train_data_sentence1_indices = token2index_dataset(train_data['sentence1'])
train_data_sentence2_indices = token2index_dataset(train_data['sentence2'])

val_data_sentence1_indices = token2index_dataset(val_data['sentence1'])
val_data_sentence2_indices = token2index_dataset(val_data['sentence2'])

# double checking
print ("Train dataset of sentence1 size is {}".format(len(train_data_sentence1_indices)))
print ("Train dataset of sentence2 size is {}".format(len(train_data_sentence2_indices)))


Train dataset of sentence1 size is 100000
Train dataset of sentence2 size is 100000


In [8]:
#MAX_SENTENCE_LENGTH = 80
    
import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list_1,data_list_2, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list_1 = data_list_1
        self.data_list_2 = data_list_2
        self.target_list = target_list
        assert (len(self.data_list_1) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """

        token_idx_1 = self.data_list_1[key]
        token_idx_2 = self.data_list_2[key]
        label = self.target_list[key]
        return [token_idx_1,token_idx_2, len(token_idx_1),len(token_idx_2), label]

def newsgroup_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list_1 = []
    data_list_2 = []
    label_list = []
    length_list_1 = []
    length_list_2 = []

    for datum in batch:
        label_list.append(datum[4])
        length_list_1.append(datum[2])
        #print(datum[2])
        length_list_2.append(datum[3])
        #print(datum[3])

    # padding
    for datum in batch:
        
        max_s1 = max (length_list_1) 
        max_s2 = max (length_list_2) 


        padded_vec_1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,max_s1-datum[2])), 
                                mode="constant", constant_values=0)
        data_list_1.append(padded_vec_1)
        
        #print(data_list_1[0])
        
        padded_vec_2 = np.pad(np.array(datum[1]), 
                                pad_width=((0,max_s2-datum[3])), 
                                mode="constant", constant_values=0)
        data_list_2.append(padded_vec_2)
        #print(data_list_2[0])
        
    return [torch.from_numpy(np.array(data_list_1)),torch.from_numpy(np.array(data_list_2)),torch.from_numpy(np.array(length_list_1)), torch.LongTensor(length_list_2),torch.LongTensor(label_list)]




In [9]:

# Function for testing the model
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data_1,data_2, lengths_1,lengths_2, labels in loader:
        data_batch1,data_batch2, length_batch1,length_batch2, label_batch = data_1.cuda(),data_2.cuda(), lengths_1.cuda(),lengths_2.cuda(), labels.cuda()
        outputs = F.softmax(model(data_batch1,data_batch2, length_batch1,length_batch2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
       
        #print(outputs)
        total += label_batch.size(0)
        correct += predicted.eq(label_batch.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [10]:

# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class SNLI_RNN(nn.Module):
    """
    SNLI classification model
    """
    def __init__(self, embeddings,vocab_size,emb_dim,hidden_size,num_layers,num_classes,FC_hiden):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(SNLI_RNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.emb_dim = emb_dim
        
        # pay attention to padding_idx 
        self.embed = nn.Embedding.from_pretrained(torch.from_numpy(embeddings).float(), freeze=False, sparse=False)

                

        
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hidden_size,num_layers=num_layers,batch_first=True,
                        bidirectional=True)

        self.linear = nn.Sequential(
            #nn.Linear(hidden_size,FC_hiden),
            nn.Linear(hidden_size*2,FC_hiden),
            nn.ReLU(),
            #nn.Dropout(),
            nn.Linear(FC_hiden,num_classes),
            nn.ReLU())     
        
    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        hidden = torch.randn(self.num_layers*2, batch_size, self.hidden_size).cuda()

        return hidden
    
    
    def forward(self, data_1,data_2, length_1,length_2):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        
        batch_size_1, seq_len_1 = data_1.size()
        batch_size_2, seq_len_2 = data_2.size()

        hidden1 = self.init_hidden(batch_size_1)
        hidden2 = self.init_hidden(batch_size_2)
   
        #S1 = self.embed(data_1)
        #print(data_1.size())
        S1 = self.embed(data_1)
        m1 = (data_1 == 1)
        #print(m1)
        m1 = m1.unsqueeze(2).repeat(1, 1, self.emb_dim).float()
        S1 = m1 * S1 + (1-m1) * S1.clone().detach()
        
 
        #print(S1.size())

        S2 = self.embed(data_2)
        m2 = (data_2 == 1)
        m2 = m2.unsqueeze(2).repeat(1, 1, self.emb_dim).float()
        S2 = m2 * S2 + (1-m2) * S2.clone().detach()

        # sort the seqs in des order
        _, idx_sort_1 = torch.sort(length_1, dim=0, descending=True)
        _, idx_unsort_1 = torch.sort(idx_sort_1, dim=0)
        S1 = S1.index_select(0, idx_sort_1)
        length_1 = list(length_1[idx_sort_1])

        _, idx_sort_2 = torch.sort(length_2, dim=0, descending=True)
        _, idx_unsort_2 = torch.sort(idx_sort_2, dim=0)
        S2 = S2.index_select(0, idx_sort_2)
        length_2 = list(length_2[idx_sort_2])
        
        #print("batch size",S1.size())

        # transform pytorch tensor to padded sequence, pack padded sequence
        S1 = torch.nn.utils.rnn.pack_padded_sequence(S1, np.array(length_1), batch_first=True)
        S2 = torch.nn.utils.rnn.pack_padded_sequence(S2, np.array(length_2), batch_first=True)
       
        _,S1 = self.rnn(S1,hidden1)
        _,S2 = self.rnn(S2,hidden2)
        
        #print("batch size",S1.size())
        
        
        S1 = torch.sum(S1,0)
        S2 = torch.sum(S2,0)
        
        #print("batch size",S2.size())

        # Unsort
        S1 = S1.index_select(0,idx_unsort_1)
        S2 = S2.index_select(0,idx_unsort_2)

        
        #out = torch.mul(S1, S2)
        out = torch.cat([S1, S2], 1)
        #print(out.size())

             
        # return logits
        out = self.linear(out)
        #print(out.size())

        return out


In [16]:
BATCH_SIZE = 100

train_dataset = NewsGroupDataset(train_data_sentence1_indices,train_data_sentence2_indices, train_targets)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset = NewsGroupDataset(val_data_sentence1_indices,val_data_sentence2_indices, val_targets)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

# Train the best RNN model
learning_rate = 0.0005
num_epochs = 20 # number epoch to train

model = SNLI_RNN(loaded_embeddings_ft,50000,300,250,num_layers=1,num_classes=3,FC_hiden=50).cuda()
# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    epoch_loss=[]
    val_acc = 0
    tra_acc = 0
    for i, (data_1,data_2, lengths_1,lengths_2, labels) in enumerate(train_loader):
        model.train()
        data_batch1,data_batch2,length_batch1,length_batch2,label_batch = data_1.cuda(),data_2.cuda(), lengths_1.cuda(),lengths_2.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(data_batch1,data_batch2, length_batch1,length_batch2)
        loss = criterion(outputs, label_batch)
        loss.backward()
        epoch_loss.append(loss)
        optimizer.step()
        # validate every 32 iterations
        if i > 0 and i % (BATCH_SIZE-1) == 0:
            # validate
            tra_acc = test_model(train_loader, model)
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                        epoch+1, num_epochs, i+1, len(train_loader), val_acc))


Epoch: [1/30], Step: [100/1000], Validation Acc: 46.1
Epoch: [1/30], Step: [199/1000], Validation Acc: 55.1
Epoch: [1/30], Step: [298/1000], Validation Acc: 56.5
Epoch: [1/30], Step: [397/1000], Validation Acc: 59.8
Epoch: [1/30], Step: [496/1000], Validation Acc: 59.6
Epoch: [1/30], Step: [595/1000], Validation Acc: 58.3
Epoch: [1/30], Step: [694/1000], Validation Acc: 60.4
Epoch: [1/30], Step: [793/1000], Validation Acc: 59.3
Epoch: [1/30], Step: [892/1000], Validation Acc: 60.1
Epoch: [1/30], Step: [991/1000], Validation Acc: 60.2
Epoch: [2/30], Step: [100/1000], Validation Acc: 59.7
Epoch: [2/30], Step: [199/1000], Validation Acc: 60.1
Epoch: [2/30], Step: [298/1000], Validation Acc: 60.6
Epoch: [2/30], Step: [397/1000], Validation Acc: 60.6
Epoch: [2/30], Step: [496/1000], Validation Acc: 60.9
Epoch: [2/30], Step: [595/1000], Validation Acc: 61.9
Epoch: [2/30], Step: [694/1000], Validation Acc: 60.7
Epoch: [2/30], Step: [793/1000], Validation Acc: 61.5
Epoch: [2/30], Step: [892/10

Epoch: [16/30], Step: [199/1000], Validation Acc: 68.9
Epoch: [16/30], Step: [298/1000], Validation Acc: 70.4
Epoch: [16/30], Step: [397/1000], Validation Acc: 69.0
Epoch: [16/30], Step: [496/1000], Validation Acc: 69.6
Epoch: [16/30], Step: [595/1000], Validation Acc: 69.5
Epoch: [16/30], Step: [694/1000], Validation Acc: 70.2
Epoch: [16/30], Step: [793/1000], Validation Acc: 69.9
Epoch: [16/30], Step: [892/1000], Validation Acc: 68.5
Epoch: [16/30], Step: [991/1000], Validation Acc: 70.7
Epoch: [17/30], Step: [100/1000], Validation Acc: 69.7
Epoch: [17/30], Step: [199/1000], Validation Acc: 71.0
Epoch: [17/30], Step: [298/1000], Validation Acc: 69.1
Epoch: [17/30], Step: [397/1000], Validation Acc: 69.7
Epoch: [17/30], Step: [496/1000], Validation Acc: 69.6
Epoch: [17/30], Step: [595/1000], Validation Acc: 70.3
Epoch: [17/30], Step: [694/1000], Validation Acc: 70.9
Epoch: [17/30], Step: [793/1000], Validation Acc: 68.9
Epoch: [17/30], Step: [892/1000], Validation Acc: 69.3
Epoch: [17

---
### Without tuning

In [19]:
res_without_tun = []
for g in train_data_m['genre'].unique():
    
    val_data_sentence1_indices = token2index_dataset(val_data_m[val_data_m['genre']==g]['sentence1'])
    val_data_sentence2_indices = token2index_dataset(val_data_m[val_data_m['genre']==g]['sentence2'])
    val_targets_g = val_targets_m[val_data_m['genre']==g]

    val_dataset = NewsGroupDataset(val_data_sentence1_indices,val_data_sentence2_indices, val_targets_g)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                               batch_size=BATCH_SIZE,
                                               collate_fn=newsgroup_collate_func,
                                               shuffle=True)
    
    val_acc = test_model(val_loader, model)
    
    
    res_without_tun.append((g,val_acc))

print(res_without_tun)


[('telephone', 48.95522388059702), ('fiction', 45.82914572864322), ('slate', 44.41117764471058), ('government', 45.17716535433071), ('travel', 45.21384928716904)]


---
### With tuning

In [22]:
torch.save(model.state_dict(),"B_rnn_model_states")

In [16]:
learning_rate = 0.00001
num_epochs = 1 # number epoch to train

BATCH_SIZE = 20

res_with_tun = []
for g in train_data_m['genre'].unique():
    
    print(g)
    
    train_data_sentence1_indices_m = token2index_dataset(train_data_m[train_data_m['genre']==g]['sentence1'])
    train_data_sentence2_indices_m = token2index_dataset(train_data_m[train_data_m['genre']==g]['sentence2'])
    train_targets_g_m = train_targets_m[train_data_m['genre']==g]
    
    
    val_data_sentence1_indices_m = token2index_dataset(val_data_m[val_data_m['genre']==g]['sentence1'])
    val_data_sentence2_indices_m = token2index_dataset(val_data_m[val_data_m['genre']==g]['sentence2'])
    val_targets_g_m = val_targets_m[val_data_m['genre']==g]
    
    train_dataset_m = NewsGroupDataset(train_data_sentence1_indices_m,train_data_sentence2_indices_m, train_targets_g_m)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset_m, 
                                               batch_size=BATCH_SIZE,
                                               collate_fn=newsgroup_collate_func,
                                               shuffle=True)

    val_dataset_m = NewsGroupDataset(val_data_sentence1_indices_m,val_data_sentence2_indices_m, val_targets_g_m)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset_m, 
                                               batch_size=BATCH_SIZE,
                                               collate_fn=newsgroup_collate_func,
                                               shuffle=True)

    for epoch in range(num_epochs):
        
        model_clone = SNLI_RNN(loaded_embeddings_ft,50000,300,250,num_layers=1,num_classes=3,FC_hiden=50).cuda()
        model_clone.load_state_dict(torch.load("B_rnn_model_states"))
            # Criterion and Optimizer
        criterion = torch.nn.CrossEntropyLoss()  
        optimizer = torch.optim.Adam(model_clone.parameters(), lr=learning_rate)

        epoch_loss=[]
        val_acc = 0
        tra_acc = 0
        for i, (data_1,data_2, lengths_1,lengths_2, labels) in enumerate(train_loader):

            model_clone.train()
            
            data_batch1,data_batch2,length_batch1,length_batch2,label_batch = data_1.cuda(),data_2.cuda(), lengths_1.cuda(),lengths_2.cuda(), labels.cuda()
            optimizer.zero_grad()
            outputs = model_clone(data_batch1,data_batch2, length_batch1,length_batch2)
            loss = criterion(outputs, label_batch)
            loss.backward()
            epoch_loss.append(loss)
            optimizer.step()
            # validate every 32 iterations
            if i > 0 and i % (BATCH_SIZE-1) == 0:
                # validate
                val_acc = test_model(val_loader, model_clone)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                            epoch+1, num_epochs, i+1, len(train_loader), val_acc))
    res_with_tun.append((g,val_acc))



telephone
Epoch: [1/1], Step: [20/214], Validation Acc: 45.67164179104478
Epoch: [1/1], Step: [39/214], Validation Acc: 47.06467661691542
Epoch: [1/1], Step: [58/214], Validation Acc: 47.06467661691542
Epoch: [1/1], Step: [77/214], Validation Acc: 46.26865671641791
Epoch: [1/1], Step: [96/214], Validation Acc: 45.87064676616915
Epoch: [1/1], Step: [115/214], Validation Acc: 46.26865671641791
Epoch: [1/1], Step: [134/214], Validation Acc: 45.27363184079602
Epoch: [1/1], Step: [153/214], Validation Acc: 46.169154228855724
Epoch: [1/1], Step: [172/214], Validation Acc: 46.069651741293534
Epoch: [1/1], Step: [191/214], Validation Acc: 45.87064676616915
Epoch: [1/1], Step: [210/214], Validation Acc: 46.56716417910448
fiction
Epoch: [1/1], Step: [20/192], Validation Acc: 47.93969849246231
Epoch: [1/1], Step: [39/192], Validation Acc: 45.82914572864322
Epoch: [1/1], Step: [58/192], Validation Acc: 46.33165829145729
Epoch: [1/1], Step: [77/192], Validation Acc: 47.8391959798995
Epoch: [1/1], S

In [17]:
print(res_with_tun)

[('telephone', 46.56716417910448), ('fiction', 45.7286432160804), ('slate', 44.11177644710579), ('government', 46.25984251968504), ('travel', 43.9918533604888)]
