In [2]:
import numpy as np#, pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import nltk
import os.path as osp
from nltk.util import ngrams
import matplotlib.pyplot as plt
%matplotlib inline
from torch.utils.data.sampler import SubsetRandomSampler
random.seed(134)

PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 32

In [3]:
train_tmp = open("hw2_data/mnli_train.tsv",'r+', encoding="utf-8").read().split('\n')
train_data = [row.split('\t') for row in train_tmp][1:-1]
val_tmp = open("hw2_data/mnli_val.tsv",'r+', encoding="utf-8").read().split('\n')
val_data = [row.split('\t') for row in val_tmp][1:-1]

In [4]:
ft_home = './'
words_to_load = 900000

with open(ft_home + 'wiki-news-300d-1M.vec','r+', encoding="utf-8") as f:
    loaded_embeddings_ft = np.zeros((2+words_to_load, 300))
    words_ft = {}
    idx2words_ft = {}
    ordered_words_ft = []
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        loaded_embeddings_ft[i+2, :] = np.asarray(s[1:])
        words_ft[s[0]] = i+2
        idx2words_ft[i+2] = s[0]
        ordered_words_ft.append(s[0])

In [5]:
idx2words_ft[0] = '<pad>'
idx2words_ft[1] = '<unk>'
words_ft['<pad>'] = 0
words_ft['<unk>'] = 1

In [6]:
train_s1 = [row[0] for row in train_data]
train_s2 = [row[1] for row in train_data]
val_s1 = [row[0] for row in val_data]
val_s2 = [row[1] for row in val_data]
train_label = [row[2] for row in train_data]
val_label = [row[2] for row in val_data]
train_genre = [row[3] for row in train_data]
val_genre = [row[3] for row in val_data]

In [7]:
#tokenize
def tokenize_dataset(dataset):
    token_dataset = []
    # we are keeping track of all tokens in dataset 
    # in order to create vocabulary later
    all_tokens = []
    
    for sample in dataset:
#         print(type(sample))
        tokens = nltk.word_tokenize(sample.lower())
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens
print ("Tokenizing val data")
val_s1_tokens, _ = tokenize_dataset(val_s1)
pkl.dump(val_s1_tokens, open("val_s1_tokens.p", "wb"))
val_s2_tokens, _ = tokenize_dataset(val_s2)
pkl.dump(val_s2_tokens, open("val_s2_tokens.p", "wb"))

# train set tokens
print ("Tokenizing train data")
train_s1_tokens, all_train_s1_tokens = tokenize_dataset(train_s1)
train_s2_tokens, all_train_s2_tokens = tokenize_dataset(train_s2)


Tokenizing val data
Tokenizing train data


In [8]:
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [words_ft[token] if token in words_ft else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

train_s1_indices = token2index_dataset(train_s1_tokens)
val_s1_indices = token2index_dataset(val_s1_tokens)
train_s2_indices = token2index_dataset(train_s2_tokens)
val_s2_indices = token2index_dataset(val_s2_tokens)

# double checking
print ("Train dataset size is {}".format(len(train_s1_indices)))
print ("Train dataset size is {}".format(len(train_s2_indices)))
print ("Val dataset size is {}".format(len(val_s1_indices)))
print ("Val dataset size is {}".format(len(val_s2_indices)))

Train dataset size is 20000
Train dataset size is 20000
Val dataset size is 5000
Val dataset size is 5000


In [9]:
MAX_SENTENCE_LENGTH = 30
class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, s1_data, s2_data, target_list, genre_list):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.s1_data = s1_data
        self.s2_data = s2_data
        self.target_list = target_list
        self.genre_list = genre_list
        assert (len(self.s1_data) == len(self.target_list))
        assert (len(self.s2_data) == len(self.target_list))
#         self.words_ft = words_ft

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        item = dict()
        
        item['s1_word_idx'] = self.s1_data[key][:MAX_SENTENCE_LENGTH]
        item['s2_word_idx'] = self.s2_data[key][:MAX_SENTENCE_LENGTH]
        item['label'] = self.target_list[key]
        item['genre'] = self.genre_list[key]
        return item

In [10]:
def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    s1_list, s2_list = [],[]
    label_list = []
    genre_list = []
#     length_list = []
#     print(batch)
    for datum in batch:
        label_list.append(datum['label'])
        genre_list.append(datum['genre'])
#         length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum['s1_word_idx']),
                                pad_width=((0,MAX_SENTENCE_LENGTH-len(datum['s1_word_idx']))),
                                mode="constant", constant_values=0)
        s1_list.append(list(padded_vec))
    for datum in batch:
        padded_vec = np.pad(np.array(datum['s2_word_idx']),
                                pad_width=((0,MAX_SENTENCE_LENGTH-len(datum['s2_word_idx']))),
                                mode="constant", constant_values=0)
        s2_list.append(list(padded_vec))
    label_list = [{'entailment':0,'contradiction':1,'neutral':2}[k] for k in label_list]
    genre_list = [{'slate':0,'telephone':1,'government':2,'travel':3, 'fiction':4}[k] for k in genre_list]
    return [torch.from_numpy(np.array(s1_list)),torch.from_numpy(np.array(s2_list)), torch.LongTensor(label_list),torch.LongTensor(genre_list)]


In [11]:
train_loader = torch.utils.data.DataLoader(
    SNLIDataset(train_s1_indices, train_s2_indices, train_label, train_genre),batch_size=BATCH_SIZE,shuffle=True,collate_fn=vocab_collate_func)

val_loader = torch.utils.data.DataLoader(
    SNLIDataset(val_s1_indices, val_s2_indices, val_label, val_genre),batch_size=BATCH_SIZE,shuffle=True,collate_fn=vocab_collate_func)

In [12]:
#get genre dataloader
def getgenreloader_train(genre):
    genrels = train_genre
    subid = [genrels[i] == genre for i in range(len(genrels))]
    train_s1_indices_sub = [train_s1_indices[i] for i in range(len(subid)) if subid[i]]
    train_s2_indices_sub = [train_s2_indices[i] for i in range(len(subid)) if subid[i]]
    train_label_sub = [train_label[i] for i in range(len(subid)) if subid[i]]
    train_genre_sub = [train_genre[i] for i in range(len(subid)) if subid[i]]
    return torch.utils.data.DataLoader(
        SNLIDataset(train_s1_indices_sub, train_s2_indices_sub, train_label_sub, train_genre_sub),
        batch_size=BATCH_SIZE,shuffle=True,collate_fn=vocab_collate_func)
def getgenreloader_val(genre):
    genrels = val_genre
    subid = [genrels[i] == genre for i in range(len(genrels))]
    val_s1_indices_sub = [val_s1_indices[i] for i in range(len(subid)) if subid[i]]
    val_s2_indices_sub = [val_s2_indices[i] for i in range(len(subid)) if subid[i]]
    val_label_sub = [val_label[i] for i in range(len(subid)) if subid[i]]
    val_genre_sub = [val_genre[i] for i in range(len(subid)) if subid[i]]
    return torch.utils.data.DataLoader(
        SNLIDataset(val_s1_indices_sub, val_s2_indices_sub, val_label_sub, val_genre_sub),
        batch_size=BATCH_SIZE,shuffle=True,collate_fn=vocab_collate_func)

In [13]:
def test_model(model, sp_loader, criterion=torch.nn.CrossEntropyLoss()):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    
    for sample in sp_loader:
        outputs = F.softmax(model(sample[0], sample[1]), dim=1)
#         loss = criterion(outputs, sample[2])
#         sumloss += loss.item()
        predicted = outputs.max(1, keepdim=True)[1].view(-1)
        total += len(predicted)
        truths = sample[2]
        correct += predicted.eq(truths.view_as(predicted)).sum().item()

    return (100 * correct / total)


In [97]:
for g in set(val_genre):
    print(g)

slate
telephone
government
travel
fiction


### Load model RNN

In [14]:
class RNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers, num_classes, dropoutp=0.3):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super().__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix), freeze=True)
        num_ebd, emb_size = weights_matrix.shape
        self.rnn1 = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, dropout=dropoutp, bidirectional=True)
        self.rnn2 = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, dropout=dropoutp, bidirectional=True)
        self.tanh = nn.Tanh()
#         self.linear = nn.Linear(hidden_size*2, num_classes)
        self.mlp = nn.Sequential(
            
            nn.Linear(hidden_size*2*2,400),
            #nn.Linear(rnn_output_dim, output_dim),
            nn.ReLU(),
            nn.Dropout(p=dropoutp),
            nn.Linear(400,num_classes),
        )

    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        hidden = torch.randn(self.num_layers*2, batch_size, self.hidden_size)

        return hidden

    def forward(self, s1, s2):
        # reset hidden state

#         batch_size = x.size()
        batch_size1 = s1.size()[0]
        batch_size2 = s2.size()[0]
    
#         print(len(s1))

        self.hidden1 = self.init_hidden(batch_size1)
        self.hidden2 = self.init_hidden(batch_size2)

        # get embedding of characters
        s1_embed = self.embedding(s1)
        s2_embed = self.embedding(s2)
        # pack padded sequence
#         s1_embed = torch.nn.utils.rnn.pack_padded_sequence(s1_embed, lengths.numpy(), batch_first=True)
        # fprop though RNN
        s1_rnn_out, self.hidden1 = self.rnn1(s1_embed, self.hidden1)
        s2_rnn_out, self.hidden2 = self.rnn2(s2_embed, self.hidden2)
        # undo packing
#         rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_out, batch_first=True)
        # sum hidden activations of RNN across time
        s1_rnn_out = torch.sum(s1_rnn_out, dim=1)
        s2_rnn_out = torch.sum(s2_rnn_out, dim=1)
        
        rnn_out = torch.cat([s1_rnn_out, s2_rnn_out], 1)
        
        logits = self.mlp(rnn_out)
        return logits

In [102]:
model = RNN(weights_matrix=loaded_embeddings_ft, hidden_size=100, num_layers=1, num_classes=3) 
# model.load_state_dict()
model.load_state_dict(torch.load('rnn_p_0.pth')['model'])
# model = torch.load('rnn_p_0.pth')

In [124]:
for g in set(val_genre):
    loader = getgenreloader_val(g)
    print(g, ' ', test_model(model, loader))

slate  45.60878243512974
telephone  43.38308457711443
government  46.653543307086615
travel  45.926680244399186
fiction  42.41206030150754


### CNN

In [125]:
class CNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers, num_classes, k=3, p=1, dropoutp=0.3):
        super().__init__()
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix))
        num_ebd, emb_size = weights_matrix.shape
        
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=k, padding=p)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=k, padding=p)
#         self.cnn3 = nn.Conv1d(emb_size, hidden_size, kernel_size=5, padding=2)
#         self.cnn4 = nn.Conv1d(hidden_size, hidden_size, kernel_size=5, padding=2)
        self.tanh = nn.Tanh()
        self.relu=nn.ReLU()  
        self.dropout = nn.Dropout(p = dropoutp)
#         self.maxpool = nn.MaxPool1d(MAX_SENTENCE_LENGTH)
#         self.linear = nn.Linear(hidden_size*2, num_classes)
        self.linear1 = nn.Linear(hidden_size*2, hidden_size*2)
        self.linear2 = nn.Linear(hidden_size*2, num_classes)
        
    def forward(self, s1, s2):
        # reset hidden state

#         batch_size = x.size()
        batch_size1, seq_len1 = s1.size()
        batch_size2, seq_len2 = s2.size()
#         print(len(s1)）

        # get embedding of characters
        s1_embed = self.embedding(s1)
        s2_embed = self.embedding(s2)
        
        hidden1 = self.conv1(s1_embed.transpose(1,2)).transpose(1,2)
        hidden2 = self.conv1(s2_embed.transpose(1,2)).transpose(1,2)
        
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size1, hidden1.size(1), hidden1.size(-1))
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size2, hidden2.size(1), hidden2.size(-1))
        
        hidden1 = self.conv2(hidden1.transpose(1,2)).transpose(1,2)
        hidden2 = self.conv2(hidden2.transpose(1,2)).transpose(1,2)
        
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size1, hidden1.size(1), hidden1.size(-1))
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size2, hidden2.size(1), hidden2.size(-1))
        
        hidden1 = torch.sum(hidden1, dim=1)
        hidden2 = torch.sum(hidden2, dim=1)
        
        cnn_out = torch.cat([hidden1, hidden2], 1)
        logits = self.relu(self.linear1(cnn_out))
        logits = self.dropout(logits)
        logits = self.linear2(logits)
            
        return logits

In [127]:
model = CNN(weights_matrix=loaded_embeddings_ft, hidden_size=300, num_layers=1, num_classes=3, k=5) 
# model.load_state_dict()
model.load_state_dict(torch.load('cnn_final.pth')['model'])

In [128]:
for g in set(val_genre):
    loader = getgenreloader_val(g)
    print(g, ' ', test_model(model, loader))

slate   40.31936127744511
telephone   44.37810945273632
government   42.32283464566929
travel   43.788187372708755
fiction   47.1356783919598


### Fine tuning using RNN

In [18]:
for g in set(val_genre):
    print('-------------------------')
    print(g)
    model = RNN(weights_matrix=loaded_embeddings_ft, hidden_size=100, num_layers=1, num_classes=3) 
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    model.load_state_dict(torch.load('rnn_p_0.pth')['model'])
    num_epochs = 3

    g_train_loader = getgenreloader_train(g)
    g_val_loader = getgenreloader_train(g)

    for epoch in range(num_epochs):
        for i, sample in enumerate(g_train_loader):
            model.train()
            optimizer.zero_grad()
                # Forward pass
            output = model(sample[0], sample[1])
            label = sample[2]
            loss = criterion(output, label)

                # Backward and optimize
            loss.backward()
            optimizer.step()
                # validate every 10 iterations
            if i > 0 and i % 40 == 0:
        #             validate
                val_acc = test_model(model, g_val_loader)

                print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                           epoch+1, num_epochs, i+1, len(g_train_loader), val_acc))


-------------------------
slate
Epoch: [1/3], Step: [41/126], Validation Acc: 45.13164431197218
Epoch: [1/3], Step: [81/126], Validation Acc: 46.42324888226528
Epoch: [1/3], Step: [121/126], Validation Acc: 47.64033780427223
Epoch: [2/3], Step: [41/126], Validation Acc: 48.484848484848484
Epoch: [2/3], Step: [81/126], Validation Acc: 49.72677595628415
Epoch: [2/3], Step: [121/126], Validation Acc: 51.465474416294086
Epoch: [3/3], Step: [41/126], Validation Acc: 51.8628912071535
Epoch: [3/3], Step: [81/126], Validation Acc: 53.129657228017884
Epoch: [3/3], Step: [121/126], Validation Acc: 52.831594634873326
-------------------------
government
Epoch: [1/3], Step: [41/122], Validation Acc: 50.090136492402785
Epoch: [1/3], Step: [81/122], Validation Acc: 51.94437290754571
Epoch: [1/3], Step: [121/122], Validation Acc: 53.05176409992274
Epoch: [2/3], Step: [41/122], Validation Acc: 54.69997424671646
Epoch: [2/3], Step: [81/122], Validation Acc: 56.348184393510174
Epoch: [2/3], Step: [121/1