In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stanford-natural-language-inference-corpus/README.txt
/kaggle/input/stanford-natural-language-inference-corpus/snli_1.0_test.csv
/kaggle/input/stanford-natural-language-inference-corpus/snli_1.0_train.csv
/kaggle/input/stanford-natural-language-inference-corpus/snli_1.0_dev.csv


In [60]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
import transformers
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from ignite.engine import Engine, Events
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers.param_scheduler import create_lr_scheduler_with_warmup, LRScheduler
from torch.optim.lr_scheduler import ExponentialLR
from ignite.handlers import EarlyStopping, Checkpoint, DiskSaver, global_step_from_engine
from ignite.engine.events import EventEnum
from tqdm.notebook import tqdm

In [3]:
# Pick the first GPU when CUDA is usable, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Let cuDNN benchmark and select its convolution/RNN kernels.
torch.backends.cudnn.benchmark = True

In [4]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
# Freeze every encoder weight: BERT is used purely as a fixed feature extractor.
for frozen_weight in model.base_model.parameters():
    frozen_weight.requires_grad_(False)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [5]:
# Move the BERT encoder onto the device selected above (GPU when available).
model = model.to(device)

In [6]:
# Read only the label and the two sentence columns of the SNLI *test* split,
# then keep the rows whose gold label is not the "-" placeholder.
df = pd.read_csv(
    '/kaggle/input/stanford-natural-language-inference-corpus/snli_1.0_test.csv',
    usecols=["gold_label", "sentence1", "sentence2"],
).loc[lambda frame: frame["gold_label"] != "-"]

In [7]:
class Dataset_from_encoding(Dataset):
    """Map-style dataset over pre-tokenized premise/hypothesis pairs.

    Each sample is a dict with three entries:

    * ``"p"``      - one row of every tensor in the premise encodings
    * ``"h"``      - one row of every tensor in the hypothesis encodings
    * ``"labels"`` - the class id of the sample as a 0-dim tensor
    """

    # String label -> integer class id.
    _LABEL_TO_ID = {'contradiction': 0,
                    'neutral': 1,
                    'entailment': 2,}

    def __init__(self, p_encodings, h_encodings, labels):
        """Keep references to both encoding mappings and the label sequence."""
        self.p_encodings = p_encodings
        self.h_encodings = h_encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample; every tensor row is a detached copy."""
        def pick_row(encodings):
            return {key: val[idx].clone().detach() for key, val in encodings.items()}

        return {
            "p": pick_row(self.p_encodings),
            "h": pick_row(self.h_encodings),
            "labels": torch.tensor(self._get_label(self.labels[idx])),
        }

    def _get_label(self, x):
        """Translate a label string to its class id (KeyError if unknown)."""
        return self._LABEL_TO_ID[x]

In [8]:
def get_train_test(df, test_size=0.2, random_state=None):
    """Split ``df`` and tokenize both parts into two ``Dataset_from_encoding`` objects.

    Parameters:
    ----
    df: DataFrame with ``sentence1`` (premise), ``sentence2`` (hypothesis) and
        ``gold_label`` columns
    test_size: fraction (or absolute number) of rows held out, forwarded to
        ``train_test_split``
    random_state: optional seed forwarded to ``train_test_split``; the default
        ``None`` keeps the split non-deterministic, pass an int for a
        reproducible split

    Returns:
    ----
    train_ds, test_ds: datasets whose encodings already live on ``device``
    """
    def _encode(sentences):
        # One shared tokenizer configuration for all four sentence columns:
        # truncate to 128 tokens and pad to the longest sentence in the column.
        return tokenizer(sentences.tolist(),
                         return_tensors="pt",
                         max_length=128,
                         truncation=True,
                         padding=True).to(device)

    train, test = train_test_split(df, test_size=test_size, shuffle=True,
                                   random_state=random_state)

    train_ds = Dataset_from_encoding(_encode(train.sentence1),
                                     _encode(train.sentence2),
                                     train["gold_label"].tolist())
    test_ds = Dataset_from_encoding(_encode(test.sentence1),
                                    _encode(test.sentence2),
                                    test["gold_label"].tolist())

    return train_ds, test_ds

In [9]:
# Split the filtered frame with the default test_size and tokenize both parts.
train_ds, test_ds = get_train_test(df)

In [10]:
# Mini-batches of 64 pairs; only the training loader is reshuffled.
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

In [84]:
class mGRU(nn.Module):
    """matchLSTM implementation but using GRU instead of LSTM.

    The module consumes pre-computed token embeddings (``n_embed`` = 768 wide,
    time-major) for a premise and a hypothesis, encodes each with its own GRU,
    runs a word-by-word attention "match" GRU over the hypothesis and
    classifies the final match state into ``n_out`` = 3 classes.

    Options read from ``options``:
    ----
    HIDDEN_DIM (optional, default 300): hidden size of every GRU / attention matrix
    CUDA (optional, default False): place the module on ``cuda:0``; falls back
        to the CPU when CUDA is not available
    DROPOUT: dropout probability applied to the input embeddings
    LAST_NON_LINEAR: apply a ReLU to the logits before the log-softmax

    A ``USE_PRETRAINED`` key is ignored: its former handling dereferenced
    ``self.l_en`` / ``self.embedding``, neither of which this class defines.

    Padding is detected as embedding vectors that are entirely zero, so the
    caller has to zero the embeddings of padded positions before calling.
    """
    def __init__ (self, options):
        super(mGRU, self).__init__()
        self.options = options
        self.n_embed = 768
        self.n_dim = int(options.get("HIDDEN_DIM", 300))
        self.n_out = 3

        self.premise_gru = nn.GRU(self.n_embed, self.n_dim, bidirectional=False)
        self.hypothesis_gru = nn.GRU(self.n_embed, self.n_dim, bidirectional=False)
        self.out = nn.Linear(self.n_dim, self.n_out)

        # Attention Parameters (assigning an nn.Parameter registers it already).
        self.W_s = nn.Parameter(torch.randn(self.n_dim, self.n_dim))  # n_dim x n_dim
        self.W_t = nn.Parameter(torch.randn(self.n_dim, self.n_dim))  # n_dim x n_dim
        self.w_e = nn.Parameter(torch.randn(self.n_dim, 1))  # n_dim x 1
        self.W_m = nn.Parameter(torch.randn(self.n_dim, self.n_dim))  # n_dim x n_dim

        # Match GRU parameters.
        self.m_gru = nn.GRU(self.n_dim + self.n_dim, self.n_dim, bidirectional=False)

        # Move the whole module at once so that every sub-module and parameter
        # is guaranteed to share one device.
        on_gpu = bool(options.get("CUDA", False)) and torch.cuda.is_available()
        self.to(torch.device("cuda:0" if on_gpu else "cpu"))

    def _init_hidden(self, batch_size):
        """Init hidden matrix for GRU (zeros on the module's device/dtype)."""
        hidden_p = self.W_s.new_zeros(1, batch_size, self.n_dim)
        hidden_h = self.W_s.new_zeros(1, batch_size, self.n_dim)
        return hidden_p, hidden_h

    def _attn_gru_init_hidden(self, batch_size):
        """Init for GRU attention (zeros on the module's device/dtype)."""
        return self.W_s.new_zeros(batch_size, self.n_dim)

    def mask_mult(self, o_t, o_tm1, mask_t):
        """Select ``o_t`` where ``mask_t`` is set and ``o_tm1`` elsewhere.

        Operands are brought onto the device of ``o_t`` before combining.
        """
        mask_t = mask_t.to(o_t.device)
        return (o_t * mask_t) + (o_tm1.to(o_t.device) * torch.logical_not(mask_t))

    def _gru_forward(self, gru, encoded_sent, mask_sent, h_0):
        """Stateful GRU for premise/hypothesis

        Parameters:
        ----
        gru: GRU cell
        encoded_sent: embedded matrix of premise/hypothesis sentence (T x batch x n_embed)
        mask_sent: mask vector for embedded matrix (T x batch)
        h_0: init hidden vector for GRU cell (1 x batch x n_dim)

        Returns:
        ----
        o_s: output of every timestep from the GRU cell. A matrix has shape (T x batch x n_dim)
        h_t: last hidden state vector (1 x batch x n_dim)
        """
        dev = self.W_s.device
        encoded_sent = encoded_sent.to(dev)
        mask_sent = mask_sent.to(dev)
        batch_size = encoded_sent.size(1)

        h_tm1 = h_0.squeeze(0).to(dev)
        o_tm1 = None
        outputs = []

        for x_t, mask_t in zip(encoded_sent, mask_sent):
            # x_t : batch x n_embed; mask_t : batch,
            o_t, h_t = gru(x_t.unsqueeze(0), h_tm1.unsqueeze(0))  # 1 x batch x n_dim
            o_t, h_t = o_t[0], h_t[0]
            mask_t = mask_t.unsqueeze(1)  # batch x 1
            # Masked positions keep the previous hidden state.
            h_tm1 = self.mask_mult(h_t, h_tm1, mask_t)

            # There is no previous output to fall back on at the first step,
            # so the first output is kept unmasked.
            if o_tm1 is not None:
                o_t = self.mask_mult(o_t, o_tm1, mask_t)
            o_tm1 = o_t
            outputs.append(o_t)

        if outputs:
            o_s = torch.stack(outputs)
        else:
            # Zero-length sequence: nothing to stack, the state is still h_0.
            o_s = encoded_sent.new_zeros(0, batch_size, self.n_dim)

        return o_s, h_tm1.unsqueeze(0)

    def _attention_forward(self, H_s, mask_H_s, h_t, h_m_tm1=None):
        '''Word-by-word attention.

        Computes the Attention Weights over H_s using h_t (and h_m_tm1 if given)
        Returns an attention weighted representation of H_s, and the alphas.

        Parameters:
        ----
            H_s (T x batch x n_dim): output of all batchs come from GRU cell
            mask_H_s (T x batch): mask matrix
            h_t (batch x n_dim): hidden matrix for t-th word in hypothesis (batch)
            h_m_tm1 (batch x n_dim): previous h_m

        Returns:
        ----
            h_m (batch x n_dim)
            alpha (batch x T): attention weight
        '''
        dev = self.W_s.device
        H_s = H_s.to(dev).transpose(1, 0)  # batch x T x n_dim
        mask_H_s = mask_H_s.to(dev).transpose(1, 0)  # batch x T

        Whs = torch.matmul(H_s, self.W_s)  # batch x T x n_dim
        Wht = torch.mm(h_t.to(dev), self.W_t)  # batch x n_dim
        if h_m_tm1 is not None:
            W_r_tm1 = torch.mm(h_m_tm1.to(dev), self.W_m)  # (batch, n_dim)
            Whs = Whs + W_r_tm1.unsqueeze(1)
        M = torch.tanh(Whs + Wht.unsqueeze(1))  # batch x T x n_dim
        alpha = torch.matmul(M, self.w_e).squeeze(-1)  # batch x T
        alpha = alpha + (-1000.0 * (torch.logical_not(mask_H_s)))  # To ensure probability mass doesn't fall on non tokens
        alpha = F.softmax(alpha, dim=-1)  # normalise over the T premise positions
        return torch.bmm(alpha.unsqueeze(1), H_s).squeeze(1), alpha

    def _attn_gru_forward(self, o_h, mask_h, r_0, o_p, mask_p):
        '''Use match-GRU to modeling the matching between the premise and the hypothesis.

        Parameters:
        ----
        o_h : T x batch x n_dim : The hypothesis
        mask_h : T x batch
        r_0 : batch x n_dim :
        o_p : T x batch x n_dim : The premise. Will attend on it at every step
        mask_p : T x batch : the mask for the premise

        Returns:
        ----
            r : batch x n_dim : the last state of the rnn
            alpha_vec : T x batch x T the attn vec at every step
        '''
        dev = self.W_s.device
        o_h = o_h.to(dev)
        mask_h = mask_h.to(dev)
        batch_size = o_h.size(1)
        seq_len_p = o_p.size(0)

        r_tm1 = r_0.to(dev)
        alphas = []
        for h_t, mask_t in zip(o_h, mask_h):
            # h_t : batch x n_dim; mask_t : batch,
            a_t, alpha = self._attention_forward(o_p, mask_p, h_t, r_tm1)   # a_t : batch x n_dim
                                                                            # alpha : batch x T
            alphas.append(alpha)
            m_t = torch.cat([a_t, h_t], dim=-1)
            r_t, _ = self.m_gru(m_t.unsqueeze(0), r_tm1.unsqueeze(0))

            # Masked hypothesis positions keep the previous match state.
            r_tm1 = self.mask_mult(r_t[0], r_tm1, mask_t.unsqueeze(1))

        if alphas:
            alpha_vec = torch.stack(alphas)
        else:
            # Zero-length hypothesis: no attention steps were taken.
            alpha_vec = o_h.new_zeros(0, batch_size, seq_len_p)

        return r_tm1, alpha_vec

    def forward(self, encoded_p, encoded_h, training):
        """Classify a batch of premise/hypothesis pairs.

        Parameters:
        ----
        encoded_p (seq_len, batch, n_embed): encoding matrix premise
        encoded_h (seq_len, batch, n_embed): encoding matrix of hypothesis
        training: whether the input dropout is active

        Returns:
        ----
        pred (batch, n_out): log-probabilities of the classes
        """
        dev = self.W_s.device
        encoded_p = encoded_p.to(dev)
        encoded_h = encoded_h.to(dev)
        batch_size = encoded_p.size(1)

        # A position counts as a real token when its embedding is not all zeros.
        # The masks are taken before dropout so dropped features cannot alter them.
        mask_p = torch.any(torch.ne(encoded_p, 0), dim=2)
        mask_h = torch.any(torch.ne(encoded_h, 0), dim=2)

        encoded_p = F.dropout(encoded_p, p=self.options["DROPOUT"], training=training)
        encoded_h = F.dropout(encoded_h, p=self.options["DROPOUT"], training=training)

        # RNN
        h_p_0, h_n_0 = self._init_hidden(batch_size)  # 1 x batch x n_dim
        o_p, _ = self._gru_forward(self.premise_gru, encoded_p, mask_p, h_p_0)
        o_h, _ = self._gru_forward(self.hypothesis_gru, encoded_h, mask_h, h_n_0)
        # Attention
        r_0 = self._attn_gru_init_hidden(batch_size)
        h_star, _ = self._attn_gru_forward(o_h, mask_h, r_0, o_p, mask_p)
        # Output layer
        h_star = self.out(h_star)
        if self.options["LAST_NON_LINEAR"]:
            h_star = F.relu(h_star)
        pred = F.log_softmax(h_star, dim=-1)

        return pred

In [85]:
# Model hyper-parameters. "CUDA" follows the runtime instead of being forced to
# True, so the same cell also runs on a CPU-only session.
options = {"HIDDEN_DIM": 300, "CLASSES_2_IX": 3, "DROPOUT":0.2, "LAST_NON_LINEAR": True,
           "CUDA": torch.cuda.is_available()}
mgru = mGRU(options)

In [86]:
# Loss on the same device as the model/labels (no hard-coded .cuda()).
# NOTE(review): mgru already returns log-probabilities; CrossEntropyLoss applies
# another log-softmax on top, which leaves log-probabilities unchanged.
criterion = torch.nn.CrossEntropyLoss().to(device)

# Weight decay is applied to every parameter except those whose name contains
# one of the `no_decay` fragments (biases / LayerNorm weights).
param_optimizer = list(mgru.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
             ]
optim_params = {'lr': 2e-2, 'eps': 1e-6,}
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# The fully-qualified torch.optim.AdamW is used on purpose: the name `optim`
# is rebound to the optimizer instance below, shadowing `from torch import optim`.
optim = torch.optim.AdamW(optimizer_grouped_parameters, **optim_params)

In [87]:
# Linear warm-up followed by linear decay over the whole run.
# The schedule length assumes 10 epochs over train_dl; the warm-up is expressed
# as a fraction of it so that it can never exceed the schedule itself (a fixed
# 10000-step warm-up would if train_dl has fewer than 1000 batches).
# NOTE: building the scheduler already sets the optimizer's learning rate to
# the step-0 value of the schedule (0 while warming up); scheduler.step() has
# to be called after every optimizer step for the learning rate to move.
num_training_steps = len(train_dl) * 10
num_warmup_steps = num_training_steps // 10
scheduler = transformers.get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [88]:
def _encode_sentences(encodings):
    """Embed one tokenized batch with the frozen BERT encoder.

    Parameters:
    ----
    encodings: dict of tokenizer tensors (batch x T), including "attention_mask"

    Returns:
    ----
    Time-major hidden states (T x batch x n_embed) in which every padded
    position is an all-zero vector.
    """
    # The encoder is frozen, so no autograd graph is needed for it.
    with torch.no_grad():
        hidden = model(**encodings)["last_hidden_state"]  # batch x T x n_embed
    # mGRU recognises padding as all-zero embeddings, but BERT's hidden states
    # at padded positions are generally non-zero: zero them explicitly.
    keep = encodings["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    return (hidden * keep).permute(1, 0, 2)


def train_step(engine, batch):
    """Run one optimisation step on ``batch`` and return the scalar loss."""
    model.eval()  # frozen feature extractor: keep its dropout disabled
    mgru.train()
    optim.zero_grad()
    y = batch["labels"].to(device)
    p_encode = _encode_sentences(batch["p"])
    h_encode = _encode_sentences(batch["h"])
    y_pred = mgru(p_encode, h_encode, training=True)
    loss = criterion(y_pred, y)
    loss.backward()
    # Clip the gradients of the network being trained; the BERT encoder is
    # frozen and has no gradients to clip.
    torch.nn.utils.clip_grad_norm_(mgru.parameters(), max_norm=1)
    optim.step()
    # The warm-up scheduler put the learning rate at its step-0 value when it
    # was constructed; without stepping it the rate would stay there forever.
    scheduler.step()
    return loss.item()


def validation_step(engine, batch):
    """Return ``(y_pred, y)`` for ``batch`` without tracking gradients."""
    model.eval()
    mgru.eval()
    with torch.no_grad():
        y = batch["labels"].to(device)
        p_encode = _encode_sentences(batch["p"])
        h_encode = _encode_sentences(batch["h"])
        y_pred = mgru(p_encode, h_encode, training=False)
    return y_pred, y


def score_function(engine):
    """Early-stopping score: the accuracy metric of ``engine`` (higher is better)."""
    return engine.state.metrics['accuracy']

In [None]:
# Progress bar over the batches of one epoch; it is advanced every
# `log_interval` iterations and rewound at the end of each epoch.
log_interval = 10
pbar = tqdm(initial=0, leave=False, total=len(train_dl), desc=f"ITERATION - loss: {0:.2f}")

# Engine that runs train_step on every batch of train_dl.
trainer = Engine(train_step)
# trainer.register_events(*BackpropEvents)

# Metrics computed by the evaluator from the (y_pred, y) pairs that
# validation_step returns.
val_metrics = {
    "accuracy": Accuracy(),
    "loss": Loss(criterion)
}
evaluator = Engine(validation_step)
for name, metric in val_metrics.items():
    metric.attach(evaluator, name)

# Early stopping (patience=5) scored by score_function; it is evaluated each
# time an evaluator run completes and acts on `trainer`.
handler = EarlyStopping(patience=5, score_function=score_function, trainer=trainer)
evaluator.add_event_handler(Events.COMPLETED, handler)

@trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
def log_training_loss(engine):
    """Show the latest training loss on the progress bar and advance it."""
    # print("Epoch[{}] Loss: {:.2f}".format(trainer.state.epoch, trainer.state.output))
    pbar.desc = f"ITERATION - loss: {engine.state.output:.2f}"
    pbar.update(log_interval)

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    """Evaluate on test_dl after every epoch and print accuracy and loss."""
    evaluator.run(test_dl)
    metrics = evaluator.state.metrics
    tqdm.write("Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
          .format(trainer.state.epoch, metrics["accuracy"], metrics["loss"]))

    # Rewind the progress bar for the next epoch.
    pbar.n = pbar.last_print_n = 0


# @evaluator.on(Events.EPOCH_COMPLETED)
# def reduct_step(engine):
#     scheduler.step()


@trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
def log_time(engine):
    """Print the time the trainer recorded for the event that just fired."""
    tqdm.write(f"{trainer.last_event_name.name} took {trainer.state.times[trainer.last_event_name.name]} seconds")

# Train for 10 epochs, then release the progress bar.
trainer.run(train_dl, 10)
pbar.close()