<a href="https://colab.research.google.com/github/PaulMierau/natural-language-inferencing-using-pytorch/blob/main/nli_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Set the path to the Google Drive folder that contains the datasets

In [None]:
# Important:
# Download train.csv and test.csv from https://www.kaggle.com/c/contradictory-my-dear-watson/data
# and make sure they are uploaded to your working directory, specified below.
# !!! Running this notebook might require a Colab Pro account with the high-RAM runtime selected !!!
# The datasets are read from this folder and the best checkpoint is written to it.
work_dir = "/content/drive/MyDrive/Intro2DL/"

In [None]:
import numpy as np 
import pandas as pd 

%load_ext tensorboard
from tensorflow import summary
import tensorflow as tf
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
!pip3 install transformers
!pip3 install datasets
!pip3 install sentencepiece
from transformers import  XLMRobertaTokenizer, AutoTokenizer, AutoModelForSequenceClassification, AdamW
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import torch.optim as optim
from datasets import load_dataset

In [None]:
# Mount Google Drive at /content/drive so that the files under `work_dir` are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Define training dataset from train.csv and MNLI source

In [None]:
class SentencesDataset(Dataset):
    """Tokenized premise/hypothesis pairs read from a CSV file.

    Pairs whose tokenized length exceeds ``maxLength`` are dropped (not
    truncated); every remaining pair is padded to exactly ``maxLength`` tokens.

    Args:
        data_path: CSV file with ``premise`` and ``hypothesis`` columns, plus a
            ``label`` column unless ``isTest`` is true.
        model_name: Name of the pretrained tokenizer to load.
        maxLength: Maximum number of tokens per pair and width of the returned tensors.
        isTest: When true, no labels are read or returned.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (1-D tensors
    of length ``maxLength``) and, unless ``isTest``, ``labels`` (numpy array of
    shape ``(1,)``).
    """

    def __init__(self, data_path = work_dir + "train.csv", model_name = "joeddav/xlm-roberta-large-xnli", maxLength = 200, isTest=False):
      self.isTest = isTest
      data = pd.read_csv(data_path)
      tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
      # Pad/truncate to maxLength + 1 tokens so that column `maxLength` always
      # exists: it is padding exactly for the pairs that fit into `maxLength`
      # tokens. (Indexing that column after padding only to the longest sample
      # raised an IndexError whenever every pair was shorter than the limit.)
      self.encoded_data = tokenizer(list(data.premise.values), list(data.hypothesis.values),
                                    truncation=True, max_length=maxLength + 1,
                                    padding="max_length", return_tensors="pt")
      fits = self.encoded_data["attention_mask"][:, maxLength] == 0
      keep = torch.nonzero(fits).squeeze(1)
      for key in list(self.encoded_data.keys()):
        self.encoded_data[key] = self.encoded_data[key][keep, :maxLength]
      if not isTest:
        # Labels stay a numpy array of shape (N, 1), the layout the training
        # and validation steps receive from the data loader.
        self.encoded_data["labels"] = data.label.values[keep.numpy()].reshape(-1, 1)

    def __len__(self):
        return len(self.encoded_data["input_ids"])

    def __getitem__(self, idx):
      item = {"input_ids": self.encoded_data["input_ids"][idx],
              "attention_mask": self.encoded_data["attention_mask"][idx]}
      if not self.isTest:
        item["labels"] = self.encoded_data["labels"][idx]
      return item

In [None]:
def load_mnli(max_samples = 10001):
  """Load the first training samples of the MultiNLI dataset.

  Args:
    max_samples: Number of samples taken from the start of the train split;
      limits memory use and training time. ``None`` loads the whole split.
      The default of 10001 equals the previous hard-coded cut-off.

  Returns:
    Dict with the parallel lists ``"hypotheses"``, ``"premises"`` and ``"labels"``.
  """
  data = {"hypotheses": [], "premises": [], "labels":[]}
  dataset = load_dataset("multi_nli")
  for idx, sample in enumerate(dataset["train"]):
    if max_samples is not None and idx >= max_samples:
      break
    data["hypotheses"].append(sample["hypothesis"])
    data["premises"].append(sample["premise"])
    data["labels"].append(sample["label"])
  return data

class MnliDataset(Dataset):
    """Tokenized premise/hypothesis pairs taken from MultiNLI via ``load_mnli``.

    Pairs whose tokenized length exceeds ``maxLength`` are dropped (not
    truncated); every remaining pair is padded to exactly ``maxLength`` tokens.

    Args:
        model_name: Name of the pretrained tokenizer to load.
        maxLength: Maximum number of tokens per pair and width of the returned tensors.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (1-D tensors
    of length ``maxLength``) and ``labels`` (numpy array of shape ``(1,)``).
    """

    def __init__(self, model_name = "joeddav/xlm-roberta-large-xnli", maxLength = 200):
      data = load_mnli()
      tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
      # Pad/truncate to maxLength + 1 tokens: column `maxLength` is padding
      # exactly for the pairs that fit into `maxLength` tokens. Truncating here
      # also keeps over-long pairs from breaking the conversion to a rectangular
      # tensor and avoids padding every sample to 512 tokens.
      self.encoded_data = tokenizer(list(data["premises"]), list(data["hypotheses"]),
                                    truncation=True, max_length=maxLength + 1,
                                    padding="max_length", return_tensors="pt")
      fits = self.encoded_data["attention_mask"][:, maxLength] == 0
      keep = torch.nonzero(fits).squeeze(1)
      for key in list(self.encoded_data.keys()):
        self.encoded_data[key] = self.encoded_data[key][keep, :maxLength]
      # Labels stay a numpy array of shape (N, 1), the layout the training and
      # validation steps receive from the data loader.
      self.encoded_data["labels"] = np.array(data["labels"])[keep.numpy()].reshape(-1, 1)

    def __len__(self):
      return len(self.encoded_data["input_ids"])

    def __getitem__(self, idx):
      return {"input_ids": self.encoded_data["input_ids"][idx],
              "attention_mask": self.encoded_data["attention_mask"][idx],
              "labels": self.encoded_data["labels"][idx]}

# Initialize datasets and split into 80% for training and 20% for validation

In [None]:
batch_size = 64
SPLIT_SEED = 42  # fixes the train/validation split so that runs are comparable

mnli = MnliDataset()
sentences = SentencesDataset()
full_dataset = torch.utils.data.ConcatDataset([mnli, sentences])

# 80% of the combined data is used for training, the remainder for validation.
train_size = int(0.8*len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    full_dataset, [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation runs one sample at a time; the order does not matter, so no shuffling.
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

In [None]:
# Show how many samples ended up in the training and the validation split.
print(*map(len, (train_dataset, val_dataset)))

# Model definition

In [None]:
import torch.nn as nn
import torch.nn.functional as F
class LSTMRoBerta(nn.Module):
  """Frozen XLM-RoBERTa encoder followed by a bidirectional LSTM and a linear classifier.

  The encoder weights are not trained (``requires_grad`` is switched off);
  only the LSTM and the 3-way classifier receive gradients.
  """

  def __init__(self):
    super().__init__()
    self.roberta = AutoModelForSequenceClassification.from_pretrained("joeddav/xlm-roberta-large-xnli").base_model
    for param in self.roberta.parameters():
      param.requires_grad = False
    self.lstm = nn.LSTM(1024, 512, bidirectional=True, batch_first=True)
    self.classifier  = nn.Linear(1024, 3)

  def forward(self, input1, input2, hidden=None, train = True):
    """Classify a batch of token sequences.

    Args:
      input1: Token ids, passed to the encoder as ``input_ids``.
      input2: Attention mask, passed to the encoder as ``attention_mask``.
      hidden: Initial ``(h, c)`` state for the LSTM. Only used when ``train``
        is false; otherwise a zero state is created.
      train: When true (the default), or when ``hidden`` is ``None``, the LSTM
        starts from a fresh zero state sized for the current batch.

    Returns:
      ``(logits, (h, c))``. ``logits`` has one row per sample and 3 columns and
      is *unnormalized*: ``nn.CrossEntropyLoss`` applies log-softmax itself, so
      returning softmax output here would normalize twice and flatten the
      gradients. ``argmax`` over the logits gives the same class as over
      probabilities.
    """
    if train or hidden is None:
      hidden = self.init_hidden(input1.shape[0])
    encoded = self.roberta(input_ids=input1, attention_mask=input2)[0]
    _, (h, c) = self.lstm(encoded, hidden)
    # h[0] / h[1] are the final states of the two LSTM directions.
    features = torch.cat((h[0], h[1]), dim=1)
    return self.classifier(features), (h, c)

  def init_hidden(self, batch_size):
    """Return a zero ``(h, c)`` LSTM state for ``batch_size`` samples on the model's device."""
    # Use the module's own device rather than a notebook-level global so the
    # model also works before/without that global being defined.
    device = self.classifier.weight.device
    shape = (2, batch_size, self.lstm.hidden_size)
    h = torch.zeros(shape, device=device)
    c = torch.zeros(shape, device=device)
    return (h, c)

# Create the model on the desired device and define the optimizer

In [None]:
# Fall back to the CPU when no GPU is available instead of failing in `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = LSTMRoBerta()
model = model.to(device)

# Only the parameters that still require gradients are optimized.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set to the values that implementation used by default.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.AdamW(trainable_params, lr=3e-5, eps=1e-6, weight_decay=0.0)

In [None]:
def trainLSTM(model, optimizer, sample, h, first = False):
  """Run one optimization step on a single batch.

  Args:
    model: Module called as ``model(input_ids, attention_mask, h)`` and
      returning ``(pred, hidden)`` with ``pred`` of shape (batch, classes).
    optimizer: Optimizer over the trainable parameters of ``model``.
    sample: Batch dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
    h: Hidden state handed to the model.
    first: Unused; kept for backward compatibility.

  Returns:
    ``(loss, num_correct, hidden)``: the batch loss as a float, the number of
    correctly classified samples, and the hidden state returned by the model.
  """
  model.train()
  # Inputs go to wherever the model lives.
  device = next(model.parameters()).device
  criterion = nn.CrossEntropyLoss()

  input1 = sample["input_ids"].long().to(device)
  input2 = sample["attention_mask"].long().to(device)
  # Flatten the labels to shape (batch,). `squeeze()` would collapse a
  # single-sample batch to a 0-d tensor, which the loss rejects.
  target = sample['labels'].long().to(device).reshape(-1)

  optimizer.zero_grad()
  pred, h = model(input1, input2, h)

  pred_loss = criterion(pred, target)
  num_correct = (pred.argmax(dim=1) == target).sum().item()

  pred_loss.backward()
  optimizer.step()

  return pred_loss.item(), num_correct, h

In [None]:
def validateLSTM(model, sample, h):
  """Evaluate the model on a single batch without updating any weights.

  Args:
    model: Module called as ``model(input_ids, attention_mask, h)`` and
      returning ``(pred, hidden)`` with ``pred`` of shape (batch, classes).
    sample: Batch dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
    h: Hidden state handed to the model.

  Returns:
    ``(loss, num_correct, hidden)``: the batch loss as a float, the number of
    correctly classified samples, and the hidden state returned by the model.
  """
  model.eval()
  # Inputs go to wherever the model lives.
  device = next(model.parameters()).device
  criterion = nn.CrossEntropyLoss()

  with torch.no_grad():
      input1 = sample["input_ids"].long().to(device)
      input2 = sample["attention_mask"].long().to(device)
      # Flatten the labels to shape (batch,) so that loss and accuracy are
      # correct for any batch size, not only for single-sample batches.
      target = sample['labels'].long().to(device).reshape(-1)

      pred, h = model(input1, input2, h)
      pred_loss = criterion(pred, target)
      num_correct = (pred.argmax(dim=1) == target).sum().item()

  return pred_loss.item(), num_correct, h

## Use Tensorboard for live visualization of the training process

In [None]:
# One summary writer per split, each logging into its own sub-directory of ./runs.
train_log_dir, val_log_dir = './runs/train', './runs/validate'
train_summary_writer = summary.create_file_writer(train_log_dir)
val_summary_writer = summary.create_file_writer(val_log_dir)

In [None]:
# Start TensorBoard on the `runs` log directory, which the summary writers log into.
%tensorboard --logdir runs

# Training loop

In [None]:
max_epoch = 5
save_stride = 10  # NOTE(review): not used by the loop below
max_accu = -1
best_checkpoint_path = os.path.join(work_dir, 'roberta_dense_best.pth')

for epoch in tqdm(range(max_epoch)):
    # ---- training pass ----
    train_loss = 0.0  # mean batch loss over the epoch
    train_accu = 0.0  # fraction of correctly classified training samples

    first = True
    h = model.init_hidden(batch_size)
    for sample in tqdm(train_dataloader):
        # Detach the state so no autograd graph is kept alive across batches.
        h = tuple(each.detach() for each in h)
        curr_loss, num_correct, h = trainLSTM(model, optimizer, sample, h, first)
        train_loss += curr_loss / len(train_dataloader)
        train_accu += num_correct / len(train_dataset)
        first = False

    with train_summary_writer.as_default():
        tf.summary.scalar('loss', train_loss, step=epoch)
        tf.summary.scalar('accuracy', train_accu, step=epoch)

    # ---- validation pass ----
    val_loss = 0.0  # mean batch loss over the validation set
    val_accu = 0.0  # fraction of correctly classified validation samples

    h = model.init_hidden(1)
    for sample in tqdm(val_dataloader):
        h = tuple(each.detach() for each in h)
        curr_loss, num_correct, h = validateLSTM(model, sample, h)
        val_loss += curr_loss / len(val_dataloader)
        # Accuracy is per sample, so divide by the number of samples; dividing
        # by the number of batches is only right when every batch has size 1.
        val_accu += num_correct / len(val_dataset)

    with val_summary_writer.as_default():
        tf.summary.scalar('loss', val_loss, step=epoch)
        tf.summary.scalar('accuracy', val_accu, step=epoch)

    # Keep the weights of the epoch with the best validation accuracy so far
    # (ties overwrite the previous best).
    if val_accu >= max_accu:
        max_accu = val_accu
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }
        torch.save(checkpoint, best_checkpoint_path)

    print(train_accu, val_accu)

# Define test dataset from test.csv

In [None]:
class SentencesTestDataset(Dataset):
    """Tokenized premise/hypothesis pairs of the unlabeled test set.

    All rows of the CSV are kept, in file order, so that predictions can be
    matched back to the rows. Pairs are truncated to ``max_length`` tokens and
    padded to the longest remaining pair.

    Args:
        data_path: CSV file with ``premise`` and ``hypothesis`` columns.
        model_name: Name of the pretrained tokenizer to load.
        max_length: Maximum number of tokens per pair.

    Each item is a dict with ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, data_path = work_dir + "test.csv", model_name = "joeddav/xlm-roberta-large-xnli", max_length=512):
      data = pd.read_csv(data_path)
      tokenizer = AutoTokenizer.from_pretrained(model_name)
      # No row filtering: the previous all-ones mask of fixed length 5195 kept
      # every row anyway but broke for any test file of a different size.
      self.encoded_data = tokenizer(list(data.premise.values), list(data.hypothesis.values),
                                    truncation=True, max_length=max_length,
                                    padding=True, return_tensors="pt")

    def __len__(self):
        return len(self.encoded_data["input_ids"])

    def __getitem__(self, idx):
        return {"input_ids": self.encoded_data["input_ids"][idx],
                "attention_mask": self.encoded_data["attention_mask"][idx]}

In [None]:
# Iterate the test set in file order, one sample per batch, so that the
# predictions line up with the rows of test.csv.
test_dataset = SentencesTestDataset()
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=1)

# Test loop and create submission for kaggle challenge

In [None]:
predictions = []

# Restore the best weights saved by the training loop; map_location lets the
# checkpoint load on whatever device is in use now.
checkpoint = torch.load(os.path.join(work_dir, "roberta_dense_best.pth"), map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# The test loader yields single-sample batches, so the state is sized for 1.
h = model.init_hidden(1)
with torch.no_grad():
    for sample in tqdm(test_dataloader):
        input1 = sample["input_ids"].long().to(device)
        input2 = sample["attention_mask"].long().to(device)

        pred, h = model(input1, input2, h)
        # One predicted class index per sample of the batch.
        predictions.extend(pred.argmax(dim=1).tolist())


In [None]:
# The file submission.csv can be submitted on Kaggle as a solution to the challenge.

data = pd.read_csv(work_dir + "test.csv")
# Every test row needs exactly one prediction, otherwise the submission is misaligned.
assert len(predictions) == len(data), "number of predictions does not match the rows of test.csv"
submission = data.id.copy().to_frame()
submission['prediction'] = predictions
submission.to_csv("submission.csv", index = False)