<a href="https://colab.research.google.com/github/PiotrusWatson/level4project/blob/master/transformer_like_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip
!unzip snli_1.0.zip

--2019-11-17 00:07:23--  https://nlp.stanford.edu/projects/snli/snli_1.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94550081 (90M) [application/zip]
Saving to: ‘snli_1.0.zip.2’

snli_1.0.zip.2        8%[>                   ]   7.91M  1.23MB/s    eta 79s    N
snli_1.0.zip.2       10%[=>                  ]   9.29M   812KB/s    eta 87s    

KeyboardInterrupt: ignored

In [0]:
# Download the Glove.zip file and expand it.
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip

In [0]:
import pandas as pd
import numpy as np

import torch,keras

from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.nn.parameter import Parameter
from torch.nn import init
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as data_utils
from torch import nn

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import roc_curve, auc

import math

In [0]:
train_dataframe = pd.read_json('./snli_1.0/snli_1.0_train.jsonl', lines=True)
test_dataframe = pd.read_json('./snli_1.0/snli_1.0_test.jsonl', lines=True)


In [0]:
def convert_to_lists(names_to_lists):
  for key in names_to_lists:
    names_to_lists[key] = names_to_lists[key].tolist()
  return names_to_lists

class Tokeniser:
  def __init__(self, texts, vocab_size, max_len):
    self.t = Tokenizer()
    self.max_len = max_len
    self.t.num_words = vocab_size
    
    full_corpus = []

    for index in texts:
      for text in texts[index]:
        full_corpus.append(text)
    
    self.t.fit_on_texts(full_corpus)

  def full_process(self, text):
    """OK SO: converts a list of strings into a list of numerical sequences
then pads them out so they're all a consistent size
then returns a numpy array of that :) """
    new_sequence = self.t.texts_to_sequences(text)
    padded_sequence = pad_sequences(new_sequence, maxlen=self.max_len, padding ="post")
    return np.array(padded_sequence, dtype=np.float32)

  def do_everything(self, texts):
    for index in texts:
      texts[index] = self.full_process(texts[index])
    self.word_to_id = self.t.word_index
    return texts



  
# Get the embedding matrix using Glove. 
vocab,word2idx = None,{}

def load_glove_embeddings(path, word2idx, embedding_dim):
    """Loading the glove embeddings"""
    vocab_size = len(word2idx) + 1
    print(vocab_size)
    with open(path) as f:
        embeddings = np.zeros((vocab_size, embedding_dim))
        for line in f.readlines():
            values = line.split()
            word = values[0]
            index = word2idx.get(word)
            if index:
                vector = np.array(values[1:], dtype='float32')
                if vector.shape[-1] != embedding_dim:
                    raise Exception('Dimension not matching.')
                embeddings[index] = vector
        return torch.from_numpy(embeddings).float()

#assumption: we're going to only care about classification per text
def generate_indexes(labels):
  return [1 if label == "neutral" else 2 if label == "entailment" else 0 for label in labels]

index_to_label = ["contradiction","neutral","entailment"]


In [0]:
def evaluation_summary(description, predictions, true_labels):
  print("Evaluation for: " + description)
  precision = precision_score(predictions, true_labels, average='macro')
  recall = recall_score(predictions, true_labels, average='macro')
  accuracy = accuracy_score(predictions, true_labels)
  f1 = fbeta_score(predictions, true_labels, 1, average='macro') #1 means f_1 measure
  print("Classifier '%s' has Acc=%0.3f P=%0.3f R=%0.3f F1=%0.3f" % (description,accuracy,precision,recall,f1))
  print(classification_report(predictions, true_labels, digits=3))
  print('\nConfusion matrix:\n',confusion_matrix(true_labels, predictions))
  return precision,recall,accuracy,f1 

In [0]:
MAX_LENGTH = 150
VOCAB_SIZE = 20000
BATCH_SIZE = 128
SAMPLE_SAMPLE_SIZE = 4

chopped_train_dataframe = train_dataframe.sample(n=int(len(train_dataframe["sentence1"])/SAMPLE_SAMPLE_SIZE))
x_train_lists = convert_to_lists({"premise": chopped_train_dataframe["sentence1"], "hypothesis": chopped_train_dataframe["sentence2"]})
y_train_list = chopped_train_dataframe["gold_label"].tolist()

x_test_lists = convert_to_lists({"premise": test_dataframe["sentence1"], "hypothesis": test_dataframe["sentence2"]})
y_test_list = test_dataframe["gold_label"].tolist()


x_tokeniser = Tokeniser(x_train_lists, VOCAB_SIZE, MAX_LENGTH)



In [0]:
x_train = x_tokeniser.do_everything(x_train_lists)
x_test = x_tokeniser.do_everything(x_test_lists)
y_train = np.array(generate_indexes(y_train_list), dtype=np.float32)
y_test = np.array(generate_indexes(y_test_list), dtype=np.float32)


In [0]:
#alright lets tensordataset this boy
train_data = data_utils.TensorDataset(torch.from_numpy(x_train["premise"]).type(torch.LongTensor),
                                      torch.from_numpy(x_train["hypothesis"]).type(torch.LongTensor),
                                      torch.from_numpy(y_train).type(torch.LongTensor))

train_loader = data_utils.DataLoader(train_data, batch_size=BATCH_SIZE, drop_last=True)
test_data = data_utils.TensorDataset(torch.from_numpy(x_test["premise"]).type(torch.LongTensor),
                                      torch.from_numpy(x_test["hypothesis"]).type(torch.LongTensor),
                                      torch.from_numpy(y_test).type(torch.LongTensor))
test_loader = data_utils.DataLoader(test_data, batch_size=BATCH_SIZE, drop_last=True)

STUFF I STILL DON'T KNOW A THING ABOUT

In [0]:
class LayerNormalization(nn.Module):

    def __init__(self, features, epsilon=1e-8):
        '''Applies layer normalization. USE LATER
        Args:
          epsilon: A floating number. A very small number for preventing ZeroDivision Error.
        '''
        super(layer_normalization, self).__init__()
        self.epsilon = epsilon
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.gamma * (x - mean) / (std + self.epsilon) + self.beta

#todo: rewrite, understand, use
class PositionalEncoding(nn.Module):

    def __init__(self, num_units, zeros_pad=True, scale=True):
        '''Sinusoidal Positional_Encoding.
        Args:
          num_units: Output dimensionality
          zero_pad: Boolean. If True, all the values of the first row (id = 0) should be constant zero
          scale: Boolean. If True, the output will be multiplied by sqrt num_units(check details from paper)
        '''
        super(PositionalEncoding, self).__init__()
        self.num_units = num_units
        self.zeros_pad = zeros_pad
        self.scale = scale

    def forward(self, inputs):
        # inputs: A 2d Tensor with shape of (N, T).
        N, T = inputs.size()[0: 2]

        # First part of the PE function: sin and cos argument
        position_ind = Variable(torch.unsqueeze(torch.arange(0, T), 0).repeat(N, 1).long())
        position_enc = torch.Tensor([
            [pos / np.power(10000, 2. * i / self.num_units) for i in range(self.num_units)]
            for pos in range(T)])

        # Second part, apply the cosine to even columns and sin to odds.
        position_enc[:, 0::2] = torch.sin(position_enc[:, 0::2])  # dim 2i
        position_enc[:, 1::2] = torch.cos(position_enc[:, 1::2])  # dim 2i+1

        # Convert to a Variable
        lookup_table = Variable(position_enc)

        if self.zeros_pad:
            lookup_table = torch.cat((Variable(torch.zeros(1, self.num_units)),
                                     lookup_table[1:, :]), 0)
            padding_idx = 0
        else:
            padding_idx = -1

        outputs = F.embedding(
            position_ind, lookup_table, padding_idx, None, 2, False, False)   # copied from torch.nn.modules.sparse.py

        if self.scale:
            outputs = outputs * self.num_units ** 0.5

        return outputs
  

real shit: multi head attention :)


basic feedforward network


In [0]:
class Forwarder(nn.Module):
  #incredibly basic feedforward network
  #two linear changes
  def __init__(self, in_channels, num_units=[2048, 512], dropout=0.1):

    super(Forwarder, self).__init__()
    self.in_channels = in_channels
    self.num_units = num_units

    self.conv1 = nn.Sequential(nn.Linear(self.in_channels, self.num_units[0]), nn.ReLU())
    self.conv2 = nn.Linear(self.num_units[0], self.num_units[1])
    self.norm1 = nn.LayerNorm(self.in_channels)


  def forward(self, inputs):
    
    conv1_outputs = self.conv1(inputs)
    conv2_outputs = self.conv2(conv1_outputs)

    conv2_outputs += inputs
    
    outputs = self.norm1(conv2_outputs)

    #normalise here

    return outputs

hyperparams: set them HERE

In [115]:
class Hyperparameters:
  attention_size = 300 #hidden/num_units
  num_heads = 5
  dropout = 0.1
  num_classes = 3
  num_blocks = 6
  epochs = 12
  sinusoid = True

glove_embeddings = load_glove_embeddings("glove.6B.{}d.txt".format(Hyperparameters.attention_size),x_tokeniser.word_to_id,Hyperparameters.attention_size)

22184


putting everything together, here's an attempt at a basic entailment model

In [0]:
class BasicEntailmentModel(nn.Module):
  def __init__(self,
               max_length,
               batch_size, 
               word_embeddings,
               vocab_size,
               hp):
    super(BasicEntailmentModel, self).__init__()
    self.hp = hp
    self.premise_embeddings = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=hp.attention_size)
    self.premise_embeddings.weight = nn.Parameter(word_embeddings)
    self.premise_dropout = nn.Dropout(hp.dropout)
    self.premise_norm = nn.LayerNorm(hp.attention_size)


    self.premise_positional_encoding = PositionalEncoding(num_units=hp.attention_size,
                                                          zeros_pad=False,
                                                          scale=False)

    self.hypothesis_embeddings = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=hp.attention_size)
    self.hypothesis_embeddings.weight = nn.Parameter(word_embeddings)
    self.hypothesis_dropout = nn.Dropout(hp.dropout)

    self.hypothesis_positional_encoding = PositionalEncoding(num_units=hp.attention_size,
                                                          zeros_pad=False,
                                                          scale=False)
    self.hypothesis_norm1 = nn.LayerNorm(hp.attention_size)
    self.word_embeddings_size = word_embeddings.size(1)
    self.batch_size = batch_size


    
    #create a buncha attributes with THIS HACKY CODE -> attention into feedforward
    for i in range(self.hp.num_blocks):
      self.__setattr__('premise_self_attention%d' % i, nn.MultiheadAttention(hp.attention_size,
                                                                           num_heads=hp.num_heads,
                                                                           dropout=hp.dropout))
      self.__setattr__('premise_feed_forward%d' % i, Forwarder(in_channels=hp.attention_size,
                                                               num_units = [4*hp.attention_size,
                                                                            hp.attention_size]))
    
    #same for hypothesis
    for i in range(self.hp.num_blocks):
      self.__setattr__('hypothesis_self_attention%d' % i, nn.MultiheadAttention(hp.attention_size,
                                                                           num_heads=hp.num_heads,
                                                                           dropout=hp.dropout))
      self.__setattr__('hypothesis_feed_forward%d' % i, Forwarder(in_channels=hp.attention_size,
                                                               num_units = [4*hp.attention_size,
                                                                           hp.attention_size]))
      
    self.mlp1 = nn.Linear(hp.attention_size * max_length, 20)
    self.mlp2 = nn.Linear(20, hp.num_classes)
    
  def forward(self, premise_words, hypothesis_words):
    self.premise = self.premise_embeddings(premise_words)
    self.premise += self.premise_positional_encoding(premise_words)
    self.premise = self.premise_dropout(self.premise.float())

    
    for i in range(self.hp.num_blocks):
      self.premise, _ = self.__getattr__('premise_self_attention%d' % i)(self.premise, 
                                                                      self.premise, 
                                                                      self.premise)
      self.premise = self.__getattr__('premise_feed_forward%d' % i)(self.premise)

    self.premise = self.premise_norm(premise)
    self.hypothesis = self.hypothesis_embeddings(hypothesis_words)
    self.hypothesis += self.hypothesis_positional_encoding(hypothesis_words)

    self.hypothesis = self.hypothesis_dropout(self.hypothesis.float())

    for i in range(self.hp.num_blocks):
      self.hypothesis, _ = self.__getattr__('hypothesis_self_attention%d' % i)(self.hypothesis, 
                                                                      self.hypothesis, 
                                                                      self.hypothesis)
      self.hypothesis = self.__getattr__('hypothesis_feed_forward%d' % i)(self.hypothesis)

    self.hypothesis = self.hypothesis_norm(self.hypothesis)
    
    self.mush = self.premise * self.hypothesis
    self.mush = self.mush.reshape(self.batch_size, -1)
    self.mush = self.mlp1(self.mush)
    self.output = torch.softmax(self.mlp2(self.mush),-1)
    return self.output

    





training function


In [0]:
def train(model=None, 
          train_loader=None, 
          loss_function=None, 
          optimiser=None, 
          epochs=5, 
          using_gradient_clipping=False):
  
  losses = []
  accuracies = []
  
  for epoch in range(epochs):
    print("Running EPOCH:",epoch+1)
    total_loss = 0
    batch_count = 0
    correct = 0

    for batch_index, train_data in enumerate(train_loader):
      premise = Variable(train_data[0]).cuda()
      hypothesis = Variable(train_data[1]).cuda()
      actual_y = Variable(train_data[2]).cuda()
      
      torch.cuda.synchronize()
      optimiser.zero_grad()
      predicted_y = model(premise, hypothesis)
      squeezed_y = predicted_y.double().squeeze(1)
      loss = loss_function(squeezed_y,actual_y.long())

      correct += torch.eq(torch.argmax(squeezed_y, 1), actual_y).data.sum()
      
      if (batch_index == 1):
        print(total_loss, correct.data.cpu().numpy().astype(int)/train_loader.batch_size)
      total_loss += loss.data

      #woah we gotta do this to do backprop!!!
      optimiser.zero_grad()
      loss.backward()
      torch.cuda.synchronize()

      if using_gradient_clipping:
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
      batch_count += 1
      optimiser.step()

    print("Average loss is:",total_loss/batch_count)
    correct_but_numpy = correct.data.cpu().numpy().astype(int)
    accuracy = correct_but_numpy / float(batch_count * train_loader.batch_size)
    print("Accuracy of the model", accuracy)
    losses.append(total_loss/batch_count)
    accuracies.append(accuracy)
  return losses, accuracies

test function

In [0]:
def evaluate(model, test_premise, test_hypothesis, test_y):

  model.batch_size = test_hypothesis.shape[0] #why
  model = model.cpu()
  premise_variable = Variable(torch.from_numpy(test_premise).type(torch.LongTensor))
  hypothesis_variable = Variable(torch.from_numpy(test_hypothesis).type(torch.LongTensor))
  y_actual_variable = Variable(torch.from_numpy(test_y).type(torch.DoubleTensor))
  y_predicted = model(premise_variable, hypothesis_variable)
  y_predicted_rounded = torch.round(y_predicted.type(torch.DoubleTensor).squeeze(1))

  test_data_count = premise_variable.size(0)



  total_accuracy = torch.eq(torch.argmax(y_predicted,1), y_actual_variable).data.sum()
  average_accuracy = total_accuracy.data.cpu().numpy().astype(int)/float(test_data_count)
  return average_accuracy, torch.argmax(y_predicted,1)

  

ok lets run


In [124]:
model = BasicEntailmentModel(max_length = MAX_LENGTH,
                             batch_size=BATCH_SIZE,
                             vocab_size=VOCAB_SIZE,
                             word_embeddings=glove_embeddings,
                             hp=Hyperparameters
                             )
model = model.cuda()
loss = torch.nn.CrossEntropyLoss()
optimiser = torch.optim.Adagrad(model.parameters(), 
                                    lr=0.01)
loss, accuracy = train(model=model,
                       train_loader=train_loader,
                       loss_function=loss,
                       optimiser = optimiser,
                       epochs = 5,
                       using_gradient_clipping=True
                      )

Running EPOCH: 1


RuntimeError: ignored

testan


In [0]:
accuracy, predicted_ys = evaluate(model, 
         test_hypothesis=x_test["hypothesis"],
         test_premise=x_test["premise"],
         test_y=y_test)