In [None]:
!pip install transformers datasets
!pip install conllu
!pip3 install torchvision
!pip install scikit-learn

In [1]:
import sys
sys.path.append('../API')

import api

import torch
import torch.nn as nn

import numpy

from transformers import AutoTokenizer, AutoModel

from sklearn.metrics import precision_score, recall_score, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
db = api.connect_to_db()

Pinged your deployment. You successfully connected to MongoDB!


In [22]:
training_data, testing_data = api.get_database_content(db, .8, .2, 0)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

training_true_labels = []
for title in training_data:
    training_true_labels.append(title['label'])

testing_true_labels = []
for title in testing_data:
    testing_true_labels.append(title['label'])

[1, 1, 0, 1, 1, -1, 1, 1, 1, 1, 0, -1, 1, 0, 1, -1, 0, -1, 0, -1, 0, 1, 0, 0, 0, 1, 0, 1, -1, -1, 0, 0, -1, 1, -1, 0, 1, 0, 1, -1, -1, 1, 1, 1, 0, 1, 0, 0, 1, -1, 1, -1, -1, 1, 0, 0, -1, 0, 0, 1, 0, 1, -1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, -1, -1, 1, 1, -1, 1, -1, 1, 1, 0, 1, 1, 1, 0, 1, -1, -1, 0, 1, 1, 1, -1, 1, 0, 0, -1, -1, -1, 0, 1, -1, 1, 0, 0, -1, 1, 1, 0, 0, -1, -1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, -1, -1, 1, -1, 0, 1, 1, 0, -1, 1, -1, 0, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 0, 0, 0, 0, 0, -1, 1, 1, 0, 1, 1, 1, 0, -1, -1, 0, 0, 1, -1, 0, 0, -1, 0, 1, -1, 1, 1, -1, 0, 1, 1, -1, 0, 1, 0, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 0, 1, 1, -1, 1, 1, 1, 1, -1, 0, 1, 1, 1, -1, -1, 1, -1, -1, 0, 0, -1, -1, -1, 1, 0, 1, 0, -1, 1, 1, 1, 1, 0, -1, 0, 0, 0, 1, 1, 1, 0, -1, 0, 0, -1, 0, 0, -1, 0, -1, 0, 0, 0, 1, 1, 0, 1, -1, 0, 0, 0, 0, 1, -1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, -1, 0, 0, 0, 1, -1, 1, 1, 1, 1, -1, 1, 0, 1, 1, 1, -1, -1, 0, 1, -1, 0, 1, 0, 0, 0, 0, 1, -1, 1, 0, 0, -1, 0

In [7]:
class MyClassifier(nn.Module):
  def __init__(self, embedder, num_labels=2):
    super().__init__()
    self.num_labels = num_labels
    self.embedder = embedder
    self.fc = nn.Linear(embedder.embedding_size, num_labels)

  def forward(self, input):
    return self.fc(self.embedder(input))

In [14]:
class BinaryWordOverlapBatched:
  """return a binary vector the size of the vocab, indicating whether each
  word occurred in both the premise and hypothesis"""

  def __init__(self, vocab_size):
    self.vocab_size = vocab_size
    self.embedding_size = self.vocab_size

  def __call__(self, inputs):
    """assume inputs is list of instances of the dict returned by tokenize_rte function"""
    OH = lambda x: nn.functional.one_hot(torch.as_tensor(x), self.embedding_size)
    padder = lambda x: nn.utils.rnn.pad_sequence(x, batch_first=True)

    tokens_sent1 = padder([OH(inst['title']['input_ids'][0]) for inst in inputs])

    overlap = tokens_sent1.sum(axis=1)
    return overlap.float()

In [15]:
def tokenize_rte(data, tokenizer):
  tokenized = []
  for inst in data:
    n_inst = {
      'title': tokenizer(inst['title'], return_tensors='pt'),
      'orig_title': inst['title'],
      'label': inst['label'],
    }
    n_inst['str_tokenized_sentence'] = tokenizer.convert_ids_to_tokens(n_inst['title']['input_ids'][0])
    tokenized.append(n_inst)

  return tokenized

tokenized_training_data = tokenize_rte(training_data, tokenizer)
print(tokenized_training_data)

[{'title': {'input_ids': tensor([[  101,  2181,  1103, 27140,   170,  9640,  2369,   136,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}, 'orig_title': 'Is the IDF a terrorist organization?', 'label': 1, 'str_tokenized_sentence': ['[CLS]', 'Is', 'the', 'IDF', 'a', 'terrorist', 'organization', '?', '[SEP]']}, {'title': {'input_ids': tensor([[  101,  1262,  5871,  7941,  1209,  1129, 11289,  1111,   107,  4440,
           107,  1103, 21978,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}, 'orig_title': 'And hamas will be blamed for "breaking" the truce', 'label': 1, 'str_tokenized_sentence': ['[CLS]', 'And', 'ha', '##mas', 'will', 'be', 'blamed', 'for', '"', 'breaking', '"', 'the', 'truce', '[SEP]']}, {'title': {'input_ids': tensor([[  101,  8776,  8988,  2114,  1107,   160,   119,  2950,  7005, 16621,
     

In [16]:
def train(model, data_inst, data_labels, weight_adjuster, loss_fn,
          batch_size=100, num_epochs=5, epoch_callback=None):
  for epoch in range(num_epochs):
    print("Epoch %d" % epoch)
    batch_start = 0
    batch_end = min(batch_size, len(data_inst))
    num_batches = int(numpy.ceil(len(data_inst) / batch_size))
    for batch_idx, batch in enumerate(range(num_batches)):
      batch = data_inst[batch_start:batch_end]
      weight_adjuster.zero_grad()
      logits = model(batch)

      labels = data_labels[batch_start:batch_end]

      loss = loss_fn(input=logits, target=labels)
      print("Epoch %d, Batch %d: %d --> %d, Batch loss %f" % (epoch, batch_idx, batch_start, batch_end, loss.item()))
      loss.backward()
      weight_adjuster.step()

      batch_start = batch_end
      batch_end = batch_end+batch_size
      batch_end = min(batch_end, len(data_inst))

    if epoch_callback is not None:
      epoch_callback()

In [17]:
vocab_size = len(tokenizer)
embedder = BinaryWordOverlapBatched(vocab_size)

model = MyClassifier(embedder)

batches = tokenized_training_data[0:2]
logits = model(batches)

loss_fn = torch.nn.CrossEntropyLoss()

weight_adjuster = torch.optim.Adam(model.parameters())

In [21]:
train(model=model,
      data_inst=tokenized_training_data,
      data_labels=torch.tensor(training_true_labels),
      weight_adjuster=weight_adjuster,
      loss_fn=loss_fn,
      batch_size = 10,
      num_epochs = 1)

Epoch 0


IndexError: Target -1 is out of bounds.

In [None]:
def evaluate(model, true_labels, batch_size):
  dev_data = tokenize_rte(rte_dev, tokenizer)
  model.eval()
  model_eval = model(dev_data[0:30])

  predictions = []
  for prediction in model_eval:
    predictions.append(0 if prediction[0] > prediction[1] else 1)

  true_labels = true_labels[:batch_size]

  accuary = accuracy_score(true_labels, predictions)
  print(f"Accuracy = {accuary}")

  macro_precision = precision_score(true_labels, predictions, average='macro')
  macro_recall = recall_score(true_labels, predictions, average='macro')
  print(f"Macro Precision = {macro_precision}")
  print(f"Macro Recall = {macro_recall}")

  micro_precision = precision_score(true_labels, predictions, average='micro')
  micro_recall = recall_score(true_labels, predictions, average='micro')
  print(f"Micro Precision = {micro_precision}")
  print(f"Micro Recall = {micro_recall}")

evaluate(model, true_labels_dev, 30)