In [7]:
!pip install transformers datasets
!pip install conllu
!pip3 install torchvision
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Downloading scipy-1.11.4-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.4 kB ? eta -:--:--
     -------------------- ------------------- 30.7/60.4 kB 1.4 MB/s eta 0:00:01
     ---------------------------------------- 60.4/60.4 kB 1.1 MB/s eta 0:00:00
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl (9.2 MB)
   ---------------------------------------- 0.0/9.2 MB ? eta -:--:--
   - -------------------------------------- 0.4/9.2 MB 7.4 MB/s eta 0:00:02
   -- ------------------------------------- 0.6/9.2 MB 9.1 MB/s eta 0:00:01
   ---- ------------------------------

In [8]:
import sys
sys.path.append('../API')

import api

import torch
import torch.nn as nn

import numpy

from transformers import AutoTokenizer, AutoModel

from sklearn.metrics import precision_score, recall_score, accuracy_score

In [16]:
# Open the database connection through the project-local `api` module (loaded from ../API).
# NOTE(review): the recorded run of this cell failed with KeyError: 'DB_USERNAME' --
# presumably connect_to_db() reads the DB credentials from environment variables; confirm
# and make sure they are set before running.
db = api.connect_to_db()

KeyError: 'DB_USERNAME'

In [None]:
# Fetch the train/test partitions from the database and load the cased BERT tokenizer.
# NOTE(review): the meaning of the (.8, .2, 0) arguments is defined by
# api.get_database_content -- presumably split fractions plus a seed/offset; confirm.
training_data, testing_data = api.get_database_content(db, .8, .2, 0)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Gold labels, in dataset order, for the dev and train splits.
# NOTE(review): rte_dev / rte_train are not defined in any cell visible here (only
# training_data / testing_data are) -- presumably they should be derived from those two;
# confirm, otherwise this cell raises NameError on a fresh kernel.
true_labels_dev = [rte_dev[pos]['label'] for pos in range(len(rte_dev))]

true_labels_train = [rte_train[pos]['label'] for pos in range(len(rte_train))]

In [None]:
class MyClassifier(nn.Module):
  """Single linear classification layer applied to the output of a feature extractor.

  The extractor (`embedder`) must be callable on a batch and expose an
  `embedding_size` attribute giving the width of the feature vectors it produces.
  """

  def __init__(self, embedder, num_labels=2):
    """Store the embedder and build a linear layer mapping its features to `num_labels` logits."""
    super().__init__()
    self.num_labels = num_labels
    self.embedder = embedder
    feature_width = embedder.embedding_size
    self.fc = nn.Linear(in_features=feature_width, out_features=num_labels)

  def forward(self, input):
    """Embed the batch, then project the features to one logit per label."""
    features = self.embedder(input)
    logits = self.fc(features)
    return logits

In [None]:
class BinaryWordOverlapBatched:
  """Featurizer returning, per instance, a binary vector the size of the vocab that
  indicates whether each token id occurred in both the premise and the hypothesis.

  The result is strictly 0/1: a token repeated inside a sentence still contributes 1.0
  (multiplying raw per-sentence counts would yield values greater than 1).
  """

  def __init__(self, vocab_size):
    """Remember the vocabulary size; it is also the width of the produced feature vectors."""
    self.vocab_size = vocab_size
    self.embedding_size = self.vocab_size

  def _presence(self, token_ids):
    """Return a bool vector of length `embedding_size`, True where a token id occurs in `token_ids`."""
    ids = torch.as_tensor(token_ids, dtype=torch.long)
    # Reduce over the sequence axis right away so that only one (seq_len, vocab) one-hot
    # matrix is alive at a time, instead of a padded (batch, max_len, vocab) tensor.
    return nn.functional.one_hot(ids, self.embedding_size).sum(dim=0) > 0

  def __call__(self, inputs):
    """Compute the overlap features for a batch.

    Assumes `inputs` is a list of instances of the dict returned by the tokenize_rte
    function: inst['sentence1']['input_ids'][0] and inst['sentence2']['input_ids'][0]
    are the token-id sequences of the two sentences.

    Returns a float tensor of shape (len(inputs), vocab_size) holding 1.0 where the token
    id is present in both sentences and 0.0 elsewhere.
    """
    in_sent1 = torch.stack([self._presence(inst['sentence1']['input_ids'][0]) for inst in inputs])
    in_sent2 = torch.stack([self._presence(inst['sentence2']['input_ids'][0]) for inst in inputs])

    overlap = in_sent1 & in_sent2
    return overlap.float()

In [None]:
def tokenize_rte(data, tokenizer):
  """Tokenize both sentences of every example in `data`.

  Each example must provide the keys 'sentence1', 'sentence2', 'label' and 'idx'.
  Returns a list with one dict per example containing the tokenizer output for each
  sentence (requested with return_tensors='pt'), the original sentence strings, the
  label and idx, and the string tokens recovered from the first row of each 'input_ids'.
  """
  def _encode(example):
    premise = tokenizer(example['sentence1'], return_tensors='pt')
    hypothesis = tokenizer(example['sentence2'], return_tensors='pt')
    record = dict(
      sentence1=premise,
      orig_sentence1=example['sentence1'],
      sentence2=hypothesis,
      orig_sentence2=example['sentence2'],
      label=example['label'],
      idx=example['idx'],
    )
    # Keep a human-readable view of the token ids next to the encoded form.
    for key, encoded in (('str_tokenized_sentence1', premise),
                         ('str_tokenized_sentence2', hypothesis)):
      record[key] = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    return record

  return [_encode(example) for example in data]

# Tokenize the training split once up front; the result is what gets sliced into batches
# and fed to the model later in the notebook.
# NOTE(review): rte_train is not defined in any cell visible here -- confirm where it is
# supposed to come from.
tokenized_rte_train = tokenize_rte(rte_train, tokenizer)

In [None]:
def train(model, data_inst, data_labels, weight_adjuster, loss_fn,
          batch_size=100, num_epochs=5, epoch_callback=None):
  """Run mini-batch training of `model` over the data for `num_epochs` epochs.

  Args:
    model: module called as model(batch); must support .train() (e.g. an nn.Module).
    data_inst: sliceable sequence of instances; each slice is passed to the model as-is.
    data_labels: sliceable labels aligned with `data_inst`.
    weight_adjuster: optimizer exposing zero_grad() and step().
    loss_fn: called as loss_fn(input=logits, target=labels); must return a scalar tensor.
    batch_size: number of instances per batch; the last batch may be smaller.
    num_epochs: number of full passes over the data.
    epoch_callback: optional zero-argument callable invoked after every epoch.

  Raises:
    ValueError: if `batch_size` is not positive.

  Data is visited in its given order (no shuffling); progress is printed per batch.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

  num_examples = len(data_inst)
  for epoch in range(num_epochs):
    print("Epoch %d" % epoch)
    # (Re-)enable training mode every epoch: the model may arrive in eval mode, and an
    # epoch_callback that evaluates the model typically calls model.eval().
    model.train()

    for batch_idx, batch_start in enumerate(range(0, num_examples, batch_size)):
      batch_end = min(batch_start + batch_size, num_examples)
      batch = data_inst[batch_start:batch_end]
      labels = data_labels[batch_start:batch_end]

      weight_adjuster.zero_grad()
      logits = model(batch)

      loss = loss_fn(input=logits, target=labels)
      print("Epoch %d, Batch %d: %d --> %d, Batch loss %f" % (epoch, batch_idx, batch_start, batch_end, loss.item()))
      loss.backward()
      weight_adjuster.step()

    if epoch_callback is not None:
      epoch_callback()

In [None]:
# The overlap featurizer emits one feature per token id, so its width is the tokenizer's
# vocabulary length.
vocab_size = len(tokenizer)
embedder = BinaryWordOverlapBatched(vocab_size)

# Linear classifier on top of the overlap features (num_labels left at its default of 2).
model = MyClassifier(embedder)

# Smoke test: forward pass over the first two training instances.
# `logits` is not read by any later cell shown in this notebook.
batches = tokenized_rte_train[0:2]
logits = model(batches)

loss_fn = torch.nn.CrossEntropyLoss()

# Adam over all model parameters, with the library's default hyperparameters.
weight_adjuster = torch.optim.Adam(model.parameters())

In [None]:
# Train the classifier on the tokenized training split: 3 epochs with batches of 50,
# using the loss function and optimizer created above. Labels are converted to a tensor
# so they can be sliced per batch and passed to the loss.
train(model=model,
      data_inst=tokenized_rte_train,
      data_labels=torch.tensor(true_labels_train),
      weight_adjuster=weight_adjuster,
      loss_fn=loss_fn,
      batch_size = 50,
      num_epochs = 3)

In [None]:
def evaluate(model, true_labels, batch_size):
  """Print accuracy and macro/micro precision and recall on the start of the dev split.

  Args:
    model: classifier called on a list of tokenized instances; returns one row of logits
      per instance.
    true_labels: gold labels aligned with the notebook-level `rte_dev`.
    batch_size: number of leading dev examples to score. The same count is applied to
      both the model input and the labels so the two always line up.

  Reads the notebook-level names `rte_dev` and `tokenizer`. Leaves the model in eval
  mode and returns None; the metrics are only printed.
  """
  dev_data = tokenize_rte(rte_dev, tokenizer)
  model.eval()
  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    model_eval = model(dev_data[:batch_size])

  # Predict label 0 only when its logit is strictly larger; ties go to label 1.
  predictions = [0 if prediction[0] > prediction[1] else 1 for prediction in model_eval]

  true_labels = true_labels[:batch_size]

  accuracy = accuracy_score(true_labels, predictions)
  print(f"Accuracy = {accuracy}")

  macro_precision = precision_score(true_labels, predictions, average='macro')
  macro_recall = recall_score(true_labels, predictions, average='macro')
  print(f"Macro Precision = {macro_precision}")
  print(f"Macro Recall = {macro_recall}")

  micro_precision = precision_score(true_labels, predictions, average='micro')
  micro_recall = recall_score(true_labels, predictions, average='micro')
  print(f"Micro Precision = {micro_precision}")
  print(f"Micro Recall = {micro_recall}")

# Score the trained model against the gold labels of the first 30 dev examples and print
# the metrics.
evaluate(model, true_labels_dev, 30)