In [1]:
!pip install transformers datasets
!pip install conllu
!pip3 install torchvision
!pip install scikit-learn



In [2]:
import sys
sys.path.append('../API')

import api

import torch
import torch.nn as nn

import numpy

from transformers import AutoTokenizer, AutoModel

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Connect to DB with your credentials
# Syntax: db = api.connect_to_db("username","password")
db = api.connect_to_db("Franco","C4jtOqQAfyM3bGF1")

Pinged your deployment. You successfully connected to MongoDB!


In [4]:
# API call to get training and testing data
# 80/20 train/test split with randomizing to avoid biased training
training_data, testing_data = api.get_database_content(db, .8, .2, 1)

# Training Data Labels
training_features = []
training_true_labels = []
for item in training_data:
    training_features.append(item['title'])
    training_true_labels.append(item['label'])

# Testing Data Labels
testing_features = []
testing_true_labels = []
for item in testing_data:
    testing_features.append(item['title'])
    testing_true_labels.append(item['label'])

print(training_features)
print(training_true_labels)
print(testing_features)
print(testing_true_labels)

[1, 0, 0, -1, -1, -1, 0, 1, -1, 0, -1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, 0, 0, 1, -1, 0, 0, -1, -1, 0, 0, -1, 1, -1, -1, 1, 1, 1, 1, 0, -1, 1, -1, 1, 0, -1, 0, 1, 1, 0, 0, 1, -1, 0, 0, 0, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, -1, 0, 0, -1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, -1, 1, 0, 1, -1, 0, 0, 1, -1, 1, 1, -1, -1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 0, 1, 0, 1, 1, -1, 1, 1, -1, 1, 1, 0, 1, -1, 0, 0, 1, -1, 0, 0, 0, 1, 1, -1, -1, 1, 1, 0, -1, 0, 1, 0, 0, 1, -1, 1, 0, -1, 0, 0, 0, 1, -1, 0, -1, 1, 1, -1, 1, -1, 1, 0, 1, 1, 1, 1, 0, -1, 0, -1, 1, 1, 0, 0, 0, 1, -1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, -1, 1, 0, -1, 1, -1, 0, 1, -1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, -1, -1, 0, -1, 1, 1, 0, 1, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 1, -1, 1, 0, 1, -1, 0, 0, -1, 0, -1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, -1, 1, 0, -1, -1, -1, 0, 1, 0, 1, 0, 0, 1, 

In [5]:
# Convert the -1, 0, 1 labels to 0, 1, 2 for Pytorch

def map_labels(labels):
    return [label + 1 for label in labels]

training_true_labels = map_labels(training_true_labels)
testing_true_labels = map_labels(testing_true_labels)

print(training_features)
print(training_true_labels)
print(testing_features)
print(testing_true_labels)

[2, 1, 1, 0, 0, 0, 1, 2, 0, 1, 0, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 1, 0, 2, 0, 2, 1, 0, 1, 2, 2, 1, 1, 2, 0, 1, 1, 1, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 0, 1, 1, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 0, 2, 1, 2, 0, 1, 1, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 1, 2, 1, 2, 2, 0, 2, 2, 0, 2, 2, 1, 2, 0, 1, 1, 2, 0, 1, 1, 1, 2, 2, 0, 0, 2, 2, 1, 0, 1, 2, 1, 1, 2, 0, 2, 1, 0, 1, 1, 1, 2, 0, 1, 0, 2, 2, 0, 2, 0, 2, 1, 2, 2, 2, 2, 1, 0, 1, 0, 2, 2, 1, 1, 1, 2, 0, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 0, 2, 1, 0, 2, 0, 1, 2, 0, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 0, 0, 1, 0, 2, 2, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2, 0, 2, 1, 2, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 0, 2, 1, 0, 0, 0, 1, 2, 1, 2, 1, 1, 2, 2, 0, 2, 1, 1, 2, 0, 2, 1, 1, 2, 2, 2, 1, 2, 0, 1, 2, 2, 0, 1, 0, 1, 2, 2, 

In [6]:
class MyClassifier(nn.Module):
    def __init__(self, num_labels=3):
        super(MyClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("bert-base-cased")
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        return self.fc(pooled_output)


In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
vocab_size = len(tokenizer)

def tokenize_rte(text_data, labels, tokenizer):
    tokenized_data = []

    for text, label in zip(text_data, labels):
        encoded_input = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        tokenized_instance = {
            'input_ids': encoded_input['input_ids'][0],
            'attention_mask': encoded_input['attention_mask'][0],
            'label': label,
            'original_text': text
        }
        tokenized_data.append(tokenized_instance)

    return tokenized_data

tokenized_training_data = tokenize_rte(training_features, training_true_labels, tokenizer)
print(tokenized_training_data)


[{'input_ids': tensor([  101,  2009,  1202,  1177,  1242,  1104,  1128,  9762,  1115,  1103,
         1346, 23726,  1116,  6290,  1884,  4934,  2200,  8619,   136,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,

In [8]:
# Define training function
def train(model, data_inst, data_labels, weight_adjuster, loss_fn,
          batch_size=100, num_epochs=5, epoch_callback=None):
    data_labels = torch.tensor(data_labels, dtype=torch.long)

    for epoch in range(num_epochs):
        print("Epoch %d" % epoch)
        num_batches = int(torch.ceil(torch.tensor(len(data_inst) / batch_size)))

        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = min(batch_start + batch_size, len(data_inst))

            # Extract batch data
            batch = data_inst[batch_start:batch_end]
            input_ids = torch.stack([b['input_ids'] for b in batch])
            attention_mask = torch.stack([b['attention_mask'] for b in batch])
            batch_labels = data_labels[batch_start:batch_end]

            weight_adjuster.zero_grad()

            # Forward pass
            logits = model(input_ids, attention_mask)

            loss = loss_fn(logits, batch_labels)
            print(f"Epoch {epoch}, Batch {batch_idx}: {batch_start} --> {batch_end}, Batch loss {loss.item()}")

            loss.backward()
            weight_adjuster.step()

        if epoch_callback is not None:
            epoch_callback()

    return model



model = MyClassifier()
loss_fn = torch.nn.CrossEntropyLoss()
weight_adjuster = torch.optim.Adam(model.parameters())
# Run the training
train(model=model,
      data_inst=tokenized_training_data,
      data_labels=training_true_labels,
      weight_adjuster=weight_adjuster,
      loss_fn=loss_fn,
      batch_size=10,
      num_epochs=5)


Epoch 0
Epoch 0, Batch 0: 0 --> 10, Batch loss 1.110385537147522
Epoch 0, Batch 1: 10 --> 20, Batch loss 1.5342700481414795
Epoch 0, Batch 2: 20 --> 30, Batch loss 1.3648701906204224
Epoch 0, Batch 3: 30 --> 40, Batch loss 1.1011905670166016
Epoch 0, Batch 4: 40 --> 50, Batch loss 1.1739366054534912
Epoch 0, Batch 5: 50 --> 60, Batch loss 1.2167785167694092
Epoch 0, Batch 6: 60 --> 70, Batch loss 1.322711706161499
Epoch 0, Batch 7: 70 --> 80, Batch loss 0.8827813267707825
Epoch 0, Batch 8: 80 --> 90, Batch loss 1.398803949356079
Epoch 0, Batch 9: 90 --> 100, Batch loss 1.390616774559021
Epoch 0, Batch 10: 100 --> 110, Batch loss 0.872087836265564
Epoch 0, Batch 11: 110 --> 120, Batch loss 1.1501253843307495
Epoch 0, Batch 12: 120 --> 130, Batch loss 0.8350754976272583
Epoch 0, Batch 13: 130 --> 140, Batch loss 1.0988976955413818
Epoch 0, Batch 14: 140 --> 150, Batch loss 1.0536905527114868
Epoch 0, Batch 15: 150 --> 160, Batch loss 1.055537462234497
Epoch 0, Batch 16: 160 --> 170, Batc

MyClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [9]:

def evaluate(model, data_inst, data_labels, batch_size=100):
    model.eval()  # Set the model to evaluation mode

    predictions = []
    actuals = []

    with torch.no_grad():  # No need to track gradients during evaluation
        num_batches = int(torch.ceil(torch.tensor(len(data_inst) / batch_size)))

        for batch_idx in range(num_batches):
            batch_start = batch_idx * batch_size
            batch_end = min(batch_start + batch_size, len(data_inst))

            batch = data_inst[batch_start:batch_end]
            input_ids = torch.stack([b['input_ids'] for b in batch])
            attention_mask = torch.stack([b['attention_mask'] for b in batch])
            labels = torch.tensor(data_labels[batch_start:batch_end], dtype=torch.long)

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1).tolist()  # Convert logits to predicted class indices

            predictions.extend(preds)
            actuals.extend(labels.tolist())

    # Calculate evaluation metrics
    accuracy = accuracy_score(actuals, predictions)
    precision = precision_score(actuals, predictions, average='macro')
    recall = recall_score(actuals, predictions, average='macro')
    f1 = f1_score(actuals, predictions, average='macro')

    return accuracy, precision, recall, f1


tokenized_testing_data = tokenize_rte(testing_features, testing_true_labels, tokenizer)
# Example usage
accuracy, precision, recall, f1 = evaluate(model, tokenized_testing_data, testing_true_labels, batch_size=10)
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Accuracy: 0.3389830508474576, Precision: 0.11299435028248588, Recall: 0.3333333333333333, F1 Score: 0.1687763713080169


  _warn_prf(average, modifier, msg_start, len(result))
