## Imports

In [1]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset, load_metric
import datetime
import os
import time
import numpy as np
import evaluate
import accelerate
import torch

## Import Data

In [3]:
print(os.getcwd())

/opt/jupyterlab/notebooks/DogBERT/Classifiers/Pseudomonas_Otitis/Binary_Classifier/Binary_Weighted_Loss


In [4]:
df_psoe = pd.read_excel('adamwilliams-OtitisStudyPseudomonas (1).xls', sheet_name='Case Data', index_col=False)

In [5]:
print(df_psoe.head())

   Unnamed: 0            searchterm__name  savsnet_consult_id  \
0           0  Pseudomonas and ear/otitis               95721   
1           1  Pseudomonas and ear/otitis              129032   
2           2  Pseudomonas and ear/otitis              169684   
3           3  Pseudomonas and ear/otitis              117477   
4           4  Pseudomonas and ear/otitis              115552   

   narrative_item_id PseudomonasOtitis PseudomonasOtitis related_text species  
0            2040703                 ?                            NaN     dog  
1            2041630                 ✓                            NaN     dog  
2            2042546                 ✓                            NaN     dog  
3            2044364                 ✓                            NaN     dog  
4            2049428                 !                            NaN     dog  


In [6]:
os.chdir('../../../../../savsnet_resources/pickles')

In [7]:
df_narratives = pd.read_pickle('narrative_pickle.pkl.gz', compression='gzip').drop_duplicates(subset='savsnet_consult_id', keep='first')

In [8]:
os.chdir('../../DogBERT/Classifiers/Pseudomonas_Otitis/Binary_Classifier/Binary_Weighted_Loss')

In [9]:
# Filter for just dog records
df_psoe = df_psoe[df_psoe.species == 'dog']

# Remove unclassified records
df_psoe = df_psoe.dropna(subset=['PseudomonasOtitis'])

"""
Create Dataset
"""
# Join in tick labels
df_dataset = pd.merge(df_narratives, df_psoe, on='savsnet_consult_id', how='inner')

In [10]:
print(df_dataset.head())

                                           item_text  consult_record       pk  \
0  "<<identifier>> otitis externa been really goo...          231003  2040703   
1  "HL issues and ears. been slowing up last few ...          231674  2041630   
2  "bilateral oe thickened ear canals cleaning is...          232326  2042546   
3  "left eear purulent otitis suspecrt pseudomona...          233568  2044364   
4  "Bilat OE again. Not purulent or ulcerated, so...          237069  2049428   

        consult_record_date  savsnet_consult_id  Unnamed: 0  \
0 2014-06-30 17:09:38+00:00               95721           0   
1 2014-07-26 11:24:00+00:00              129032           1   
2 2014-08-29 16:22:44+00:00              169684           2   
3 2014-07-16 19:29:10+00:00              117477           3   
4 2014-07-15 17:17:57+00:00              115552           4   

             searchterm__name  narrative_item_id PseudomonasOtitis  \
0  Pseudomonas and ear/otitis            2040703                

In [11]:
# Change labels to make it a binary classifier
replacements = {'⍉':'x', '?':'x', '!':'x'}
df_dataset['PseudomonasOtitis'] = df_dataset['PseudomonasOtitis'].replace(replacements)

In [12]:
# Create ids that can be fed into the model
def tick_label_ids(symbol):
    if symbol == '✓':
        return 0
    elif symbol == 'x':
        return 1

In [13]:
# Label dataset   
df_dataset['PseudomonasOtitis_ID'] = df_dataset['PseudomonasOtitis'].apply(tick_label_ids)

In [14]:
print(df_dataset.head())

                                           item_text  consult_record       pk  \
0  "<<identifier>> otitis externa been really goo...          231003  2040703   
1  "HL issues and ears. been slowing up last few ...          231674  2041630   
2  "bilateral oe thickened ear canals cleaning is...          232326  2042546   
3  "left eear purulent otitis suspecrt pseudomona...          233568  2044364   
4  "Bilat OE again. Not purulent or ulcerated, so...          237069  2049428   

        consult_record_date  savsnet_consult_id  Unnamed: 0  \
0 2014-06-30 17:09:38+00:00               95721           0   
1 2014-07-26 11:24:00+00:00              129032           1   
2 2014-08-29 16:22:44+00:00              169684           2   
3 2014-07-16 19:29:10+00:00              117477           3   
4 2014-07-15 17:17:57+00:00              115552           4   

             searchterm__name  narrative_item_id PseudomonasOtitis  \
0  Pseudomonas and ear/otitis            2040703                

In [15]:
print(df_dataset['PseudomonasOtitis'].value_counts())

PseudomonasOtitis
✓    638
x    256
Name: count, dtype: int64


In [16]:
# Select get text and labels
df_dataset = df_dataset[['savsnet_consult_id', 'PseudomonasOtitis_ID', 'item_text']]
# Rename columns
df_dataset = df_dataset.rename(columns={'PseudomonasOtitis_ID':'label', 'item_text':'text'})
df_dataset.reset_index(drop=True)
print(df_dataset.head())

   savsnet_consult_id  label  \
0               95721      1   
1              129032      0   
2              169684      0   
3              117477      0   
4              115552      1   

                                                text  
0  "<<identifier>> otitis externa been really goo...  
1  "HL issues and ears. been slowing up last few ...  
2  "bilateral oe thickened ear canals cleaning is...  
3  "left eear purulent otitis suspecrt pseudomona...  
4  "Bilat OE again. Not purulent or ulcerated, so...  


In [17]:
# Generate train/val/test split 
"""
Split Each Category into 80/10/10 train/val/test sets
"""
# for each label in the dataframe, randomly select the number of records in the group_size. Note each index selected and remove from test set
import random

# get labels
labels = df_dataset['label'].unique()

# Set up train_set, val_set and test_set
train_set = []
val_set = []
test_set = []

#iterate over labels
for label in labels:
    # Get indexes of labels in a given group
    indexes = df_dataset[df_dataset['label'] == label].index.to_list()
    
    # Get size of group
    train_size = round(len(indexes) * 0.8)
    val_test_size = round(len(indexes) * 0.1)
    
    # Randomly sample train_size indexes to make train_set. Remove these indexes 
    train_indexes = random.sample(indexes, train_size)
    train_set += train_indexes
    
    # Remove train_indexes from overall indexes
    indexes = list(set(indexes) - set(train_indexes))
    
    # Randomly sample val_test_size indexes to make val_set. Remove these indexes
    val_indexes = random.sample(indexes, val_test_size)
    val_set += val_indexes
    
    # Remove train_indexes from overall indexes
    indexes = list(set(indexes) - set(val_indexes))
    test_set += indexes

In [18]:
# Create train, val and test dataframes
df_train = df_dataset.iloc[train_set].reset_index(drop=True)
df_val = df_dataset.iloc[val_set].reset_index(drop=True)
df_test = df_dataset.iloc[test_set].reset_index(drop=True)

In [19]:
print(os.getcwd())

/opt/jupyterlab/notebooks/DogBERT/Classifiers/Pseudomonas_Otitis/Binary_Classifier/Binary_Weighted_Loss


In [20]:
df_train = df_train[['label', 'text']]
df_val = df_val[['label', 'text']]
df_test = df_test[['label', 'text']]

In [21]:
df_train.to_csv('train.csv')
df_val.to_csv('val.csv')
df_test.to_csv('test.csv')

In [22]:
datasets = load_dataset("csv", data_files={'train': "train.csv",
                                           'eval': "val.csv",
                                           'test':"test.csv"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

## Tokenize Text

In [23]:
# Load BERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [24]:
def tokenize_and_encode(examples):
    return tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=512
    )

In [25]:
tokenized_datasets = datasets.map(
    tokenize_and_encode, batched=True)

Map:   0%|          | 0/715 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/89 [00:00<?, ? examples/s]

## Load BERT Model

In [35]:
# Label to Id and Id to labels
id2label = {0: "✓", 1: "x"}
label2id = {"✓": 0, "x": 1}

In [36]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# Setup weights and biases stuff
os.environ["WANDB_PROJECT"]="Pseudomonas_Otitis_Classifier_Binary_Post_Regex"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

In [38]:
training_args = TrainingArguments(
    output_dir= "BERT_PSOE_Binary_Classifier_Filtered_Unweighted",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=30,
    weight_decay=0.01,
    save_strategy = "epoch",
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16 = True,
    run_name="BERT_Unweighted",
    save_total_limit=3
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [39]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [40]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [41]:
def compute_metrics(eval_preds):
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    metrics = {"accuracy": accuracy.compute(predictions=predictions, references=labels), "f1":f1.compute(predictions=predictions, references=labels)}
    return metrics

In [42]:
trainer = Trainer(
     model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics
)
trainer.train()
trainer.save_model()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.599267,{'accuracy': 0.7111111111111111},{'f1': 0.0}
2,No log,0.589191,{'accuracy': 0.7111111111111111},{'f1': 0.0}
3,No log,0.675549,{'accuracy': 0.7111111111111111},{'f1': 0.0}
4,No log,0.785293,{'accuracy': 0.7111111111111111},{'f1': 0.0}
5,No log,0.984635,{'accuracy': 0.7333333333333333},{'f1': 0.25}


[34m[1mwandb[0m: Adding directory to artifact (./BERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-30)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-60)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-90)... Done. 2.2s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-120)... Done. 2.9s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-150)... Done. 1.9s


In [43]:
from numba import cuda
import gc
torch.cuda.empty_cache()
gc.collect()

0

## BERT Weighted

In [44]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
# iterate over each column and calculate the weight based on max value in column
#def calculate_class_weights(train_dataset, label_cols):
labels = list(label2id.values())
total_samples = len(df_train)
label_lst = []
class_weights = []
for label in labels:
    class_frequency = df_train['label'].value_counts()[label]
    weight = total_samples / (2 * class_frequency)
    if weight < 1:
        weight = 1
    class_weights.append(weight)
    label_lst.append(label)
        
weights = torch.tensor(class_weights, dtype=torch.float32)
print(weights)
print(id2label)

tensor([1.0000, 1.7439])
{0: '✓', 1: 'x'}


In [46]:
training_args = TrainingArguments(
    output_dir= "BERT_PSOE_Binary_Classifier_Filtered_Weighted",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=30,
    weight_decay=0.01,
    save_strategy = "epoch",
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16 = True,
    run_name="BERT_Weighted",
    save_total_limit=3
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [49]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=weights
        ).to("cuda")
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [50]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics
)
trainer.train()
trainer.save_model()

  trainer = CustomTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.631818,{'accuracy': 0.7111111111111111},{'f1': 0.0}
2,No log,0.627796,{'accuracy': 0.6888888888888889},{'f1': 0.06666666666666667}
3,No log,0.668799,{'accuracy': 0.7111111111111111},{'f1': 0.0}
4,No log,0.748914,{'accuracy': 0.7111111111111111},{'f1': 0.13333333333333333}
5,No log,0.842267,{'accuracy': 0.7444444444444445},{'f1': 0.41025641025641024}


[34m[1mwandb[0m: Adding directory to artifact (./BERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-30)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-60)... Done. 2.4s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-90)... Done. 2.0s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-120)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-150)... Done. 2.4s


In [51]:
torch.cuda.empty_cache()
gc.collect()

1427

## DogBERT Unweighted

In [52]:
# Load Model
model_dir = "/opt/jupyterlab/notebooks/DogBERT/Domain Adaptation/DogBERT v0.0.2"
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /opt/jupyterlab/notebooks/DogBERT/Domain Adaptation/DogBERT v0.0.2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
training_args = TrainingArguments(
    output_dir= "DogBERT_PSOE_Binary_Classifier_Filtered_Unweighted",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy = "epoch",
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16 = True,
    run_name="DogBERT_Unweighted",
    save_total_limit=3
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [54]:
trainer = Trainer(
     model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics
)
trainer.train()
trainer.save_model()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.573224,{'accuracy': 0.7111111111111111},{'f1': 0.0}
2,No log,0.54059,{'accuracy': 0.7},{'f1': 0.0}
3,No log,0.492523,{'accuracy': 0.7888888888888889},{'f1': 0.5128205128205128}
4,No log,0.479793,{'accuracy': 0.7777777777777778},{'f1': 0.5652173913043478}
5,No log,0.858597,{'accuracy': 0.7666666666666667},{'f1': 0.43243243243243246}
6,No log,0.832256,{'accuracy': 0.7777777777777778},{'f1': 0.5238095238095238}
7,No log,0.908021,{'accuracy': 0.7666666666666667},{'f1': 0.5531914893617021}


[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-23)... Done. 2.0s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-46)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-69)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-92)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-115)... Done. 2.1s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-138)... Done. 2.1s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Unweighted/checkpoint-161)... Done. 2.1s


In [55]:
torch.cuda.empty_cache()
gc.collect()

570

## DogBERT Weighted

In [56]:
# Load Model
model_dir = "/opt/jupyterlab/notebooks/DogBERT/Domain Adaptation/DogBERT v0.0.2"
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /opt/jupyterlab/notebooks/DogBERT/Domain Adaptation/DogBERT v0.0.2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
training_args = TrainingArguments(
    output_dir= "DogBERT_PSOE_Binary_Classifier_Filtered_Weighted",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    weight_decay=0.01,
    save_strategy = "epoch",
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16 = True,
    run_name="DogBERT_Weighted",
    save_total_limit=3
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [58]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics
)
trainer.train()
trainer.save_model()

  trainer = CustomTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.616304,{'accuracy': 0.6555555555555556},{'f1': 0.3404255319148936}
2,No log,0.535615,{'accuracy': 0.7222222222222222},{'f1': 0.24242424242424243}
3,No log,0.482646,{'accuracy': 0.7666666666666667},{'f1': 0.5116279069767442}
4,No log,0.499251,{'accuracy': 0.7888888888888889},{'f1': 0.5957446808510638}
5,No log,1.155488,{'accuracy': 0.7666666666666667},{'f1': 0.4}
6,No log,0.908467,{'accuracy': 0.8222222222222222},{'f1': 0.68}


[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-23)... Done. 2.1s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-46)... Done. 2.4s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-69)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-92)... Done. 2.4s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-115)... Done. 2.4s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-138)... Done. 3.3s


In [59]:
torch.cuda.empty_cache()
gc.collect()

1049

## PetBERT Unweighted

In [60]:
# Load PetBERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained('SAVSNET/PetBERT')

In [61]:
model = AutoModelForSequenceClassification.from_pretrained('SAVSNET/PetBERT', num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SAVSNET/PetBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
def tokenize_and_encode(examples):
    return tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=512
    )

In [63]:
tokenized_datasets = datasets.map(
    tokenize_and_encode, batched=True)

Map:   0%|          | 0/715 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/89 [00:00<?, ? examples/s]

In [64]:
training_args = TrainingArguments(
    output_dir= "PetBERT_PSOE_Binary_Classifier_Filtered_UnWeighted",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    save_strategy = "epoch",
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16 = True,
    run_name="PetBERT_UnWeighted",
    save_total_limit=3
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [65]:
trainer = Trainer(
     model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics
)
trainer.train()
trainer.save_model()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.578373,{'accuracy': 0.7111111111111111},{'f1': 0.0}
2,No log,0.567898,{'accuracy': 0.7},{'f1': 0.0}
3,No log,0.508651,{'accuracy': 0.7888888888888889},{'f1': 0.5365853658536586}
4,No log,0.469813,{'accuracy': 0.8222222222222222},{'f1': 0.6521739130434783}
5,No log,0.802042,{'accuracy': 0.8},{'f1': 0.5263157894736842}
6,No log,0.870804,{'accuracy': 0.8222222222222222},{'f1': 0.6190476190476191}
7,No log,1.192707,{'accuracy': 0.8222222222222222},{'f1': 0.6190476190476191}


[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_UnWeighted/checkpoint-23)... Done. 2.4s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_UnWeighted/checkpoint-46)... Done. 2.0s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_UnWeighted/checkpoint-69)... Done. 2.0s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_UnWeighted/checkpoint-92)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_UnWeighted/checkpoint-115)... Done. 2.0s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_UnWeighted/checkpoint-138)... Done. 2.5s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_UnWeighted/checkpoint-161)... Done. 2.1s


In [66]:
torch.cuda.empty_cache()
gc.collect()

570

## PetBERT Weighted

In [67]:
model = AutoModelForSequenceClassification.from_pretrained('SAVSNET/PetBERT', num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SAVSNET/PetBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
training_args = TrainingArguments(
    output_dir= "PetBERT_PSOE_Binary_Classifier_Filtered_Weighted",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    save_strategy = "epoch",
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16 = True,
    run_name="PetBERT_Weighted",
    save_total_limit=3
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [69]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics
)
trainer.train()
trainer.save_model()

  trainer = CustomTrainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.641967,{'accuracy': 0.6555555555555556},{'f1': 0.41509433962264153}
2,No log,0.567443,{'accuracy': 0.7555555555555555},{'f1': 0.42105263157894735}
3,No log,0.501858,{'accuracy': 0.8111111111111111},{'f1': 0.6046511627906976}
4,No log,0.476276,{'accuracy': 0.8222222222222222},{'f1': 0.68}
5,No log,0.763989,{'accuracy': 0.8222222222222222},{'f1': 0.6}
6,No log,0.811655,{'accuracy': 0.8111111111111111},{'f1': 0.6046511627906976}
7,No log,1.531485,{'accuracy': 0.8111111111111111},{'f1': 0.5641025641025641}


[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-23)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-46)... Done. 2.6s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-69)... Done. 2.4s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-92)... Done. 2.8s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-115)... Done. 2.4s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-138)... Done. 2.2s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_PSOE_Binary_Classifier_Filtered_Weighted/checkpoint-161)... Done. 2.4s


## Inference

In [70]:
def predict_sentiment(text):
  inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
  outputs = model(**inputs)
  predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
  predicted_class = torch.argmax(predictions).item()
  confidence_score = predictions.squeeze()[predicted_class].item()
  return predicted_class, confidence_score

In [71]:
tokenizer = AutoTokenizer.from_pretrained("BERT_PSOE_Binary_Classifier_Filtered_Unweighted")
model = AutoModelForSequenceClassification.from_pretrained("BERT_PSOE_Binary_Classifier_Filtered_Unweighted")

In [72]:
# Assuming your dataframe is called 'df' and the text column is 'text'
df_test["BERT_unweighted_predicted_pseudomonas_otitis"], df_test["BERT_unweighted_confidence_score"] = zip(*df_test["text"].apply(predict_sentiment))

In [73]:
tokenizer = AutoTokenizer.from_pretrained("BERT_PSOE_Binary_Classifier_Filtered_Weighted")
model = AutoModelForSequenceClassification.from_pretrained("BERT_PSOE_Binary_Classifier_Filtered_Weighted")

In [74]:
# Assuming your dataframe is called 'df' and the text column is 'text'
df_test["BERT_weighted_predicted_pseudomonas_otitis"], df_test["BERT_weighted_confidence_score"] = zip(*df_test["text"].apply(predict_sentiment))

In [75]:
tokenizer = AutoTokenizer.from_pretrained("DogBERT_PSOE_Binary_Classifier_Filtered_Unweighted")
model = AutoModelForSequenceClassification.from_pretrained("DogBERT_PSOE_Binary_Classifier_Filtered_Unweighted")

In [76]:
# Assuming your dataframe is called 'df' and the text column is 'text'
df_test["DogBERT_unweighted_predicted_pseudomonas_otitis"], df_test["DogBERT_unweighted_confidence_score"] = zip(*df_test["text"].apply(predict_sentiment))

In [77]:
tokenizer = AutoTokenizer.from_pretrained("DogBERT_PSOE_Binary_Classifier_Filtered_Weighted")
model = AutoModelForSequenceClassification.from_pretrained("DogBERT_PSOE_Binary_Classifier_Filtered_Weighted")

In [78]:
# Assuming your dataframe is called 'df' and the text column is 'text'
df_test["DogBERT_weighted_predicted_pseudomonas_otitis"], df_test["DogBERT_weighted_confidence_score"] = zip(*df_test["text"].apply(predict_sentiment))

In [79]:
tokenizer = AutoTokenizer.from_pretrained("PetBERT_PSOE_Binary_Classifier_Filtered_UnWeighted")
model = AutoModelForSequenceClassification.from_pretrained("PetBERT_PSOE_Binary_Classifier_Filtered_UnWeighted")

In [80]:
# Assuming your dataframe is called 'df' and the text column is 'text'
df_test["PetBERT_unweighted_predicted_pseudomonas_otitis"], df_test["PetBERT_unweighted_confidence_score"] = zip(*df_test["text"].apply(predict_sentiment))

In [81]:
tokenizer = AutoTokenizer.from_pretrained("PetBERT_PSOE_Binary_Classifier_Filtered_Weighted")
model = AutoModelForSequenceClassification.from_pretrained("PetBERT_PSOE_Binary_Classifier_Filtered_Weighted")

In [82]:
# Assuming your dataframe is called 'df' and the text column is 'text'
df_test["PetBERT_weighted_predicted_pseudomonas_otitis"], df_test["PetBERT_weighted_confidence_score"] = zip(*df_test["text"].apply(predict_sentiment))

In [83]:
df_test.to_csv('infernece.csv', index=False)

In [84]:
def CreateMetricDict(preds, labels):
    """
    Function to create a dictionary of ML metrics from the output of a multilabel model

    Args: list of predictions, list of ground truth labels
    """
    metric_dict = {"accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"], 
                   "precision":precision.compute(predictions=predictions, references=labels)["precision"], 
                   "recall":recall.compute(predictions=predictions, references=labels)["recall"], 
                   "f1":f1.compute(predictions=predictions, references=labels)["f1"]}


    return metric_dict

In [85]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

predictions = list(df_test['BERT_unweighted_predicted_pseudomonas_otitis'])
labels = list(df_test['label'])

bert_uw_metrics = CreateMetricDict(predictions, labels)

predictions = list(df_test['BERT_weighted_predicted_pseudomonas_otitis'])
bert_w_metrics = CreateMetricDict(predictions, labels)

predictions = list(df_test['DogBERT_unweighted_predicted_pseudomonas_otitis'])
dogbert_uw_metrics = CreateMetricDict(predictions, labels)

predictions = list(df_test['DogBERT_weighted_predicted_pseudomonas_otitis'])
dogbert_w_metrics = CreateMetricDict(predictions, labels)

predictions = list(df_test['PetBERT_unweighted_predicted_pseudomonas_otitis'])
petbert_uw_metrics = CreateMetricDict(predictions, labels)

predictions = list(df_test['PetBERT_weighted_predicted_pseudomonas_otitis'])
petbert_w_metrics = CreateMetricDict(predictions, labels)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [86]:
dicts = [bert_uw_metrics, bert_w_metrics, dogbert_uw_metrics, dogbert_w_metrics, petbert_uw_metrics, petbert_w_metrics]

metrics_dict = {'model':['BERT (unweighted loss)', 'BERT (weighted loss)','DogBERT (unweighted loss)', 'DogBERT (weighted loss)','PetBERT (unweighted loss)', 'PetBERT (weighted loss)'], 'accuracy': [],'precision':[], 'recall':[], 'f1': []}


for model in dicts:
    for metric in model.keys():
        metrics_dict[metric].append(model[metric])

In [87]:
print(metrics_dict)

{'model': ['BERT (unweighted loss)', 'BERT (weighted loss)', 'DogBERT (unweighted loss)', 'DogBERT (weighted loss)', 'PetBERT (unweighted loss)', 'PetBERT (weighted loss)'], 'accuracy': [0.7191011235955056, 0.6629213483146067, 0.8539325842696629, 0.8089887640449438, 0.7640449438202247, 0.7528089887640449], 'precision': [0.0, 0.14285714285714285, 0.7307692307692307, 0.7222222222222222, 0.5714285714285714, 0.5483870967741935], 'recall': [0.0, 0.04, 0.76, 0.52, 0.64, 0.68], 'f1': [0.0, 0.0625, 0.7450980392156863, 0.6046511627906976, 0.6037735849056604, 0.6071428571428571]}


In [88]:
metrics_df = pd.DataFrame(metrics_dict)
metrics_df.to_csv('Pseudomonas_Otitis_Unbalanced_Binary_Evaluation_Metrics.csv', index=False)

In [2]:
metrics_df = pd.read_csv('Pseudomonas_Otitis_Unbalanced_Binary_Evaluation_Metrics.csv', index_col=False)

In [3]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(metrics_df)

                       model  accuracy  precision  recall        f1
0     BERT (unweighted loss)  0.719101   0.000000    0.00  0.000000
1       BERT (weighted loss)  0.662921   0.142857    0.04  0.062500
2  DogBERT (unweighted loss)  0.853933   0.730769    0.76  0.745098
3    DogBERT (weighted loss)  0.808989   0.722222    0.52  0.604651
4  PetBERT (unweighted loss)  0.764045   0.571429    0.64  0.603774
5    PetBERT (weighted loss)  0.752809   0.548387    0.68  0.607143


In [5]:
print(metrics_df.to_latex(index=False,
                    formatters={"name": str.upper},
                    float_format="{:.2f}".format))

\begin{tabular}{lrrrr}
\toprule
model & accuracy & precision & recall & f1 \\
\midrule
BERT (unweighted loss) & 0.72 & 0.00 & 0.00 & 0.00 \\
BERT (weighted loss) & 0.66 & 0.14 & 0.04 & 0.06 \\
DogBERT (unweighted loss) & 0.85 & 0.73 & 0.76 & 0.75 \\
DogBERT (weighted loss) & 0.81 & 0.72 & 0.52 & 0.60 \\
PetBERT (unweighted loss) & 0.76 & 0.57 & 0.64 & 0.60 \\
PetBERT (weighted loss) & 0.75 & 0.55 & 0.68 & 0.61 \\
\bottomrule
\end{tabular}



## Try scaling the weights

In [100]:
# iterate over each column and calculate the weight based on max value in column
#def calculate_class_weights(train_dataset, label_cols):
labels = list(label2id.values())
total_samples = len(df_train)
label_lst = []
class_weights = []
for label in labels:
    class_frequency = df_train['label'].value_counts()[label]
    weight = total_samples / (2 * class_frequency)
    class_weights.append(weight)
    label_lst.append(label)
        
# Scale weights based on minimum class weight
min_class_weight = min(class_weights)
if min_class_weight < 1:
    class_weights = [x/min_class_weight for x in class_weights]
weights = torch.tensor(class_weights, dtype=torch.float32)
print(weights)

tensor([1.0000, 2.4878])


## DogBERT Scaled Weighted

In [96]:
# Load Model
model_dir = "/opt/jupyterlab/notebooks/DogBERT/Domain Adaptation/DogBERT v0.0.2"
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /opt/jupyterlab/notebooks/DogBERT/Domain Adaptation/DogBERT v0.0.2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [97]:
# Load BERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [98]:
tokenized_datasets = datasets.map(
    tokenize_and_encode, batched=True)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [99]:
training_args = TrainingArguments(
    output_dir= "DogBERT_PSOE_Binary_Classifier_Filtered_Weighted_Scaled",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    weight_decay=0.01,
    save_strategy = "epoch",
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16 = True,
    run_name="DogBERT_Weighted_Scaled",
    save_total_limit=3
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [101]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=weights
        ).to("cuda")
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [102]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics
)
trainer.train()
trainer.save_model()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.645493,{'accuracy': 0.6444444444444445},{'f1': 0.5294117647058824}
2,No log,0.547928,{'accuracy': 0.7888888888888889},{'f1': 0.6415094339622641}
3,No log,0.429582,{'accuracy': 0.8333333333333334},{'f1': 0.7169811320754716}
4,No log,0.470682,{'accuracy': 0.8111111111111111},{'f1': 0.711864406779661}
5,No log,0.608288,{'accuracy': 0.8111111111111111},{'f1': 0.6046511627906976}
6,No log,0.664527,{'accuracy': 0.8222222222222222},{'f1': 0.6666666666666666}


[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted_Scaled/checkpoint-23)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted_Scaled/checkpoint-46)... Done. 1.9s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted_Scaled/checkpoint-69)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted_Scaled/checkpoint-92)... Done. 2.2s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted_Scaled/checkpoint-115)... Done. 2.1s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_PSOE_Binary_Classifier_Filtered_Weighted_Scaled/checkpoint-138)... Done. 2.2s


In [103]:
tokenizer = AutoTokenizer.from_pretrained("DogBERT_PSOE_Binary_Classifier_Filtered_Weighted_Scaled")
model = AutoModelForSequenceClassification.from_pretrained("DogBERT_PSOE_Binary_Classifier_Filtered_Weighted_Scaled")

In [104]:
# Assuming your dataframe is called 'df' and the text column is 'text'
df_test["DogBERT_scaled_weighted_predicted_pseudomonas_otitis"], df_test["DogBERT__scaled_weighted_confidence_score"] = zip(*df_test["text"].apply(predict_sentiment))

In [106]:
df_test.to_csv('infernece.csv', index=False)

In [107]:
predictions = list(df_test['DogBERT_scaled_weighted_predicted_pseudomonas_otitis'])
labels = list(df_test['label'])

dogbert_sw_metrics = CreateMetricDict(predictions, labels)

In [108]:
print(dogbert_sw_metrics)

{'accuracy': 0.7865168539325843, 'precision': 0.6363636363636364, 'recall': 0.56, 'f1': 0.5957446808510638}


In [109]:
df_test_dogbert_weighted = df_test[['text', 'label', 'DogBERT_weighted_predicted_pseudomonas_otitis']]

In [117]:
# set the max columns to none
pd.set_option('display.max_columns', 3000)

In [118]:
print(df_test_dogbert_weighted)

                                                 text  label  \
0   "Axillae and interdigital skin look great. Not...      1   
1   "owner reports much better, no longer nibbling...      1   
2   "Aged dog, few conditions of concern: 1) bilat...      1   
3   "Re ex R ear. O reports run out of meds on sun...      1   
4   "REASON: poc, report results. Doing well, stil...      1   
..                                                ...    ...   
84  "Has recovered well from operation and good ap...      0   
85  "Ears improved - much less moist and less wax ...      0   
86  "SIJ Pseudomonas Otitis externa; Pododermatiti...      0   
87  "sudden onset smell from right ear, shaking he...      0   
88  "Bilateral severe OE, owner reports very letha...      0   

    DogBERT_weighted_predicted_pseudomonas_otitis  
0                                               1  
1                                               1  
2                                               0  
3                      

wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)
wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)
wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)
