## Imports

In [3]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset, load_metric
import datetime
import os
import time
import numpy as np
import evaluate
import accelerate
import torch

## Load Data

In [2]:
print(os.getcwd())

/opt/jupyterlab/notebooks/DogBERT/Classifiers/Pseudomonas_Otitis/Binary_Classifier/Balanced_Binary_Classifier


In [4]:
"""
Load Narrative Data
"""
os.chdir('../../../../../savsnet_resources/pickles')

In [5]:
# Load Narratives Dataframe
df_narratives = pd.read_pickle('narrative_pickle.pkl.gz', compression='gzip').drop_duplicates(subset='savsnet_consult_id', keep='first')

In [6]:
os.chdir('../../DogBERT/Classifiers/Pseudomonas_Otitis/Binary_Classifier/Balanced_Binary_Classifier')

In [7]:
"""
Load Classification Training Data
"""
df_class = pd.read_csv("adamwilliams-OtitisStudyPseudomonas_with_non_pseudomonas_records_(n_1398).csv")

In [8]:
print(df_class.head())

   Unnamed: 0            searchterm__name  savsnet_consult_id  \
0         0.0  Pseudomonas and ear/otitis               95721   
1         1.0  Pseudomonas and ear/otitis              129032   
2         2.0  Pseudomonas and ear/otitis              169684   
3         3.0  Pseudomonas and ear/otitis              117477   
4         4.0  Pseudomonas and ear/otitis              115552   

   narrative_item_id PseudomonasOtitis PseudomonasOtitis related_text species  
0          2040703.0                 ?                            NaN     dog  
1          2041630.0                 ✓                            NaN     dog  
2          2042546.0                 ✓                            NaN     dog  
3          2044364.0                 ✓                            NaN     dog  
4          2049428.0                 !                            NaN     dog  


In [9]:
# Filter for just dog records
df_ps_oe = df_class[df_class.species == 'dog']

In [10]:
print(df_ps_oe['PseudomonasOtitis'].value_counts())

PseudomonasOtitis
✓    638
x    413
?    143
!     74
⍉     39
Name: count, dtype: int64


In [12]:
print(413+143+74+39)


669


In [13]:
# Remove unclassified records
df_ps_oe = df_ps_oe.dropna(subset=['PseudomonasOtitis'])

"""
Create Dataset
"""
# Join in tick labels
df_dataset = pd.merge(df_narratives, df_ps_oe, on='savsnet_consult_id', how='inner')

In [14]:
print(max(df_dataset['consult_record_date']))

2024-01-23 09:05:39+00:00


In [15]:
print(df_dataset['PseudomonasOtitis'].value_counts())

PseudomonasOtitis
✓    638
x    413
?    143
!     74
⍉     39
Name: count, dtype: int64


In [16]:
print(638+413+143+74+39)

1307


In [17]:
# Change labels to make it a binary classifier
replacements = {'⍉':'x', '?':'x', '!':'x'}
df_dataset['PseudomonasOtitis'] = df_dataset['PseudomonasOtitis'].replace(replacements)

In [18]:
print(df_dataset['PseudomonasOtitis'].value_counts())

PseudomonasOtitis
x    669
✓    638
Name: count, dtype: int64


In [19]:
# Create ids that can be fed into the model
def tick_label_ids(symbol):
    if symbol == '✓':
        return 0
    elif symbol == 'x':
        return 1

In [20]:
# Label dataset   
df_dataset['PseudomonasOtitis_ID'] = df_dataset['PseudomonasOtitis'].apply(tick_label_ids)

In [21]:
print(df_dataset.head())

                                           item_text  consult_record       pk  \
0  "<<identifier>> otitis externa been really goo...          231003  2040703   
1  "HL issues and ears. been slowing up last few ...          231674  2041630   
2  "bilateral oe thickened ear canals cleaning is...          232326  2042546   
3  "left eear purulent otitis suspecrt pseudomona...          233568  2044364   
4  "Bilat OE again. Not purulent or ulcerated, so...          237069  2049428   

        consult_record_date  savsnet_consult_id  Unnamed: 0  \
0 2014-06-30 17:09:38+00:00               95721         0.0   
1 2014-07-26 11:24:00+00:00              129032         1.0   
2 2014-08-29 16:22:44+00:00              169684         2.0   
3 2014-07-16 19:29:10+00:00              117477         3.0   
4 2014-07-15 17:17:57+00:00              115552         4.0   

             searchterm__name  narrative_item_id PseudomonasOtitis  \
0  Pseudomonas and ear/otitis          2040703.0                

In [22]:
# Select get text and labels
df_dataset = df_dataset[['PseudomonasOtitis_ID', 'item_text']]

# Rename columns
df_dataset = df_dataset.rename(columns={'PseudomonasOtitis_ID':'label', 'item_text':'text'})
df_dataset.reset_index(drop=True)

Unnamed: 0,label,text
0,1,"""<<identifier>> otitis externa been really goo..."
1,0,"""HL issues and ears. been slowing up last few ..."
2,0,"""bilateral oe thickened ear canals cleaning is..."
3,0,"""left eear purulent otitis suspecrt pseudomona..."
4,1,"""Bilat OE again. Not purulent or ulcerated, so..."
...,...,...
1302,1,DUDE. No CSVD. O concerned about eating habits...
1303,1,"Recheck lameness LF. Was fine on carprieve, si..."
1304,1,"CE all fine, owner reports no issues, uses yum..."
1305,1,POC - 2 sutures removed. OR that <<brand name ...


In [23]:
# Create train/validate/test split
from sklearn.model_selection import train_test_split
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_dataset['text'], df_dataset['label'], test_size=0.1, random_state=42)
# Split training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42) 

In [24]:
print(len(X_train), len(X_test), len(X_val))

1058 131 118


In [25]:
df_train = pd.DataFrame({'text':X_train, 'label':y_train})
df_val = pd.DataFrame({'text':X_val, 'label':y_val})
df_test = pd.DataFrame({'text':X_test, 'label':y_test})

In [26]:
print(os.getcwd())

/opt/jupyterlab/notebooks/DogBERT/Classifiers/Pseudomonas_Otitis/Binary_Classifier/Balanced_Binary_Classifier


In [28]:
df_train.to_csv('train.csv')
df_val.to_csv('val.csv')
df_test.to_csv('test.csv')

In [29]:
datasets = load_dataset("csv", data_files={'train': "train.csv",
                                           'eval': "val.csv",
                                           'test': "test.csv"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [30]:
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 1058
    })
    eval: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 118
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 131
    })
})


## Tokenize Data

In [31]:
# Load BERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [32]:
def preprocess_function(examples):
  return tokenizer(examples["text"], padding="max_length", truncation=True)

# Preprocess training and testing data
tokenized_datasets = datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/1058 [00:00<?, ? examples/s]

Map:   0%|          | 0/118 [00:00<?, ? examples/s]

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

In [33]:
# Label to Id and Id to labels
id2label = {0: "✓", 1: "x"}
label2id = {"✓": 0, "x": 1}

## Train Binary BERT Classifier

In [34]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [36]:
# Setup weights and biases stuff
os.environ["WANDB_PROJECT"]="Pseudomonas_Otitis_Classifier"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

In [37]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [38]:
training_args = TrainingArguments(
    output_dir= "BERT_psoe_Classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy = "epoch",
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16 = True,
    run_name="BERT-base-uncased",
    save_total_limit=3
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [40]:
trainer = Trainer(
     model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback]
)
trainer.train()
trainer.save_model()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.448744
2,No log,0.592032
3,No log,0.783858
4,No log,0.774829


[34m[1mwandb[0m: Adding directory to artifact (./BERT_psoe_Classifier/checkpoint-45)... Done. 2.2s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_psoe_Classifier/checkpoint-90)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_psoe_Classifier/checkpoint-135)... Done. 2.0s
[34m[1mwandb[0m: Adding directory to artifact (./BERT_psoe_Classifier/checkpoint-180)... Done. 1.8s


In [41]:
# Clear GPUs
from numba import cuda
import gc
torch.cuda.empty_cache()
gc.collect()

0

## Create DogBERT Classifier

In [42]:
# Load DogBERT
model_dir = "/opt/jupyterlab/notebooks/DogBERT/Domain Adaptation/DogBERT v0.0.2"
model2 = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /opt/jupyterlab/notebooks/DogBERT/Domain Adaptation/DogBERT v0.0.2 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Setup weights and biases stuff
os.environ["WANDB_PROJECT"]="Pseudomonas_Otitis_Classifier"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

In [44]:
training_args = TrainingArguments(
    output_dir= "DogBERT_psoe_Classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy = "epoch",
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16 = True,
    run_name="DogBERT",
    save_total_limit=3
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [45]:
trainer = Trainer(
     model=model2,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback]
)
trainer.train()
trainer.save_model()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.383349
2,No log,0.172935
3,No log,0.163628
4,No log,0.434397
5,No log,0.615487
6,No log,1.019014


[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_psoe_Classifier/checkpoint-45)... Done. 2.0s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_psoe_Classifier/checkpoint-90)... Done. 2.2s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_psoe_Classifier/checkpoint-135)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_psoe_Classifier/checkpoint-180)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_psoe_Classifier/checkpoint-225)... Done. 1.8s
[34m[1mwandb[0m: Adding directory to artifact (./DogBERT_psoe_Classifier/checkpoint-270)... Done. 2.3s


In [46]:
# Clear GPUs
from numba import cuda
import gc
torch.cuda.empty_cache()
gc.collect()

570

## Create PetBERT Classifier

In [47]:
# Load PetBERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained('SAVSNET/PetBERT')

In [48]:
def preprocess_petbert_function(examples):
  return tokenizer(examples["text"], padding="max_length", truncation=True)

# Preprocess training and testing data
tokenized_datasets = datasets.map(preprocess_petbert_function, batched=True)

Map:   0%|          | 0/1058 [00:00<?, ? examples/s]

Map:   0%|          | 0/118 [00:00<?, ? examples/s]

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

In [49]:
model3 = AutoModelForSequenceClassification.from_pretrained('SAVSNET/PetBERT', num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at SAVSNET/PetBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [51]:
# Setup weights and biases stuff
os.environ["WANDB_PROJECT"]="Pseudomonas_Otitis_Classifier"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

In [52]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model3 = model3.to(device)

In [53]:
training_args = TrainingArguments(
    output_dir= "PetBERT_psoe_Classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy = "epoch",
    eval_strategy = "epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16 = True,
    run_name="PetBERT",
    save_total_limit=3
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [54]:
trainer = Trainer(
     model=model3,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback]
)
trainer.train()
trainer.save_model()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.357286
2,No log,0.220183
3,No log,0.233385
4,No log,0.348846
5,No log,0.426467


[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_psoe_Classifier/checkpoint-45)... Done. 1.9s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_psoe_Classifier/checkpoint-90)... Done. 2.1s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_psoe_Classifier/checkpoint-135)... Done. 2.3s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_psoe_Classifier/checkpoint-180)... Done. 2.2s
[34m[1mwandb[0m: Adding directory to artifact (./PetBERT_psoe_Classifier/checkpoint-225)... Done. 2.2s


## BERT Inference

In [55]:
tokenizer = AutoTokenizer.from_pretrained("BERT_psoe_Classifier")
model = AutoModelForSequenceClassification.from_pretrained("BERT_psoe_Classifier")

def predict_sentiment(text):
  inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
  outputs = model(**inputs)
  predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
  predicted_class = torch.argmax(predictions).item()
  confidence_score = predictions.squeeze()[predicted_class].item()
  return predicted_class, confidence_score

In [56]:
# Assuming your dataframe is called 'df' and the text column is 'text'
df_test["predicted_pseudomonas_otitis"], df_test["confidence_score"] = zip(*df_test["text"].apply(predict_sentiment))

In [57]:
print(df_test.head())

                                                   text  label  \
1164  Skin generally fairly good. Some crusting in a...      1   
1047                                                 ""      1   
793   "results show pseudomonas, ear still very pain...      0   
1057  "Booster. BAR, DUDE normal. Decllined KC, worm...      1   
722   "Consult - repeat pseudomonas swab. Owner been...      1   

      predicted_pseudomonas_otitis  confidence_score  
1164                             1          0.992300  
1047                             1          0.989301  
793                              0          0.932000  
1057                             1          0.991960  
722                              0          0.781996  


In [58]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [59]:
bert_metrics = clf_metrics.compute(predictions=list(df_test['predicted_pseudomonas_otitis']), references=list(df_test['label']))

In [60]:
print(bert_metrics)

{'accuracy': 0.8091603053435115, 'f1': 0.8466257668711656, 'precision': 0.8117647058823529, 'recall': 0.8846153846153846}


In [61]:
print(os.getcwd())

/opt/jupyterlab/notebooks/DogBERT/Classifiers/Pseudomonas_Otitis/Binary_Classifier/Balanced_Binary_Classifier


In [62]:
# Clear GPUs
from numba import cuda
import gc
torch.cuda.empty_cache()
gc.collect()

970

## Do Inference With DogBERT Classifier

In [63]:
tokenizer = AutoTokenizer.from_pretrained("DogBERT_psoe_Classifier")
model = AutoModelForSequenceClassification.from_pretrained("DogBERT_psoe_Classifier")

In [64]:
# Assuming your dataframe is called 'df' and the text column is 'text'
df_test["predicted_pseudomonas_otitis_dogbert"], df_test["confidence_score_dogbert"] = zip(*df_test["text"].apply(predict_sentiment))

In [65]:
dogbert_metrics = clf_metrics.compute(predictions=list(df_test['predicted_pseudomonas_otitis_dogbert']), references=list(df_test['label']))

In [66]:
print(dogbert_metrics)

{'accuracy': 0.8931297709923665, 'f1': 0.9078947368421053, 'precision': 0.9324324324324325, 'recall': 0.8846153846153846}


In [67]:
# Clear GPUs
from numba import cuda
import gc
torch.cuda.empty_cache()
gc.collect()

132

## Do Inference With PetBERT

In [68]:
tokenizer = AutoTokenizer.from_pretrained("PetBERT_psoe_Classifier")
model = AutoModelForSequenceClassification.from_pretrained("PetBERT_psoe_Classifier")

In [69]:
# Assuming your dataframe is called 'df' and the text column is 'text'
df_test["predicted_pseudomonas_otitis_petbert"], df_test["confidence_score_petbert"] = zip(*df_test["text"].apply(predict_sentiment))

In [70]:
petbert_metrics = clf_metrics.compute(predictions=list(df_test['predicted_pseudomonas_otitis_petbert']), references=list(df_test['label']))

In [71]:
print(petbert_metrics)

{'accuracy': 0.8625954198473282, 'f1': 0.8783783783783784, 'precision': 0.9285714285714286, 'recall': 0.8333333333333334}


In [72]:
# Clear GPUs
from numba import cuda
import gc
torch.cuda.empty_cache()
gc.collect()

132

## Put Metrics in DataFrame

In [73]:
dicts = [bert_metrics, dogbert_metrics, petbert_metrics]

metrics_dict = {'model':['BERT', 'DogBERT', 'PetBERT'], 'accuracy': [],'precision':[], 'recall':[], 'f1': []}


for model in dicts:
    print(model)
    for metric in model.keys():
        print(metric)
        metrics_dict[metric].append(model[metric])

{'accuracy': 0.8091603053435115, 'f1': 0.8466257668711656, 'precision': 0.8117647058823529, 'recall': 0.8846153846153846}
accuracy
f1
precision
recall
{'accuracy': 0.8931297709923665, 'f1': 0.9078947368421053, 'precision': 0.9324324324324325, 'recall': 0.8846153846153846}
accuracy
f1
precision
recall
{'accuracy': 0.8625954198473282, 'f1': 0.8783783783783784, 'precision': 0.9285714285714286, 'recall': 0.8333333333333334}
accuracy
f1
precision
recall


In [74]:
print(metrics_dict)

{'model': ['BERT', 'DogBERT', 'PetBERT'], 'accuracy': [0.8091603053435115, 0.8931297709923665, 0.8625954198473282], 'precision': [0.8117647058823529, 0.9324324324324325, 0.9285714285714286], 'recall': [0.8846153846153846, 0.8846153846153846, 0.8333333333333334], 'f1': [0.8466257668711656, 0.9078947368421053, 0.8783783783783784]}


In [75]:
metrics_df = pd.DataFrame(metrics_dict)

In [115]:
print(os.getcwd())

/opt/jupyterlab/notebooks/DogBERT/Classifiers/Pseudomonas_Otitis/All_Records


In [76]:
.to_csv('Pseudomonas_Otitis_Evaluation_Metrics.csv', index=False)

In [124]:
print((petbert_metrics['f1'] - bert_metrics['f1'])*100)

14.410967639088156


In [77]:
# Clear GPUs
from numba import cuda
import gc
torch.cuda.empty_cache()
gc.collect()

0

## Create Latex Table

In [4]:
metrics_df = pd.read_csv('Pseudomonas_Otitis_Evaluation_Metrics.csv', index_col=False)

In [6]:
print(metrics_df.to_latex(index=False,
                  formatters={"name": str.upper},
                  float_format="{:.2f}".format,))

\begin{tabular}{lrrrr}
\toprule
model & accuracy & precision & recall & f1 \\
\midrule
BERT & 0.81 & 0.81 & 0.88 & 0.85 \\
DogBERT & 0.89 & 0.93 & 0.88 & 0.91 \\
PetBERT & 0.86 & 0.93 & 0.83 & 0.88 \\
\bottomrule
\end{tabular}

