**Notebook to perform categorical trait classification based on the DistilBERT transformer model.**

The datasets are first split into a training and test dataset and classification and the textual descriptions are encoded using a WordPiece tokenizer. This input is then used in the DistilBERT model with a sequence classification head.

# Libraries & Functions

In [1]:
'''Math & Data Libraries'''
import numpy as np
import pandas as pd

In [2]:
'''ML Libraries'''

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
'''DL Libraries'''
import torch
from datasets import Dataset, DatasetDict
from tokenizers import BertWordPieceTokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizerFast, DistilBertTokenizerFast,  TrainingArguments, Trainer, AutoModelForSequenceClassification
from transformers import pipeline
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
from datasets import load_metric

In [4]:
''' Miscellaneous Libraries'''
from tqdm import tqdm

In [5]:
# Utilize GPU if available else CPU
if torch.cuda.is_available():  
    device = torch.device("cuda") 
    print('There are %d GPU(s) available.' % torch.cuda.device_count()) 
    print('We will use the GPU:', torch.cuda.get_device_name(0)) 
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060 Laptop GPU


In [23]:
email = "..." # Huggingface account email
username = "..." # Huggingface account username

In [24]:
!git config --global user.email email
!git config --global user.name username

# login into huggingface to be able to upload model to the huggingface model hub 
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (!"C:/Users/Admin/anaconda3/Library/mingw64/bin/git-credential-manager-core.exe",manager-core).
Your token has been saved to C:\Users\Admin\.huggingface\token
Login successful


In [8]:
model_name = "DistilBERT_PreTrained"
representation_name = "Embedding"

In [9]:
def calculate_scores(y_test, y_pred, average = "binary"):
    """
    Calculate the accuracy, precision, recall and F1-score of prediction. 
    ---
    Parameters
    ----------
    y_test : np.array
        true trait values of the test set descriptions. 
    y_pred : np.array
        predicted trait values for the test set descriptions. 
    average : one of "binary", "macro", "micro"
        how to average trait scores for the precision, recall and f1 scores. Default to "macro" for multi-class  
    Returns
    -------
    [accuracy, precision, recall, f1] : list
        List containing the accuracy, precision, recall and F1-score of prediction
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = average)
    recall = recall_score(y_test, y_pred, average = average)
    f1 = f1_score(y_test, y_pred, average = average)
    return [accuracy, precision, recall, f1]

In [10]:
def prepare_data(X, y):
  """
  Prepare data into required model input. 
  ---
  Parameters
  ----------
  X : np.array
      an array of textual descriptions. 
  y : np.array
      corresponding class values. 
  Returns
  -------
  df : pd.DataFrame
      pandas DataFrame containing the data where the descriptions are in the "text" column, the class values are in the "label" column 
      and an index "idx" representing the description index
  """
  data = []
  for i, (sequence, label) in enumerate(zip(X, y)):
    data.append(
        {"text":sequence,
        "label":label,
        "idx":i
        })
  df = pd.DataFrame(data)
  return Dataset.from_pandas(df)

def tokenize_function(example):
  """
  Function used to tokenize species' descriptions and further performs truncation on longer sequences to the maximum model size (512 for the DistilBERT model). 
  ---
  Parameters
  ----------
  example : str
      a textual description of a species. 
  Returns
  -------
  tokenized_example : dict()
      a dictionary consisting of the input_ids of the tokenized description, the token_type_ids which tell us how the input 
      is split into sentences, and the attention_mask which shows which tokens the model should focus on. 
      As we are not using padding in this part, but further on in the DataCollator, these are all set to 1 for now. 
  """
  return tokenizer(example["text"], truncation=True)

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint) # Instantiate WordPiece Tokenizer for DistilBERT
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Instantiate DataCollator with padding

In [11]:
trait_values_dict = {}
trait_values_dict["1.2.1"] = ["herb", "shrub", "tree"]
trait_values_dict["1.3.1"] = ["obligatory", "terrestrial"]
trait_values_dict["1.4.1"] = ["obligatory", "self-supporting"]
trait_values_dict["2.1.1"] = ["annual", "perennial"]
trait_values_dict["2.3.1"] = ["phanerophyte", "chamaephyte", "hemicryptophyte", "cryptophyte", "therophyte"]

trait_names_cat = ["Growth Form", "Epiphyte", "Climber", "Lifecycle", "Life Form"]
traits_cat = ["1.2.1", "1.3.1", "1.4.1", "2.1.1", "2.3.1"]

# Input Data

## Plants of the World Online (POWO) Dataset

In [12]:
df_POWO_GIFT = pd.read_excel("..//Data//Final Databases//POWO_GIFT.xlsx")

## Wikipedia Dataset

In [13]:
df_WIKI_GIFT = pd.read_excel("..//Data//Final Databases//WIKI_GIFT.xlsx")
df_WIKI_GIFT = df_WIKI_GIFT[~df_WIKI_GIFT["BERT_description"].isna()]

## POWO - Morphology General Habit

In [14]:
df_POWO_MGH_GIFT = pd.read_excel("..//Data//Final Databases//POWO_MGH_GIFT.xlsx").drop("Unnamed: 0", axis=1)

In [15]:
df_POWO_MGH_GIFT

Unnamed: 0,POWO_id,description,Growth Form,source,name,authors,i,ID,fqId,created,...,4.6.2_count,4.6.2,4.6.2_agreement,4.6.2_bias_by_reference,4.6.2_bias_by_derivation,4.7.2_count,4.7.2,4.7.2_agreement,4.7.2_bias_by_reference,4.7.2_bias_by_derivation
0,morphologyGeneralHabit,"&nbsp;Annual or short-lived perennial, erect t...",,Flora Zambesiaca Leguminosae subfamily Papilli...,Tephrosia longipes,Meisn.,151954,520704-1,urn:lsid:ipni.org:names:520704-1,,...,,,,,,,,,,
1,morphologyGeneralHabit,&nbsp;Erect to climbing shrub to 2 m high; bra...,Shrub,"Asparagaceae, Sebsebe Demissew. Flora of Tropi...",Asparagus scaberulus,A.Rich.,34764,531301-1,urn:lsid:ipni.org:names:531301-1,,...,,,,,,,,,,
2,morphologyGeneralHabit,(Annual? or) perennial herb with a stout verti...,Herb,"Amaranthaceae, C. C. Townsend. Flora Zambesiac...",Alternanthera nodiflora,R.Br.,7634,59266-1,urn:lsid:ipni.org:names:59266-1,,...,,,,,,,,,,
3,morphologyGeneralHabit,(Annual? or) perennial herb with a stout verti...,Herb,"Amaranthaceae, C.C. Townsend. Flora of Tropica...",Alternanthera nodiflora,R.Br.,7634,59266-1,urn:lsid:ipni.org:names:59266-1,,...,,,,,,,,,,
4,morphologyGeneralHabit,"? Perennial herb, probably with a trailing woo...",Herb,"Polygonaceae, R. A. Graham. Flora of Tropical ...",Oxygonum stuhlmannii,Dammer,273190,694850-1,urn:lsid:ipni.org:names:694850-1,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67972,morphologyGeneralHabit,"Woody-based perennial to slender shrub, 0.1–1....",,"M. Thulin. Flora of Somalia, Vol. 1–4 [updated...",Thesium hararensis,A.G.Mill.,313989,903247-1,urn:lsid:ipni.org:names:903247-1,,...,,,,,,,,,,
67973,morphologyGeneralHabit,Woody-based Thesium-like herb with several ere...,,"Rubiaceae, B. Verdcourt. Flora Zambesiaca 5:1....",Manostachya staelioides,(K.Schum.) Bremek.,302653,755812-1,urn:lsid:ipni.org:names:755812-1,,...,,,,,,,,,,
67974,morphologyGeneralHabit,"Woody-stemmed herb, 4 ft. high",,"Papilionaceae, Hutchinson and Dalziel. Flora o...",Indigofera megacephala,J.B.Gillett,143886,499640-1,urn:lsid:ipni.org:names:499640-1,,...,,,,,,,,,,
67975,morphologyGeneralHabit,Young branches and leaves rusty; leaves slight...,,"Apocynaceae, E.A. Omino. Flora of Tropical Eas...",Beaumontia grandiflora,Wall.,19449,77539-1,urn:lsid:ipni.org:names:77539-1,,...,1.0,15.0,,0.0,0.0,1.0,8.0,,0.0,0.0


# Text Representation - BERT

In [16]:
df_names = ["POWO", "WIKI", "POWO_MGH"]
df_list = [df_POWO_GIFT, df_WIKI_GIFT, df_POWO_MGH_GIFT]

corpus = dict()
for df_name, df in zip(df_names, df_list): 
    corpus[df_name] = df["BERT_description"].values

## Split Data

In [18]:
X_train = {}
X_test = {}

y_train = {}
y_test = {}

for df_name, df in zip(df_names, df_list): 
  for focus_name, focus_code in zip(trait_names_cat, traits_cat):
      
    trait_mask = df[focus_code].notna()
    y = {trait: 1*df[trait_mask][focus_code].apply(lambda x: trait == x) for trait in trait_values_dict[focus_code]}

    X_train[df_name, focus_name], X_test[df_name, focus_name], \
    indices_train, indices_test \
    = train_test_split(corpus[df_name][trait_mask],
                      np.arange(sum(trait_mask)), test_size=0.25, random_state=42) # We split the dataset to a training set of 75% and test set of 25%

    y_train[df_name, focus_name, focus_name] = np.zeros(len(X_train[df_name, focus_name]), dtype=int)
    y_test[df_name, focus_name, focus_name] = np.zeros(len(X_test[df_name, focus_name]), dtype=int)

    # On top of the original split we also create binary datasets for each trait value which we will use to evaluate class performance
    for i, trait_value in enumerate(trait_values_dict[focus_code]):
      y_train[df_name, focus_name, trait_value] = y[trait_value].values[indices_train]
      y_test[df_name, focus_name, trait_value] = y[trait_value].values[indices_test]

      y_train[df_name, focus_name, focus_name] += y_train[df_name, focus_name, trait_value] * i
      y_test[df_name, focus_name, focus_name] += y_test[df_name, focus_name, trait_value] * i

## DistilBERT Model Training

In [20]:
tmp_tmp_list = []
for df_name, df in zip(df_names, df_list): 
  for focus_name, focus_code in zip(trait_names_cat, traits_cat):
    print("Dataset:", df_name)
    print("\tTrait:", focus_name)

    # The data is first prepared to standardize columns 
    plant_dataset_finetune = DatasetDict() 
    plant_dataset_finetune["train"] = prepare_data(X_train[df_name, focus_name], y_train[df_name, focus_name, focus_name])
    plant_dataset_finetune["validation"] = prepare_data(X_test[df_name, focus_name], y_test[df_name, focus_name, focus_name])

    # The prepared data is then tokenized in batches
    tokenized_datasets = plant_dataset_finetune.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(["text", "idx"])
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    tokenized_datasets.set_format("torch")
    tokenized_datasets["train"].column_names

    # We then input the tokenized data into batches of size 16 (feel free to use batches larger than 8 if your GPU allows for it). We also use a data collator in this step to pad the data based on the batch maximum sequence length 
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"], batch_size=16, collate_fn=data_collator
    )
    
    model_finetune = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(set(y_train[df_name, focus_name, focus_name])))

    focus_name_model = focus_name.replace(" ", "_")
    
    # The model is trained on 3 epochs with a 16bit floating precision, a learning rate of 2e-5 and a weight decay of 0.01
    training_args = TrainingArguments(
        output_dir = f"DistilBERT-{df_name}_{focus_name_model}_Finetuned",
        learning_rate = 2e-5, 
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        num_train_epochs = 3,
        weight_decay = 0.01,
        overwrite_output_dir = True,
        evaluation_strategy = "epoch",
        push_to_hub = True,
        fp16 = True
    )
    
    # Model Fine-Tuning
    trainer = Trainer(
        model = model_finetune,
        args = training_args,
        train_dataset = tokenized_datasets["train"],
        eval_dataset = tokenized_datasets["validation"],
        tokenizer = tokenizer,
        data_collator = data_collator
    )
    trainer.train()

    # Push the fine-tuned LLM to huggingface model hub 
    trainer.push_to_hub()

## DistilBERT Model Evaluation

In [21]:
def evaluate_transformer(model_finetune, eval_dataloader, device, y_test, df_name_train, df_name_test, focus_name, focus_code):
  """
  Evaluate transformer performance. 
  ---
  Parameters
  ----------
  model_finetune : transformers.AutoModelForSequenceClassification()
      the transformer model fine-tuned on the categorical trait data 
  eval_dataloader : torch.utils.data.DataLoader()
      the dataloader for the test set
  device : "cuda" or "cpu"
      the device the model and data are on
  y_test: np.array
      the test set consisting of the true trait values
  df_name_train: "POWO" or "WIKI
      name of the training dataset 
  df_name_test: "POWO" or "WIKI
      name of the test dataset. This is different than df_name_train in the case where we use a inter-dataset evaluation approach
  focus_name: str
      the name of the trait of interest
  focus_code: str
      the GIFT code of the trait of interest 
  Returns
  -------
  res_list : list
      List containing model performance
  """
  res_list = []

  metric = load_metric("accuracy")

  logit_list = []
  prediction_list = []
  model_finetune.eval() # We set the model to evaluation mode

  for batch in eval_dataloader: # For each batch we make predictions of the trait value
      batch = {k: v.to(device) for k, v in batch.items()}
      with torch.no_grad():
          outputs = model_finetune(**batch)

      logits = outputs.logits
      predictions = torch.argmax(logits, dim=-1)
      logit_list.append(logits.cpu().detach().numpy())
      prediction_list.append(predictions.cpu().detach().numpy())
      metric.add_batch(predictions=predictions, references=batch["labels"])

  print(metric.compute())
  y_predict = np.array([item for sublist in prediction_list for item in sublist])
 
  # We evaluate model performance using the true test data and predicted test data 
  acc_gift = accuracy_score(y_test, y_predict)
  prec_gift = precision_score(y_test, y_predict, average = "macro")
  rec_gift = recall_score(y_test, y_predict, average = "macro")
  f1_gift = f1_score(y_test, y_predict, average = "macro")

  results = [acc_gift, prec_gift, rec_gift, f1_gift]
  res_list.append([df_name_train, df_name_test, focus_name, focus_name] + results + [model_name, representation_name])

  # Class specific performance
  for i, trait_value in enumerate(trait_values_dict[focus_code]):
    print(trait_value, i)

    y_predict_class = np.zeros(y_predict.shape)
    y_predict_class[y_predict==i] = 1

    y_test_class = np.zeros(y_predict.shape)
    y_test_class[y_test==i] = 1

    acc_gift = accuracy_score(y_test_class, y_predict_class)
    prec_gift = precision_score(y_test_class, y_predict_class)
    rec_gift = recall_score(y_test_class, y_predict_class)
    f1_gift = f1_score(y_test_class, y_predict_class)

    results = [acc_gift, prec_gift, rec_gift, f1_gift]

    res_list.append([df_name_train, df_name_test, focus_name, trait_value] + results + [model_name, representation_name])

  return res_list

In [29]:
tmp_tmp_list = []
for focus_name, focus_code in zip(trait_names_cat, traits_cat):
  print("Trait:", focus_name)
  for df_name_train, df_train in zip(df_names[:], df_list[:]):
    print("\tTrain Dataset:", df_name_train)

    focus_name_model = focus_name.replace(" ", "_")
    checkpoint = f"{username}/DistilBERT-{df_name}_{focus_name_model}_Finetuned" 
    print(checkpoint)
    
    # Download model for corresponding dataset+trait combination
    model_finetune = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(trait_values_dict[focus_code]))
    model_finetune.to(device)

    for df_name_test, df_test in zip(df_names[:], df_list[:]):
      print("\t\tTest Dataset:", df_name_test)
      # Prepare dataset for evaluation
      plant_dataset_finetune = DatasetDict()
      plant_dataset_finetune["train"] = prepare_data(X_train[df_name_train, focus_name], y_train[df_name_train, focus_name, focus_name])
      plant_dataset_finetune["validation"] = prepare_data(X_test[df_name_test, focus_name], y_test[df_name_test, focus_name, focus_name])

      tokenized_datasets = plant_dataset_finetune.map(tokenize_function, batched=True)

      tokenized_datasets = tokenized_datasets.remove_columns(["text", "idx"])
      tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
      tokenized_datasets.set_format("torch")

      # Evaluate model performance
      eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator)
      tmp_list = evaluate_transformer(model_finetune, eval_dataloader, device, y_test[df_name_test, focus_name, focus_name], df_name_train, df_name_test, focus_name, focus_code)
      tmp_tmp_list.append(tmp_list)


In [None]:
from itertools import chain
unfolded = list(chain.from_iterable(i if isinstance(i, list) else [i] for i in tmp_tmp_list))
df_results = pd.DataFrame(unfolded, columns=["Train Dataset", "Test Dataset", "Trait", "Trait Value", "Accuracy", "Precision", "Recall", "F1-Score", "Model", "Representation"])
df_results

Unnamed: 0,Train Dataset,Test Dataset,Trait,Trait Value,Accuracy,Precision,Recall,F1-Score,Model,Representation
0,POWO,POWO,Life Form,Life Form,0.863364,0.834952,0.807776,0.819715,DistilBERT_PreTrained,Embedding
1,POWO,POWO,Life Form,phanerophyte,0.966542,0.945587,0.936268,0.940905,DistilBERT_PreTrained,Embedding
2,POWO,POWO,Life Form,chamaephyte,0.945607,0.700258,0.607623,0.65066,DistilBERT_PreTrained,Embedding
3,POWO,POWO,Life Form,hemicryptophyte,0.916636,0.845753,0.915398,0.879198,DistilBERT_PreTrained,Embedding
4,POWO,POWO,Life Form,cryptophyte,0.953645,0.829077,0.723842,0.772894,DistilBERT_PreTrained,Embedding
5,POWO,POWO,Life Form,therophyte,0.944299,0.854086,0.85575,0.854917,DistilBERT_PreTrained,Embedding
6,POWO,WIKI,Life Form,Life Form,0.64069,0.58389,0.550413,0.545734,DistilBERT_PreTrained,Embedding
7,POWO,WIKI,Life Form,phanerophyte,0.866975,0.910822,0.73514,0.813605,DistilBERT_PreTrained,Embedding
8,POWO,WIKI,Life Form,chamaephyte,0.910093,0.23913,0.124153,0.163447,DistilBERT_PreTrained,Embedding
9,POWO,WIKI,Life Form,hemicryptophyte,0.808368,0.428442,0.453935,0.44082,DistilBERT_PreTrained,Embedding


### Save Results

In [30]:
df_results.to_excel("..//Data//Results//DistilBERT_Results.xlsx", index=False)

## DistilBERT Model Probabilistic Evaluation 

In [32]:
from scipy.special import softmax, expit

def evaluate_transformer_proba(model_finetune, eval_dataloader, device, y_test):
  """
  Calculate transformer predicted probabilities. 
  ---
  Parameters
  ----------
  model_finetune : transformers.AutoModelForSequenceClassification()
      the transformer model fine-tuned on the categorical trait data 
  eval_dataloader : torch.utils.data.DataLoader()
      the dataloader for the test set
  device : "cuda" or "cpu"
      the device the model and data are on
  y_test: np.array
      the test set consisting of the true trait values
  Returns
  -------
  transformer_proba_dict : dict()
      dictionary containing "y_true" which contain the true labels of the test set and "y_pred_sigmoid" which contain the 
      predicted probabilities of the transformer upon using a sigmoid transformation to conver the logits into a 0-1 range 
  """
  tmp_list = []
  transformer_proba_dict = dict()

  metric = load_metric("accuracy")

  logit_list = []
  prediction_list = []
  model_finetune.eval()

  for batch in eval_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      with torch.no_grad():
          outputs = model_finetune(**batch)

      logits = outputs.logits
      predictions = torch.argmax(logits, dim=-1)
      logit_list.append(logits.cpu().detach().numpy())
      prediction_list.append(predictions.cpu().detach().numpy())
      metric.add_batch(predictions=predictions, references=batch["labels"])

  print(metric.compute())
  y_predict = np.array([item for sublist in prediction_list for item in sublist])
  y_predict_logit = np.array([item for sublist in logit_list for item in sublist])
  y_predict_sigmoid = expit(y_predict_logit)
  
  transformer_proba_dict["y_pred_sigmoid"] = y_predict_sigmoid
  transformer_proba_dict["y_true"] = y_test

  return transformer_proba_dict

In [33]:
prediction_probabilities_dict = {}
for focus_name, focus_code in zip(trait_names_cat, traits_cat):
  print("Trait:", focus_name)
  for df_name, df in zip(df_names[:], df_list[:]):
    print("\tDataset:", df_name)

    focus_name_model = focus_name.replace(" ", "_")
    checkpoint = f"{username}/DistilBERT-{df_name}_{focus_name_model}_Finetuned"
    print(checkpoint)
    
    # Download model for corresponding dataset+trait combination
    model_finetune = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(trait_values_dict[focus_code]))
    model_finetune.to(device)

    plant_dataset_finetune = DatasetDict()
    plant_dataset_finetune["train"] = prepare_data(X_train[df_name, focus_name], y_train[df_name, focus_name, focus_name])
    plant_dataset_finetune["validation"] = prepare_data(X_test[df_name, focus_name], y_test[df_name, focus_name, focus_name])

    # Prepare dataset for evaluation
    tokenized_datasets = plant_dataset_finetune.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(["text", "idx"])
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    tokenized_datasets.set_format("torch")

    # Predict test set probabilities 
    eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator)
    prediction_probabilities_dict[df_name, df_name, focus_name] = evaluate_transformer_proba(model_finetune, eval_dataloader, device, y_test[df_name, focus_name, focus_name], df_name, df_name, focus_name, focus_code)

### Save Results

In [34]:
import pickle

f = open("..//Data//Results//Probabilistic_Predictions.pkl","wb")
pickle.dump(prediction_probabilities_dict,f)
f.close()