# Libraries & Functions

In [1]:
'''Math & Data Libraries'''
import numpy as np
import pandas as pd

In [2]:
'''ML Libraries'''
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.special import softmax, expit

In [3]:
'''DL Libraries'''
import torch
from datasets import Dataset, DatasetDict
from tokenizers import BertWordPieceTokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizerFast, DistilBertTokenizerFast,  TrainingArguments, Trainer, AutoModelForSequenceClassification
from transformers import pipeline

In [4]:
''' Miscellaneous Libraries'''
from tqdm import tqdm

In [5]:
# Pick the compute device once: CUDA when PyTorch reports a usable GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060 Laptop GPU


In [6]:
def calculate_scores(y_test, y_pred, average = "binary"):
    """Return ``[accuracy, precision, recall, f1]`` for predictions ``y_pred`` against ``y_test``.

    ``average`` is forwarded to precision, recall and F1; accuracy is computed without it.
    """
    averaged_metrics = (precision_score, recall_score, f1_score)
    scores = [accuracy_score(y_test, y_pred)]
    for metric in averaged_metrics:
        scores.append(metric(y_test, y_pred, average = average))
    return scores

In [7]:
from torch.utils.data import DataLoader
import torch
from transformers import AdamW, get_scheduler
from datasets import load_metric
from tqdm.auto import tqdm

def prepare_data(X, y):
    """Build a ``datasets.Dataset`` with columns ``text``, ``label`` and ``idx``.

    ``X`` and ``y`` are iterated in parallel (stopping at the shorter one); ``idx`` is the
    running position of each pair.
    """
    records = [
        {"text": sequence, "label": label, "idx": position}
        for position, (sequence, label) in enumerate(zip(X, y))
    ]
    return Dataset.from_pandas(pd.DataFrame(records))

def tokenize_function(example):
    """Tokenize ``example["text"]`` with the module-level ``tokenizer``, truncating long inputs."""
    texts = example["text"]
    return tokenizer(texts, truncation=True)

# Base checkpoint whose tokenizer is used for every categorical model in this notebook.
checkpoint = "distilbert-base-uncased"
# Read as a module-level global by tokenize_function.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Passed as collate_fn to the evaluation DataLoader so each batch is padded when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
def make_categorical_predictions(df_sample, description_column = "description", model_train = "DistilBERT", dataset_train = "POWO", activation_function = "expit", traits_of_interest = ["Growth Form"]):
    """Add categorical trait predictions for every requested trait to ``df_sample``.

    For each name in ``trait_names_cat`` that is also listed in ``traits_of_interest`` the
    checkpoint ``ViktorDo/{model_train}-{dataset_train}_{trait}_Finetuned`` (spaces in the trait
    name replaced by underscores) is loaded and run over the descriptions. Results are written
    to new columns:

    * ``Prediction_{trait}``: the trait value with the highest logit.
    * ``{trait}_{value}_predict_proba``: one column per trait value, only when
      ``activation_function`` is ``"expit"`` or ``"softmax"``.

    Parameters
    ----------
    df_sample : pandas.DataFrame
        Table holding the descriptions. It is modified in place and also returned.
    description_column : str
        Column of ``df_sample`` containing the description text.
    model_train, dataset_train : str
        Components of the checkpoint name.
    activation_function : str
        ``"expit"`` (independent sigmoid per value) or ``"softmax"`` (values sum to one).
        Any other value prints a message and writes only the ``Prediction_`` column.
    traits_of_interest : list of str
        Trait names to predict; they must match entries of ``trait_names_cat``.

    Returns
    -------
    pandas.DataFrame
        ``df_sample`` with the prediction columns added.
    """
    # Requested names that match nothing used to be ignored silently; report them instead.
    unknown_traits = [name for name in traits_of_interest if name not in trait_names_cat]
    if unknown_traits:
        print("Unknown categorical trait(s), skipped:", unknown_traits)

    # Skip non-requested traits instead of aborting the loop at the first one.
    selected_traits = [
        (name, code) for name, code in zip(trait_names_cat, traits_cat)
        if name in traits_of_interest
    ]
    if not selected_traits:
        return df_sample

    # The tokenised input does not depend on the trait, so build it once.
    X_test = df_sample[description_column].values
    y_test = np.zeros(X_test.shape[0], dtype = int)  # placeholder labels required by prepare_data

    plant_dataset_finetune = DatasetDict()
    plant_dataset_finetune["validation"] = prepare_data(X_test, y_test)
    # NOTE(review): tokenize_function always uses the module-level tokenizer, whatever
    # model_train is -- confirm this matches the fine-tuned checkpoints.
    tokenized_datasets = plant_dataset_finetune.map(tokenize_function, batched=True)
    # Drop the placeholder labels as well: inference only needs logits, and feeding dummy
    # labels would make the model compute a meaningless loss.
    tokenized_datasets = tokenized_datasets.remove_columns(["text", "idx", "label"])
    tokenized_datasets.set_format("torch")

    eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator)

    for focus_name, focus_code in selected_traits:
        print("Trait:", focus_name)
        trait_values = trait_values_dict[focus_code]

        focus_name_model = focus_name.replace(" ", "_")
        checkpoint = f"ViktorDo/{model_train}-{dataset_train}_{focus_name_model}_Finetuned"
        print(checkpoint)

        model_finetune = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(trait_values))
        model_finetune.to(device)
        model_finetune.eval()

        logit_batches = []
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model_finetune(**batch)
            logit_batches.append(outputs.logits.cpu().numpy())

        y_predict_logit = np.concatenate(logit_batches, axis = 0)
        y_predict = y_predict_logit.argmax(axis = -1)

        # Class index i corresponds to the i-th entry of the trait's value list.
        df_sample["Prediction_"+focus_name] = np.array(trait_values)[y_predict]

        proba_columns = [focus_name + "_" + trait_value + "_predict_proba" for trait_value in trait_values]
        if(activation_function == "expit"):
            df_sample[proba_columns] = expit(y_predict_logit)
        elif(activation_function == "softmax"):
            df_sample[proba_columns] = softmax(y_predict_logit, axis = 1)
        else:
            print("No such activation function, output probabilities not returned")
    return df_sample

In [26]:
# Allowed values per categorical trait code. The list order is the class index that
# make_categorical_predictions uses to turn a predicted index back into a value.
trait_values_dict = {
    "1.2.1": ["herb", "shrub", "tree"],
    "1.3.1": ["obligatory", "terrestrial"],
    "1.4.1": ["obligatory", "self-supporting"],
    "2.1.1": ["annual", "perennial"],
    "2.3.1": ["phanerophyte", "chamaephyte", "hemicryptophyte", "cryptophyte", "therophyte"],
}

# Parallel lists: trait_names_cat[i] is the name of the trait with code traits_cat[i].
# NOTE(review): these names differ from the ones requested elsewhere in this notebook
# ("Growth Form", "Epiphyte"), so those requests match nothing here -- confirm which
# spelling the published checkpoints use.
trait_names_cat = ["Growth_form_1", "Epiphyte_1", "Climber_1", "Lifecycle_1", "Life_form_1"]
traits_cat = ["1.2.1", "1.3.1", "1.4.1", "2.1.1", "2.3.1"]

# Same pairing for the numerical traits.
trait_names_num = ["Plant Height Max", "Leaf Length Max", "Leaf Width Max"]
traits_num = ["1.6.2", "4.6.2", "4.7.2"]

# Input Sample Data

In [21]:
# Load the example species table; the path is relative to the notebook's working directory.
df_sample = pd.read_excel("../Data/Sample Usage/Azores_Small_Dataset.xlsx")

In [11]:
df_sample

Unnamed: 0,#,family_name,species_name,species_name_sur,synonym,description
0,494,Asteraceae,Leontodon rigens (Dryand. in Ait.) Paiva & Orm...,São Miguel-Löwenzahn / São Miguel Hawkbit / Li...,Syn.: Microderis rigens (Dryand. in Ait.) DC.,"20-50 cm. St. erect, much-branched above, wood..."
1,558,Juncaceae,Luzula multiflora (Retz.) Lej.,Vielblütige Hainsimse / Heath Wood-rush / Luzu...,,"20-40 cm. Caespitose herb with erect, cylindri..."
2,241,Euphorbiaceae,Euphorbia serpens Kunth,Schlangen-Wolfsmilch / Matted Sandmat / Euphor...,Syn.: Chamaesyce serpens (Kunth) Small,"Up to 20 cm long. St. prostrate, creeping and ..."
3,434,Dipsacaceae,Scabiosa nitens Roem. & Schult. [Escabiosa-dos...,Azoren-Skabiose /Azores Scabious /Scabieuse de...,,"20-40 cm. St. erect, glabrous, woody at base. ..."
4,501,Nymphaeaceae,"Nymphaea alba L. [Golfão-branco, Boleira-branc...","Weisse Teichrose/ European white water lily, W...",,"Up to 2.5 m. Aquatic, rhizomatous herb. Lvs. o..."
5,294,Onagraceae,Fuchsia magellanica Lam. [Brincos-de-princesa],"Magellans Fuchsie / Hummingbird Fuchsia, Hardy...",,"Up to 4 m. Shrub with slender, ± scrambling st..."
6,135,Papaveraceae,"Papaver rhoeas L. [Papoula-vermelha, Papoula-o...","Klatsch-Mohn / Common Poppy, Corn Poppy / Coqu...",,"20-60 cm. St. erect, ± branched, with white la..."
7,490,Asteraceae,"Lactuca watsoniana Trel. [Alfacinha, Alface-do...","Azoren-Lattich, Watsons Lattich/ Watson's Lett...",,"Up to 2 m. St. erect, woody at base, much-bran..."
8,333,Oleaceae,Picconia azorica (Tutin) Knobl. [Pau-branco],Azoren-Picconie/ Azores White Wood/Picconia de...,,"Up to 8 m. Evergreen tree with smooth, pale ba..."
9,138,Tropaeolaceae,"Tropaeolum majus L. [Chagas, Chagueira, Mastru...","Grosse Kapuzinerkresse / Garden Nasturtium, In...",,30-300 cm. Fleshy vine with twining petioles. ...


# Categorical Trait Predictions

In [12]:
# Traits to predict; make_categorical_predictions matches these names against trait_names_cat.
traits_of_interest = ["Growth Form", "Epiphyte"]
dataset_train = "POWO" # Dataset used in training the model (part of the checkpoint name)
model_train = "DistilBERT" # Model used in training (part of the checkpoint name)
description_column = "description" # Name of the column containing the description text
# Nonlinear transformation applied to the output logits. Use "expit" (sigmoid) to allow
# several trait values to be active at once (e.g. for Growth Form), or "softmax" to make
# the predicted probabilities sum to one.
activation_function = "softmax"

df_sample = make_categorical_predictions(df_sample, description_column = description_column, model_train = model_train, dataset_train = dataset_train, activation_function = activation_function, traits_of_interest = traits_of_interest)

In [13]:
df_sample

Unnamed: 0,#,family_name,species_name,species_name_sur,synonym,description
0,494,Asteraceae,Leontodon rigens (Dryand. in Ait.) Paiva & Orm...,São Miguel-Löwenzahn / São Miguel Hawkbit / Li...,Syn.: Microderis rigens (Dryand. in Ait.) DC.,"20-50 cm. St. erect, much-branched above, wood..."
1,558,Juncaceae,Luzula multiflora (Retz.) Lej.,Vielblütige Hainsimse / Heath Wood-rush / Luzu...,,"20-40 cm. Caespitose herb with erect, cylindri..."
2,241,Euphorbiaceae,Euphorbia serpens Kunth,Schlangen-Wolfsmilch / Matted Sandmat / Euphor...,Syn.: Chamaesyce serpens (Kunth) Small,"Up to 20 cm long. St. prostrate, creeping and ..."
3,434,Dipsacaceae,Scabiosa nitens Roem. & Schult. [Escabiosa-dos...,Azoren-Skabiose /Azores Scabious /Scabieuse de...,,"20-40 cm. St. erect, glabrous, woody at base. ..."
4,501,Nymphaeaceae,"Nymphaea alba L. [Golfão-branco, Boleira-branc...","Weisse Teichrose/ European white water lily, W...",,"Up to 2.5 m. Aquatic, rhizomatous herb. Lvs. o..."
5,294,Onagraceae,Fuchsia magellanica Lam. [Brincos-de-princesa],"Magellans Fuchsie / Hummingbird Fuchsia, Hardy...",,"Up to 4 m. Shrub with slender, ± scrambling st..."
6,135,Papaveraceae,"Papaver rhoeas L. [Papoula-vermelha, Papoula-o...","Klatsch-Mohn / Common Poppy, Corn Poppy / Coqu...",,"20-60 cm. St. erect, ± branched, with white la..."
7,490,Asteraceae,"Lactuca watsoniana Trel. [Alfacinha, Alface-do...","Azoren-Lattich, Watsons Lattich/ Watson's Lett...",,"Up to 2 m. St. erect, woody at base, much-bran..."
8,333,Oleaceae,Picconia azorica (Tutin) Knobl. [Pau-branco],Azoren-Picconie/ Azores White Wood/Picconia de...,,"Up to 8 m. Evergreen tree with smooth, pale ba..."
9,138,Tropaeolaceae,"Tropaeolum majus L. [Chagas, Chagueira, Mastru...","Grosse Kapuzinerkresse / Garden Nasturtium, In...",,30-300 cm. Fleshy vine with twining petioles. ...


# Numerical Trait Predictions

In [14]:
import re, string

# Millimetres per unit, for every unit token the post-processors recognise.
_UNIT_TO_MM = {
    'mm': 1, 'cm': 10, 'm': 1000, 'km': 1e6,
    'inches': 25.4, 'ft': 304.8, 'yds': 914.4, 'miles': 1.609344e6,
}
# Parenthesised remarks such as "(rarely 1 m)" are dropped before parsing.
_PARENTHESISED = re.compile(r"\(.*?\)")
# "x" or the multiplication sign used as a dimension separator, i.e. not part of a word
# such as "approx".
_DIMENSION_SEPARATOR = re.compile(r"(?<![A-Za-z])[x\u00d7](?![A-Za-z])")
# Hyphen, en dash and em dash all separate the two ends of a range.
_RANGE_DASH = re.compile(r"[-\u2013\u2014]")
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _find_unit(text):
    """Return the first recognised unit token of ``text`` in reading order, or None."""
    for token in text.translate(_PUNCTUATION_TABLE).split():
        if token in _UNIT_TO_MM:
            return token
    return None


def _parse_number(token):
    """Return ``token`` as a finite float, or None when it is not one."""
    try:
        value = float(token)
    except (TypeError, ValueError):
        return None
    return value if np.isfinite(value) else None


def _convert_measurement(answer, mm_per_output_unit, dimension = None):
    """Shared implementation of the three ``post_process_answer_*`` functions.

    ``mm_per_output_unit`` selects the output unit (1000 for metres, 10 for centimetres).
    ``dimension`` selects one side of a "length x width" answer (0 or 1); None keeps the
    whole answer. Returns the converted numbers joined by spaces (rounded to 4 decimals),
    ``"No metric"`` when no unit is present, or ``"Too many numbers"`` when more than two
    numbers are found.
    """
    if not isinstance(answer, str) or _find_unit(answer) is None:
        return "No metric"

    cleaned = _PARENTHESISED.sub("", answer)
    segment = cleaned
    if dimension is not None:
        pieces = _DIMENSION_SEPARATOR.split(cleaned)
        if len(pieces) > 1:
            segment = pieces[dimension]

    # Prefer the unit written next to the selected numbers; fall back to the rest of the
    # answer for forms such as "5-10 x 2-3 cm", where only the last part carries the unit.
    unit = _find_unit(segment) or _find_unit(cleaned) or _find_unit(answer)

    numbers = []
    for token in _RANGE_DASH.sub(" ", segment).split():
        value = _parse_number(token)
        if value is not None:
            numbers.append(value)
    if len(numbers) > 2:
        return "Too many numbers"

    return " ".join(
        str(np.round(value * _UNIT_TO_MM[unit] / mm_per_output_unit, 4)) for value in numbers
    )


def post_process_answer_height(answer):
    """Extract the height value(s) of ``answer`` and convert them to metres.

    Returns a string: one or two space-separated numbers, ``"No metric"`` or
    ``"Too many numbers"``.
    """
    return _convert_measurement(answer, 1000)


def post_process_answer_leaf_length(answer):
    """Extract the leaf length of ``answer`` (the part before an "x" separator) in centimetres.

    Returns a string: one or two space-separated numbers, ``"No metric"`` or
    ``"Too many numbers"``.
    """
    return _convert_measurement(answer, 10, dimension = 0)


def post_process_answer_leaf_width(answer):
    """Extract the leaf width of ``answer`` (the part after an "x" separator) in centimetres.

    Without a separator the whole answer is used. Returns a string: one or two
    space-separated numbers, ``"No metric"`` or ``"Too many numbers"``.
    """
    return _convert_measurement(answer, 10, dimension = 1)

def post_post_process_answer(answer):
    """Collapse a post-processed answer string into a single number.

    The string is split on single spaces. One numeric token is returned as a float; for
    two numeric tokens the second one is returned (the later end of a two-number range).
    Every other input yields -1.
    """
    tokens = answer.split(" ")
    if len(tokens) not in (1, 2):
        return -1
    if not all(is_float(token) for token in tokens):
        return -1
    return float(tokens[-1])

In [15]:
def is_float(element):
    """Return True when ``float(element)`` succeeds, False otherwise.

    Values that ``float`` cannot accept at all (``None``, lists, ...) count as "not a float"
    instead of raising ``TypeError``. Note that strings such as ``"nan"`` and ``"inf"`` are
    accepted by ``float`` and therefore return True.
    """
    try:
        float(element)
    except (TypeError, ValueError):
        return False
    return True

In [16]:
def QA_Prediction(Questions, Description, model_pipeline):
    """Ask every question about ``Description`` and keep the highest-scoring answer.

    Returns ``(question, answer, score)``. Scores are rounded to 3 decimals before they are
    compared, and the first question wins a tie. Two shortcuts skip the model entirely:
    a non-string description gives ``("", "No Description", 0)`` and a description without
    any digit gives ``("", "No Number", 0)``.
    """
    if not isinstance(Description, str):
        return "", "No Description", 0

    has_digit = any(character.isdigit() for character in Description)
    if not has_digit:
        return "", "No Number", 0

    results = [
        model_pipeline({'question': question, 'context': Description})
        for question in Questions
    ]
    answers = [result["answer"] for result in results]
    scores = [np.round(result["score"], 3) for result in results]

    best = np.argmax(scores)
    return Questions[best], answers[best], scores[best]

In [25]:
# Candidate questions per numerical trait; QA_Prediction keeps the best-scoring one.
Questions = {
    "Plant Height Max": ["How tall is the plant?", "What is the height?"],
    "Leaf Length Max": ["How long is the leaf?", "What is the leaf length?"],
    "Leaf Width Max": ["How wide is the leaf?", "What is the leaf width?"],
}

def make_numerical_predictions(df_sample, description_column = "description", model_train = "DistilBERT", traits_of_interest = ["Plant Height Max"]):
    """Add question-answering based numerical trait predictions to ``df_sample``.

    For every name in ``trait_names_num`` that is also listed in ``traits_of_interest`` the
    questions in ``Questions[trait]`` are asked about each description, and four columns are
    written: ``{trait}_Questions``, ``{trait}_Answers``, ``{trait}_Scores`` and
    ``{trait}_Predictions`` (the post-processed number, -1 when none could be extracted).

    Parameters
    ----------
    df_sample : pandas.DataFrame
        Table holding the descriptions. It is modified in place and also returned.
    description_column : str
        Column of ``df_sample`` containing the description text.
    model_train : str
        ``"DistilBERT"`` or ``"ROBERTA"``; any other value prints a message and returns
        ``df_sample`` unchanged.
    traits_of_interest : list of str
        Trait names to predict; they must match entries of ``trait_names_num``.

    Returns
    -------
    pandas.DataFrame
        ``df_sample`` with the prediction columns added.
    """
    qa_checkpoints = {
        "DistilBERT": "distilbert-base-cased-distilled-squad",
        "ROBERTA": "deepset/roberta-base-squad2",
    }
    post_processors = {
        "Plant Height Max": post_process_answer_height,
        "Leaf Length Max": post_process_answer_leaf_length,
        "Leaf Width Max": post_process_answer_leaf_width,
    }

    # Requested names that match nothing used to be ignored silently; report them instead.
    unknown_traits = [name for name in traits_of_interest if name not in trait_names_num]
    if unknown_traits:
        print("Unknown numerical trait(s), skipped:", unknown_traits)

    # Skip non-requested traits instead of aborting the loop at the first one.
    selected_traits = [name for name in trait_names_num if name in traits_of_interest]
    if not selected_traits:
        return df_sample

    if model_train not in qa_checkpoints:
        print("No Such Model, Predictions not made")
        return df_sample

    # The QA model is the same for every trait, so load it once.
    qa_checkpoint = qa_checkpoints[model_train]
    nlp = pipeline('question-answering', model = qa_checkpoint, tokenizer = qa_checkpoint)

    descriptions = df_sample[description_column].values
    for focus_name in selected_traits:
        print("Trait:", focus_name)

        # Fresh lists per trait: sharing them across traits would mix answers of different
        # traits and make the columns longer than the table.
        question_list = []
        answer_list = []
        score_list = []
        # Iterating the array directly lets tqdm show the total number of rows.
        for description in tqdm(descriptions):
            ques, ans, score = QA_Prediction(Questions[focus_name], description, nlp)
            question_list.append(ques)
            answer_list.append(ans)
            score_list.append(score)

        post_process = post_processors[focus_name]
        post_predictions = [post_process(ans) for ans in answer_list]
        post_post_predictions = np.array([post_post_process_answer(ans) for ans in post_predictions])

        for var, data in zip(["Questions", "Answers", "Scores", "Predictions"], [question_list, answer_list, score_list, post_post_predictions]):
            df_sample.loc[:, focus_name + "_" + var] = data
    return df_sample

In [27]:
# Traits to predict; make_numerical_predictions matches these names against trait_names_num.
traits_of_interest = ["Plant Height Max"]
model_train = "DistilBERT" # Selects the question-answering model ("DistilBERT" or "ROBERTA")
description_column = "description" # Name of the column containing the description text

df_sample = make_numerical_predictions(df_sample, description_column = description_column, model_train = model_train, traits_of_interest = traits_of_interest)

Trait: Plant Height Max


0it [00:00, ?it/s]

In [28]:
df_sample

Unnamed: 0,#,family_name,species_name,species_name_sur,synonym,description,Plant Height Max_Questions,Plant Height Max_Answers,Plant Height Max_Scores,Plant Height Max_Predictions
0,494,Asteraceae,Leontodon rigens (Dryand. in Ait.) Paiva & Orm...,São Miguel-Löwenzahn / São Miguel Hawkbit / Li...,Syn.: Microderis rigens (Dryand. in Ait.) DC.,"20-50 cm. St. erect, much-branched above, wood...",What is the height?,20-50 cm,0.802,0.5
1,558,Juncaceae,Luzula multiflora (Retz.) Lej.,Vielblütige Hainsimse / Heath Wood-rush / Luzu...,,"20-40 cm. Caespitose herb with erect, cylindri...",How tall is the plant?,20-40 cm,0.828,0.4
2,241,Euphorbiaceae,Euphorbia serpens Kunth,Schlangen-Wolfsmilch / Matted Sandmat / Euphor...,Syn.: Chamaesyce serpens (Kunth) Small,"Up to 20 cm long. St. prostrate, creeping and ...",What is the height?,20 cm,0.352,0.2
3,434,Dipsacaceae,Scabiosa nitens Roem. & Schult. [Escabiosa-dos...,Azoren-Skabiose /Azores Scabious /Scabieuse de...,,"20-40 cm. St. erect, glabrous, woody at base. ...",What is the height?,20-40 cm,0.95,0.4
4,501,Nymphaeaceae,"Nymphaea alba L. [Golfão-branco, Boleira-branc...","Weisse Teichrose/ European white water lily, W...",,"Up to 2.5 m. Aquatic, rhizomatous herb. Lvs. o...",What is the height?,2.5 m,0.677,2.5
5,294,Onagraceae,Fuchsia magellanica Lam. [Brincos-de-princesa],"Magellans Fuchsie / Hummingbird Fuchsia, Hardy...",,"Up to 4 m. Shrub with slender, ± scrambling st...",How tall is the plant?,4 m. Shrub,0.222,4.0
6,135,Papaveraceae,"Papaver rhoeas L. [Papoula-vermelha, Papoula-o...","Klatsch-Mohn / Common Poppy, Corn Poppy / Coqu...",,"20-60 cm. St. erect, ± branched, with white la...",What is the height?,20-60 cm,0.942,0.6
7,490,Asteraceae,"Lactuca watsoniana Trel. [Alfacinha, Alface-do...","Azoren-Lattich, Watsons Lattich/ Watson's Lett...",,"Up to 2 m. St. erect, woody at base, much-bran...",What is the height?,2 m,0.642,2.0
8,333,Oleaceae,Picconia azorica (Tutin) Knobl. [Pau-branco],Azoren-Picconie/ Azores White Wood/Picconia de...,,"Up to 8 m. Evergreen tree with smooth, pale ba...",How tall is the plant?,8 m,0.578,8.0
9,138,Tropaeolaceae,"Tropaeolum majus L. [Chagas, Chagueira, Mastru...","Grosse Kapuzinerkresse / Garden Nasturtium, In...",,30-300 cm. Fleshy vine with twining petioles. ...,What is the height?,30-300 cm,0.877,3.0
