# Libraries & Functions

In [1]:
'''Math & Data Libraries'''
import numpy as np
import pandas as pd

In [2]:
''' Miscellaneous Libraries'''
from tqdm import tqdm
from collections import Counter

In [3]:
'''NLP Libraries'''
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

In [4]:
'''String Libraries'''
import string
import re

In [5]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

font = {'family': 'serif',
        'color':  '#333333',
        'weight': 'normal',
        'size': 14,
        }

## QA Functions

In [6]:
def QA_Prediction(Questions, Description, model_pipeline):
    if not isinstance(Description, str):
        return "", "No Description", 0 
    
    answer_list = []
    score_list = []
    if(not any(map(str.isdigit, Description))):
        return "", "No Number", 0
    
    for q_i, question in enumerate(Questions):
        QA_input = {
        'question': question,
        'context': Description
        }
        res = model_pipeline(QA_input)
        answer = res["answer"]
        score = np.round(res["score"], 3)# if contain_check else 0
        answer_list.append(answer)
        score_list.append(score)
        
    best_answer_i = np.argmax(score_list)
    best_question = Questions[best_answer_i]
    best_answer = answer_list[best_answer_i]
    best_score = score_list[best_answer_i]
    
    return best_question, best_answer, best_score

In [7]:
def is_float(element):
    try:
        float(element)
        return True
    except ValueError:
        return False

In [20]:
def post_process_answer_height(answer):
    available_units = ('mm', 'cm', 'm', 'km', 'inches', 'ft', 'yds', 'miles')
    conversions = (1, 10, 1000, 1e6, 25.4, 304.8, 914.4, 1.609344e6)
    conversion_dict = {unit:rate for unit, rate in zip(available_units, conversions)}
    
    flag = 0
    answer_punc = answer.translate(str.maketrans('', '', string.punctuation))
    for unit in available_units:
        if(unit in answer_punc.split(" ")):
            flag = 1
            metric = unit
            break
            
    if(flag==0):
        return "No metric"
    result = []

    answer = re.sub("\(.*?\)","",answer)

    answer = answer.replace("-", " ") 
    counter = 0
    for part in answer.split(" "):
        if(counter>2):
            return "Too many numbers"
        if(is_float(part)):
            tmp = str(np.round(float(part) * conversion_dict[metric]/1000, 4))
            counter += 1
            result.append(tmp)
    return " ".join(result)


def post_process_answer_leaf_length(answer):
    available_units = ('mm', 'cm', 'm', 'km', 'inches', 'ft', 'yds', 'miles')
    conversions = (1, 10, 1000, 1e6, 25.4, 304.8, 914.4, 1.609344e6)
    conversion_dict = {unit:rate for unit, rate in zip(available_units, conversions)}
    
    flag = 0
    answer_punc = answer.translate(str.maketrans('', '', string.punctuation))
    for unit in available_units:
        if(unit in answer_punc.split(" ")):
            flag = 1
            metric = unit
            
    if(flag==0):
        return "No metric"
    result = []

    if("x" in answer):
        answer = answer.split("x")[0]
    
    answer = re.sub("\(.*?\)","",answer)

    answer = answer.replace("-", " ") 
    counter = 0
    for part in answer.split(" "):
        if(counter>2):
            return "Too many numbers"
        if(is_float(part)):
            tmp = str(np.round(float(part) * conversion_dict[metric]/10, 4))
            counter += 1
            result.append(tmp)
    return " ".join(result)

def post_process_answer_leaf_width(answer):
    available_units = ('mm', 'cm', 'm', 'km', 'inches', 'ft', 'yds', 'miles')
    conversions = (1, 10, 1000, 1e6, 25.4, 304.8, 914.4, 1.609344e6)
    conversion_dict = {unit:rate for unit, rate in zip(available_units, conversions)}
    
    flag = 0
    answer_punc = answer.translate(str.maketrans('', '', string.punctuation))
    for unit in available_units:
        if(unit in answer_punc.split(" ")):
            flag = 1
            metric = unit
            
    if(flag==0):
        return "No metric"
    result = []

    if("x" in answer):
        answer = answer.split("x")[1]
    
    answer = re.sub("\(.*?\)","",answer)

    answer = answer.replace("-", " ") 
    counter = 0
    for part in answer.split(" "):
        if(counter>2):
            return "Too many numbers"
        if(is_float(part)):
            tmp = str(np.round(float(part) * conversion_dict[metric]/10, 4))
            counter += 1
            result.append(tmp)
    return " ".join(result)

def post_post_process_answer(answer):
    answer_parts = answer.split(" ")
    if(len(answer_parts)==1 and is_float(answer_parts[0])):
        return float(answer_parts[0])
    if(len(answer_parts)==2 and is_float(answer_parts[0]) and is_float(answer_parts[1])):
        return float(answer_parts[1])
    return -1

# Input Data

In [8]:
df_dict = {}

## Plants of the World Online - POWO GIFT

In [11]:
df_dict["POWO"] = pd.read_excel("..//Datasets//POWO_GIFT.xlsx")

# Models

In [12]:
model_dict = {}

## Custom NUM BERT

In [14]:
model_name = "ViktorDo/bert-finetuned-custom_Numerical_Traits"
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
model_dict["CustomBERT"] = nlp

Downloading:   0%|          | 0.00/431M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/315 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

# QA Predictions

In [15]:
questions_dict = dict()
predictions_dict = dict()
post_predictions_dict = dict()
pred_mask_dict = dict()
score_dict = dict()
mask_dict = dict()
true_dict = dict()
description_dict = dict()
Questions = dict()

## Plant Height Max

In [16]:
focus_name = "Plant Height Max"
focus_code = "1.6.2"

description_column = "QA_description"
questions = ["How tall is the plant?", "What is the height?"]

Questions[focus_name] = questions

In [18]:
for dataset in ["POWO"]:
    mask_dict[focus_name] = df_dict[dataset][focus_code].notna()
    print("{} Number of Species with {} Information: {}/{} ({}%)".format(dataset, focus_name, np.sum(mask_dict[focus_name]), len(df_dict[dataset][focus_code]), np.round(np.sum(mask_dict[focus_name])/len(df_dict[dataset][focus_code]),2 )))

POWO Number of Species with Plant Height Max Information: 17648/59151 (0.3%)


In [19]:
for dataset in ["POWO"]:
    for model in ["CustomBERT"]: 
        question_list = []
        answer_list = []
        score_list = []
        true_list = []
        description_list = []

        mask_dict[focus_name] = df_dict[dataset][focus_code].notna()

        for i, (description, trait_value) in tqdm(enumerate(df_dict[dataset][mask_dict[focus_name]][[description_column, focus_code]].values)):
            ques, ans, score = QA_Prediction(Questions[focus_name], description, model_dict[model])
            question_list.append(ques)
            answer_list.append(ans)
            score_list.append(score)
            true_list.append(trait_value)
            description_list.append(description)

        post_predictions = [post_process_answer_height(ans) for ans in answer_list]
        post_post_predictions = np.array([post_post_process_answer(ans) for ans in post_predictions])
        pred_mask_v2 = post_post_predictions!=-1
        
        for var, data in zip(["Questions", "Answers", "Scores", "Predictions"], [question_list, answer_list, score_list, post_post_predictions]):
            df_dict[dataset].loc[:, focus_code + "_" + var + "_" + model] = ""
            df_dict[dataset].loc[mask_dict[focus_name], focus_code + "_" + var + "_" + model] = data

17648it [9:00:06,  1.84s/it]


NameError: name 'post_process_answer_height' is not defined

## Leaf Length Max

In [23]:
focus_name = "Leaf Length Max"
focus_code = "4.6.2"

description_column = "QA_description"
questions = ["How long is the leaf?", "What is the leaf length?"]

Questions[focus_name] = questions

In [24]:
for dataset in ["POWO"]:
    mask_dict[focus_name] = df_dict[dataset][focus_code].notna()
    print("{} Number of Species with {} Information: {}/{} ({}%)".format(dataset, focus_name, np.sum(mask_dict[focus_name]), len(df_dict[dataset][focus_code]), np.round(np.sum(mask_dict[focus_name])/len(df_dict[dataset][focus_code]),2 )))

POWO Number of Species with Leaf Length Max Information: 3397/59151 (0.06%)


In [25]:
for dataset in ["POWO"]:
    for model in ["CustomBERT"]: 
        question_list = []
        answer_list = []
        score_list = []
        true_list = []
        description_list = []

        mask_dict[focus_name] = df_dict[dataset][focus_code].notna()

        for i, (description, trait_value) in tqdm(enumerate(df_dict[dataset][mask_dict[focus_name]][[description_column, focus_code]].values)):
            ques, ans, score = QA_Prediction(Questions[focus_name], description, model_dict[model])
            question_list.append(ques)
            answer_list.append(ans)
            score_list.append(score)
            true_list.append(trait_value)
            description_list.append(description)

        post_predictions = [post_process_answer_leaf_length(ans) for ans in answer_list]
        post_post_predictions = np.array([post_post_process_answer(ans) for ans in post_predictions])
        pred_mask_v2 = post_post_predictions!=-1
        
        for var, data in zip(["Questions", "Answers", "Scores", "Predictions"], [question_list, answer_list, score_list, post_post_predictions]):
            df_dict[dataset].loc[:, focus_code + "_" + var + "_" + model] = ""
            df_dict[dataset].loc[mask_dict[focus_name], focus_code + "_" + var + "_" + model] = data

3397it [2:49:57,  3.00s/it]


## Leaf Width Max

In [26]:
focus_name = "Leaf Width Max"
focus_code = "4.7.2"

description_column = "QA_description"
questions = ["How wide is the leaf?", "What is the leaf width?"]

Questions[focus_name] = questions

In [27]:
for dataset in ["POWO"]:
    mask_dict[focus_name] = df_dict[dataset][focus_code].notna()
    print("{} Number of Species with {} Information: {}/{} ({}%)".format(dataset, focus_name, np.sum(mask_dict[focus_name]), len(df_dict[dataset][focus_code]), np.round(np.sum(mask_dict[focus_name])/len(df_dict[dataset][focus_code]),2 )))

POWO Number of Species with Leaf Width Max Information: 2243/59151 (0.04%)


In [28]:
for dataset in ["POWO"]:
    for model in ["CustomBERT"]: 
        question_list = []
        answer_list = []
        score_list = []
        true_list = []
        description_list = []

        mask_dict[focus_name] = df_dict[dataset][focus_code].notna()

        for i, (description, trait_value) in tqdm(enumerate(df_dict[dataset][mask_dict[focus_name]][[description_column, focus_code]].values)):
            ques, ans, score = QA_Prediction(Questions[focus_name], description, model_dict[model])
            question_list.append(ques)
            answer_list.append(ans)
            score_list.append(score)
            true_list.append(trait_value)
            description_list.append(description)

        post_predictions = [post_process_answer_leaf_width(ans) for ans in answer_list]
        post_post_predictions = np.array([post_post_process_answer(ans) for ans in post_predictions])
        pred_mask_v2 = post_post_predictions!=-1
        
        for var, data in zip(["Questions", "Answers", "Scores", "Predictions"], [question_list, answer_list, score_list, post_post_predictions]):
            df_dict[dataset].loc[:, focus_code + "_" + var + "_" + model] = ""
            df_dict[dataset].loc[mask_dict[focus_name], focus_code + "_" + var + "_" + model] = data

2243it [1:32:11,  2.47s/it]


### Save Results

In [29]:
for dataset in ["POWO"]:
    df_dict[dataset].to_excel(f"Results//{dataset}_Numerical_Predictions_CustomBERT.xlsx", index=False)