**Notebook to perform categorical trait classification based on a keyword search.**

The datasets are first split into a training and test dataset and classification is performed based on two dictionaries:
- The Simple Regex Search classifies the description to a trait class if that trait class is explicitely mentioned in the species description. Example: "tree, perennial, up to 10m high" will be classified to a Growth form = tree. 
- The Advanced Regex Search expands on this by also using other expert made rules for the classifications.

# Libraries & Functions

In [1]:
'''Math & Data Libraries'''
import numpy as np
import pandas as pd

In [2]:
'''ML Libraries'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [1]:
def calculate_scores(y_test, y_pred, average = "binary"):
    """
    Calculate the accuracy, precision, recall and F1-score of prediction. 
    ---
    Parameters
    ----------
    y_test : np.array
        true trait values of the test set descriptions. 
    y_pred : np.array
        predicted trait values for the test set descriptions. 
    average : one of "binary", "macro", "micro"
        how to average trait scores for the precision, recall and f1 scores. Default to "macro" for multi-class  
    Returns
    -------
    [accuracy, precision, recall, f1] : list
        List containing the accuracy, precision, recall and F1-score of prediction
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = average)
    recall = recall_score(y_test, y_pred, average = average)
    f1 = f1_score(y_test, y_pred, average = average)

    return [accuracy, precision, recall, f1]

In [6]:
trait_values_dict = {}
trait_values_dict["1.2.1"] = ["herb", "shrub", "tree"]
trait_values_dict["1.3.1"] = ["obligatory", "terrestrial"]
trait_values_dict["1.4.1"] = ["obligatory", "self-supporting"]
trait_values_dict["2.1.1"] = ["annual", "perennial"]
trait_values_dict["2.3.1"] = ["phanerophyte", "chamaephyte", "hemicryptophyte", "cryptophyte", "therophyte"]

trait_names_cat = ["Growth Form", "Epiphyte", "Climber", "Lifecycle", "Life Form"]
traits_cat = ["1.2.1", "1.3.1", "1.4.1", "2.1.1", "2.3.1"]

# Input Data

## Plants of the World Online - POWO GIFT

In [7]:
df_POWO_GIFT = pd.read_excel("..//Data//Final Databases//POWO_GIFT.xlsx")

## Wikipedia - WIKI GIFT

In [8]:
df_WIKI_GIFT = pd.read_excel("..//Data//Final Databases//WIKI_GIFT.xlsx")
df_WIKI_GIFT = df_WIKI_GIFT[~df_WIKI_GIFT["BOW_description"].isna()]

# Regex

## Text Representation - BERT
For the regex classification we use the BERT preprocessing of the description, however as the keywords we search for fall aren't stop words, we can also freely use the BOW preprocessing as well. 

In [12]:
df_names = ["POWO", "WIKI"]
df_list = [df_POWO_GIFT, df_WIKI_GIFT]

corpus = dict()
for df_name, df in zip(df_names, df_list): 
    corpus[df_name] = df["BERT_description"].values

## Split Data

In [13]:
X_train = {}
X_test = {}

y_train = {}
y_test = {}

for df_name, df in zip(["POWO", "WIKI"], [df_POWO_GIFT, df_WIKI_GIFT]): 
  for focus_name, focus_code in zip(trait_names_cat, traits_cat):
      
    trait_mask = df[focus_code].notna()
    y = {trait: 1*df[trait_mask][focus_code].apply(lambda x: trait == x) for trait in trait_values_dict[focus_code]}

    X_train[df_name, focus_name], X_test[df_name, focus_name], \
    indices_train, indices_test \
    = train_test_split(corpus[df_name][trait_mask], 
                      np.arange(sum(trait_mask)), test_size=0.25, random_state=42) # We split the dataset to a training set of 75% and test set of 25%

    y_train[df_name, focus_name, focus_name] = np.zeros(len(X_train[df_name, focus_name]), dtype=int)
    y_test[df_name, focus_name, focus_name] = np.zeros(len(X_test[df_name, focus_name]), dtype=int)

    # On top of the original split we also create binary datasets for each trait value which we will use to evaluate class performance
    for i, trait_value in enumerate(trait_values_dict[focus_code]):
      y_train[df_name, focus_name, trait_value] = y[trait_value].values[indices_train]
      y_test[df_name, focus_name, trait_value] = y[trait_value].values[indices_test]

      y_train[df_name, focus_name, focus_name] += y_train[df_name, focus_name, trait_value] * i
      y_test[df_name, focus_name, focus_name] += y_test[df_name, focus_name, trait_value] * i

## Simple Regex Script

In [16]:
model_name = "Regex_Simple"
representation_name = "Regex_Simple"

In [27]:
regex_keywords_dict = {}
# Growth Form
for trait in trait_values_dict["1.2.1"]:
    regex_keywords_dict["1.2.1", trait] = [i for i in [trait]]

# Epiphytism    
for trait in trait_values_dict["1.3.1"]:
    regex_keywords_dict["1.3.1", trait] = [i for i in [trait]]
regex_keywords_dict["1.3.1", "obligatory"] = ["obligatory", "epiphyte"]

# Climber
for trait in trait_values_dict["1.4.1"]:
    regex_keywords_dict["1.4.1", trait] = [i for i in [trait]]
regex_keywords_dict["1.4.1", "obligatory"] = ["obligatory", "climber", "climbing"]

# Lifecycle
for trait in trait_values_dict["2.1.1"]:
    regex_keywords_dict["2.1.1", trait] = [i for i in [trait]]

# Life form
for trait in trait_values_dict["2.3.1"]:
    regex_keywords_dict["2.3.1", trait] = [i for i in [trait]]

In [28]:
tmp_tmp_list = []
for focus_name, focus_code in zip(trait_names_cat, traits_cat):
    print("Trait:", focus_name)
    for df_name_train, df_train in zip(df_names[:], df_list[:]):
        print("\tDataset:", df_name_train)

        df_name_test = df_name_train
        df_test = df_train
          
        tmp_list = []

        for i, trait_value in enumerate(trait_values_dict[focus_code]):
            
            y_predict_class = np.zeros(y_test[df_name_test, focus_name, trait_value].shape)
            for keyword in regex_keywords_dict[focus_code, trait_value]:
                y_predict_class += pd.Series(X_test[df_name_test, focus_name]).apply(lambda x: keyword in x.split(" "))
            y_predict_class = np.where(y_predict_class>0, 1, 0)
            y_test_class = y_test[df_name_test, focus_name, trait_value]
    
            acc_gift = accuracy_score(y_test_class, y_predict_class)
            prec_gift = precision_score(y_test_class, y_predict_class)
            rec_gift = recall_score(y_test_class, y_predict_class)
            f1_gift = f1_score(y_test_class, y_predict_class)

            results = [acc_gift, prec_gift, rec_gift, f1_gift]
            tmp_list.append([df_name_train, df_name_test, focus_name, trait_value] + results + [model_name, representation_name])
            
        # The macro average scores are calculated as the average across all individual trait values 
        tmp_list.append([df_name_train, df_name_test, focus_name, focus_name] + list(np.mean(np.array(tmp_list)[:, 4:8].astype(float), axis=0)) + [model_name, representation_name])
   
        tmp_tmp_list.append(tmp_list)

Trait: Growth Form
	Dataset: POWO
	Dataset: WIKI
Trait: Epiphyte
	Dataset: POWO
	Dataset: WIKI
Trait: Climber
	Dataset: POWO


  _warn_prf(average, modifier, msg_start, len(result))


	Dataset: WIKI


  _warn_prf(average, modifier, msg_start, len(result))


Trait: Lifecycle
	Dataset: POWO
	Dataset: WIKI
Trait: Life Form
	Dataset: POWO


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


	Dataset: WIKI


In [21]:
from itertools import chain
unfolded = list(chain.from_iterable(i if isinstance(i, list) else [i] for i in tmp_tmp_list))
df_results = pd.DataFrame(unfolded, columns=["Train Dataset", "Test Dataset", "Trait", "Trait Value", "Accuracy", "Precision", "Recall", "F1-Score", "Model", "Representation"])
df_results

Unnamed: 0,Train Dataset,Test Dataset,Trait,Trait Value,Accuracy,Precision,Recall,F1-Score,Model,Representation
0,POWO,POWO,Growth Form,herb,0.72467,0.947805,0.567631,0.710031,Regex_Simple,Regex_Simple
1,POWO,POWO,Growth Form,shrub,0.867453,0.649621,0.513986,0.573898,Regex_Simple,Regex_Simple
2,POWO,POWO,Growth Form,tree,0.932686,0.94707,0.751124,0.837793,Regex_Simple,Regex_Simple
3,POWO,POWO,Growth Form,Growth Form,0.841603,0.848165,0.610914,0.707241,Regex_Simple,Regex_Simple
4,WIKI,WIKI,Growth Form,herb,0.631056,0.968273,0.259993,0.409919,Regex_Simple,Regex_Simple
5,WIKI,WIKI,Growth Form,shrub,0.885174,0.772871,0.651348,0.706925,Regex_Simple,Regex_Simple
6,WIKI,WIKI,Growth Form,tree,0.844477,0.892414,0.534269,0.668388,Regex_Simple,Regex_Simple
7,WIKI,WIKI,Growth Form,Growth Form,0.786902,0.877853,0.48187,0.595077,Regex_Simple,Regex_Simple
8,POWO,POWO,Epiphyte,obligatory,0.965871,0.864337,0.772978,0.816109,Regex_Simple,Regex_Simple
9,POWO,POWO,Epiphyte,terrestrial,0.123818,0.802632,0.024612,0.047759,Regex_Simple,Regex_Simple


### Save Results

In [22]:
df_results.to_excel("..//Data//Results//Regex_Simple_Results.xlsx", index=False)

## Advanced Regex Script

In [29]:
model_name = "Regex_Advanced"
representation_name = "Regex_Advanced"

In [30]:
regex_keywords_dict = {}
regex_keywords_dict["1.2.1", "herb"] = ["herb", "herbaceousperennial", "forb", "orchid"]
regex_keywords_dict["1.2.1", "shrub"] = ["subshrub", "undershrub", "subshrub", "bush", "shrub", "shrublet", "cactus"]
regex_keywords_dict["1.2.1", "tree"] = ["tree", "mallet"]
    
for trait in trait_values_dict["1.3.1"]:
    regex_keywords_dict["1.3.1", trait] = [i for i in [trait]]
regex_keywords_dict["1.3.1", "obligatory"] = ["obligatory", "epiphyte", "epiphytic"]

# Climber
for trait in trait_values_dict["1.4.1"]:
    regex_keywords_dict["1.4.1", trait] = [i for i in [trait]]
regex_keywords_dict["1.4.1", "obligatory"] = ["obligatory", "vine", "liana", "liane", \
                                              "woodytwiner", "shrubbytwiner", "woodyclimber", "twiner"]

for trait in trait_values_dict["2.1.1"]:
    regex_keywords_dict["2.1.1", trait] = [i for i in [trait]]

for trait in trait_values_dict["2.3.1"]:
    regex_keywords_dict["2.3.1", trait] = [i for i in [trait]]

In [31]:
tmp_tmp_list = []
for focus_name, focus_code in zip(trait_names_cat, traits_cat):
    print("Trait:", focus_name)
    for df_name_train, df_train in zip(df_names[:], df_list[:]):
        print("\tDataset:", df_name_train)

        df_name_test = df_name_train
        df_test = df_train
          
        tmp_list = []

        for i, trait_value in enumerate(trait_values_dict[focus_code]):
            print(trait_value, i)
            
            y_predict_class = np.zeros(y_test[df_name_test, focus_name, trait_value].shape)
            for keyword in regex_keywords_dict[focus_code, trait_value]:
                y_predict_class += pd.Series(X_test[df_name_test, focus_name]).apply(lambda x: keyword in x.split(" "))
            y_predict_class = np.where(y_predict_class>0, 1, 0)
            y_test_class = y_test[df_name_test, focus_name, trait_value]
    
            acc_gift = accuracy_score(y_test_class, y_predict_class)
            prec_gift = precision_score(y_test_class, y_predict_class)
            rec_gift = recall_score(y_test_class, y_predict_class)
            f1_gift = f1_score(y_test_class, y_predict_class)

            results = [acc_gift, prec_gift, rec_gift, f1_gift]
            tmp_list.append([df_name_train, df_name_test, focus_name, trait_value] + results + [model_name, representation_name])

        tmp_list.append([df_name_train, df_name_test, focus_name, focus_name] + list(np.mean(np.array(tmp_list)[:, 4:8].astype(float), axis=0)) + ["Regex", "Regex"])
   
        tmp_tmp_list.append(tmp_list)

Trait: Growth Form
	Dataset: POWO
herb 0
shrub 1
tree 2
	Dataset: WIKI
herb 0
shrub 1
tree 2
Trait: Epiphyte
	Dataset: POWO
obligatory 0
terrestrial 1
	Dataset: WIKI
obligatory 0
terrestrial 1
Trait: Climber
	Dataset: POWO
obligatory 0
self-supporting 1
	Dataset: WIKI
obligatory 0


  _warn_prf(average, modifier, msg_start, len(result))


self-supporting 1
Trait: Lifecycle
	Dataset: POWO
annual 0
perennial 1


  _warn_prf(average, modifier, msg_start, len(result))


	Dataset: WIKI
annual 0
perennial 1
Trait: Life Form
	Dataset: POWO
phanerophyte 0
chamaephyte 1
hemicryptophyte 2
cryptophyte 3
therophyte 4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


	Dataset: WIKI
phanerophyte 0
chamaephyte 1
hemicryptophyte 2
cryptophyte 3
therophyte 4


In [32]:
unfolded = list(chain.from_iterable(i if isinstance(i, list) else [i] for i in tmp_tmp_list))
df_results = pd.DataFrame(unfolded, columns=["Train Dataset", "Test Dataset", "Trait", "Trait Value", "Accuracy", "Precision", "Recall", "F1-Score", "Model", "Representation"])
df_results

Unnamed: 0,Train Dataset,Test Dataset,Trait,Trait Value,Accuracy,Precision,Recall,F1-Score,Model,Representation
0,POWO,POWO,Growth Form,herb,0.724931,0.947625,0.568215,0.710437,Regex_Advanced,Regex_Advanced
1,POWO,POWO,Growth Form,shrub,0.86728,0.592044,0.758242,0.664915,Regex_Advanced,Regex_Advanced
2,POWO,POWO,Growth Form,tree,0.932599,0.946623,0.751124,0.837618,Regex_Advanced,Regex_Advanced
3,POWO,POWO,Growth Form,Growth Form,0.841603,0.828764,0.692527,0.737657,Regex,Regex
4,WIKI,WIKI,Growth Form,herb,0.679425,0.958727,0.365334,0.529063,Regex_Advanced,Regex_Advanced
5,WIKI,WIKI,Growth Form,shrub,0.882106,0.74106,0.68477,0.711804,Regex_Advanced,Regex_Advanced
6,WIKI,WIKI,Growth Form,tree,0.844961,0.89271,0.535921,0.669763,Regex_Advanced,Regex_Advanced
7,WIKI,WIKI,Growth Form,Growth Form,0.802164,0.864166,0.528675,0.636877,Regex,Regex
8,POWO,POWO,Epiphyte,obligatory,0.969203,0.841575,0.844669,0.843119,Regex_Advanced,Regex_Advanced
9,POWO,POWO,Epiphyte,terrestrial,0.123818,0.802632,0.024612,0.047759,Regex_Advanced,Regex_Advanced


### Save Results

In [33]:
df_results.to_excel("..//Data//Results//Regex_Advanced_Results.xlsx", index=False)