# Libraries & Functions

In [1]:
'''Math & Data Libraries'''
import numpy as np
import pandas as pd

In [11]:
'''ML Libraries'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
def calculate_scores(y_test, y_pred, average = "binary"):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = average)
    recall = recall_score(y_test, y_pred, average = average)
    f1 = f1_score(y_test, y_pred, average = average)
    return [accuracy, precision, recall, f1]

In [4]:
trait_values_dict = {}
trait_values_dict["1.2.1"] = ["herb", "shrub", "tree"]
# trait_values_dict["1.3.1"] = ["obligatory", "terrestrial"]
# trait_values_dict["1.4.1"] = ["obligatory", "self-supporting"]
# trait_values_dict["2.1.1"] = ["annual", "perennial"]
trait_values_dict["2.3.1"] = ["phanerophyte", "chamaephyte", "hemicryptophyte", "cryptophyte", "therophyte"]

trait_names_cat = ["Growth Form", "Life Form"] #"Epiphyte", "Climber", "Lifecycle", 
traits_cat = ["1.2.1", "2.3.1"] #"1.3.1", "1.4.1", "2.1.1", 

In [10]:
focus_names = ["Growth Form", "Life Form"]
focus_codes = ["1.2.1", "2.3.1"]

# Input Data

In [6]:
raw_datasets = dict()

## POWO Dataset

In [7]:
working_dir = "..//Datasets//" 

df_POWO_Cat =  pd.read_excel(working_dir + "POWO_GIFT.xlsx")
df_POWO_Cat_Preproc = df_POWO_Cat.drop_duplicates(subset = ["BERT_description"])
df_POWO_Cat_Preproc = df_POWO_Cat_Preproc[df_POWO_Cat_Preproc["BERT_description"].apply(lambda x: len(x.split(" ")))>10]
raw_datasets["POWO"] = df_POWO_Cat_Preproc

## WIKI Dataset

In [8]:
def fix_WIKI(name, description):
    for n in name.split(" "):
        description = str(description).replace(n.lower(), "")
    return description.strip()

In [9]:
working_dir = "..//Datasets//" 

df_WIKI_Cat =  pd.read_excel(working_dir + "WIKI_GIFT.xlsx")
df_WIKI_Cat_Preproc = df_WIKI_Cat.drop_duplicates(subset = ["BERT_description"])
df_WIKI_Cat_Preproc["BERT_description"] = df_WIKI_Cat_Preproc[["name", "BERT_description"]].apply(lambda x: fix_WIKI(x[0], x[1]), axis = 1)
df_WIKI_Cat_Preproc = df_WIKI_Cat_Preproc[df_WIKI_Cat_Preproc["BERT_description"].apply(lambda x: len(str(x).split(" ")))>10]
raw_datasets["WIKI"] = df_WIKI_Cat_Preproc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_WIKI_Cat_Preproc["BERT_description"] = df_WIKI_Cat_Preproc[["name", "BERT_description"]].apply(lambda x: fix_WIKI(x[0], x[1]), axis = 1)


## Preprocess Datasets

In [12]:
label_map = {
    "Growth Form": {"herb": 0, "shrub": 1, "tree": 2},
    "Life Form": {"phanerophyte": 0, "chamaephyte": 1, "hemicryptophyte": 2, "cryptophyte": 3, "therophyte": 4},
}

In [28]:
preprocessed_dataset_dict = {}
sample_size = 5000
for focus_name, focus_code in zip(focus_names, focus_codes):
    for dataset_name in list(raw_datasets.keys()):
        labelencoder = LabelEncoder()
        
        dataset_masked = raw_datasets[dataset_name][raw_datasets[dataset_name][focus_code].notna()]
        dataset_masked = dataset_masked[dataset_masked[focus_code].apply(lambda x: x in label_map[focus_name].keys())].sample(sample_size)
        dataset_masked[focus_code + "_encoded"] = labelencoder.fit_transform(dataset_masked[focus_code])

        indices_train, indices_test \
            = train_test_split(dataset_masked.index.values, test_size=0.25, random_state=42)
            
        df_train = dataset_masked.loc[indices_train, ["BERT_description", focus_code]]
        df_train.columns = ["text", "labels"]
        df_test = dataset_masked.loc[indices_test, ["BERT_description", focus_code]]
        df_test.columns = ["text", "labels"]
        
        preprocessed_dataset_dict[dataset_name, focus_name, "train"] = df_train
        preprocessed_dataset_dict[dataset_name, focus_name, "validation"] = df_test

# Regex

## Simple Regex Script

In [14]:
model_name = "Regex_Simple"
representation_name = "Regex_Simple"

In [16]:
regex_keywords_dict = {}
for trait in trait_values_dict["1.2.1"]:
    regex_keywords_dict["1.2.1", trait] = [i for i in [trait]]

for trait in trait_values_dict["2.3.1"]:
    regex_keywords_dict["2.3.1", trait] = [i for i in [trait]]


In [29]:
tmp_tmp_list = []
for focus_name, focus_code in zip(trait_names_cat, traits_cat):
    print("Trait:", focus_name)
    for dataset_name in list(raw_datasets.keys()):
        print("\tDataset:", dataset_name)
          
        y = {trait: 1*preprocessed_dataset_dict[dataset_name, focus_name, "validation"]["labels"].apply(lambda x: trait == x) for trait in trait_values_dict[focus_code]}

        y_test = dict()
        for i, trait_value in enumerate(trait_values_dict[focus_code]):
            # print(i, trait_value)
            y_test[dataset_name, focus_name, trait_value] = y[trait_value]
        
        tmp_list = []
        for i, trait_value in enumerate(trait_values_dict[focus_code]):
            # print(trait_value, i)
            
            y_predict_class = np.zeros(y_test[dataset_name, focus_name, trait_value].shape)
            for keyword in regex_keywords_dict[focus_code, trait_value]:
                y_predict_class += pd.Series(preprocessed_dataset_dict[dataset_name, focus_name, "validation"]["text"]).apply(lambda x: keyword in x.split(" "))
            y_predict_class = np.where(y_predict_class>0, 1, 0)
            y_test_class = y_test[dataset_name, focus_name, trait_value]
    
            acc_gift = accuracy_score(y_test_class, y_predict_class)
            prec_gift = precision_score(y_test_class, y_predict_class)
            rec_gift = recall_score(y_test_class, y_predict_class)
            f1_gift = f1_score(y_test_class, y_predict_class)

            results = [acc_gift, prec_gift, rec_gift, f1_gift]
            tmp_list.append([dataset_name, dataset_name, focus_name, trait_value] + results + [model_name, representation_name])

        tmp_list.append([dataset_name, dataset_name, focus_name, focus_name] + list(np.mean(np.array(tmp_list)[:, 4:8].astype(float), axis=0)) + [model_name, representation_name])
   
        tmp_tmp_list.append(tmp_list)

Trait: Growth Form
	Dataset: POWO
	Dataset: WIKI
Trait: Life Form
	Dataset: POWO
	Dataset: WIKI


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
from itertools import chain
unfolded = list(chain.from_iterable(i if isinstance(i, list) else [i] for i in tmp_tmp_list))
df_results = pd.DataFrame(unfolded, columns=["Train Dataset", "Test Dataset", "Trait", "Trait Value", "Accuracy", "Precision", "Recall", "F1-Score", "Model", "Representation"])
df_results

Unnamed: 0,Train Dataset,Test Dataset,Trait,Trait Value,Accuracy,Precision,Recall,F1-Score,Model,Representation
0,POWO,POWO,Growth Form,herb,0.5,0.920981,0.361884,0.5196,Regex_Simple,Regex_Simple
1,POWO,POWO,Growth Form,shrub,0.8744,0.543046,0.482353,0.510903,Regex_Simple,Regex_Simple
2,POWO,POWO,Growth Form,tree,0.9528,0.837209,0.739726,0.785455,Regex_Simple,Regex_Simple
3,POWO,POWO,Growth Form,Growth Form,0.775733,0.767079,0.527988,0.605319,Regex_Simple,Regex_Simple
4,WIKI,WIKI,Growth Form,herb,0.6168,0.954286,0.261755,0.410824,Regex_Simple,Regex_Simple
5,WIKI,WIKI,Growth Form,shrub,0.8928,0.783784,0.669231,0.721992,Regex_Simple,Regex_Simple
6,WIKI,WIKI,Growth Form,tree,0.8392,0.861244,0.511364,0.641711,Regex_Simple,Regex_Simple
7,WIKI,WIKI,Growth Form,Growth Form,0.782933,0.866438,0.480783,0.591509,Regex_Simple,Regex_Simple
8,POWO,POWO,Life Form,phanerophyte,0.7768,0.0,0.0,0.0,Regex_Simple,Regex_Simple
9,POWO,POWO,Life Form,chamaephyte,0.9176,0.0,0.0,0.0,Regex_Simple,Regex_Simple


### Save Results

In [31]:
df_results.to_excel("Results//Regex_Simple_Results.xlsx", index=False)