# Libraries & Functions

In [7]:
'''Math & Data Libraries'''
import numpy as np
import pandas as pd

In [8]:
'''ML Libraries'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [9]:
def calculate_scores(y_test, y_pred, average = "binary"):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = average)
    recall = recall_score(y_test, y_pred, average = average)
    f1 = f1_score(y_test, y_pred, average = average)
    return [accuracy, precision, recall, f1]

In [10]:
trait_values_dict = {}
trait_values_dict["1.2.1"] = ["herb", "shrub", "tree"]
# trait_values_dict["1.3.1"] = ["obligatory", "terrestrial"]
# trait_values_dict["1.4.1"] = ["obligatory", "self-supporting"]
# trait_values_dict["2.1.1"] = ["annual", "perennial"]
trait_values_dict["2.3.1"] = ["phanerophyte", "chamaephyte", "hemicryptophyte", "cryptophyte", "therophyte"]

trait_names_cat = ["Growth Form", "Life Form"] #"Epiphyte", "Climber", "Lifecycle", 
traits_cat = ["1.2.1", "2.3.1"] #"1.3.1", "1.4.1", "2.1.1", 

In [11]:
trait_values_dict["ESP", "1.2.1"] = ["hierba", "arbusto", "árbol"]
# trait_values_dict["ESP", "1.3.1"] = ["obligatorio", "terrestre"]
# trait_values_dict["ESP", "1.4.1"] = ["obligatorio", "autoportante"]
# trait_values_dict["ESP", "2.1.1"] = ["anual", "perenne"]
trait_values_dict["ESP", "2.3.1"] = ["fanerófito", "chamaefita", "hemicriptófito", "criptofita", "terófito"]

In [12]:
trait_values_dict["DE", "1.2.1"] = ["kraut", "strauch", "baum"]
# trait_values_dict["DE", "1.3.1"] = ["obligatorisch", "terrestrisch"]
# trait_values_dict["DE", "1.4.1"] = ["obligatorisch", "selbsttragend"]
# trait_values_dict["DE", "2.1.1"] = ["jährlich", "mehrjährig"]
trait_values_dict["DE", "2.3.1"] = ["phanerophyt", "chamaephyt", "hemikryptophyt", "kryptophyt", "therophyt"]


# Input Data

## Spanish Wikipedia - WIKI_GIFT_ESP Dataset

In [13]:
df_WIKI_GIFT_ESP = pd.read_excel("../Datasets//WIKI_GIFT_ESP.xlsx")
df_WIKI_GIFT_ESP = df_WIKI_GIFT_ESP[~df_WIKI_GIFT_ESP["BOW_description"].isna()]

## German Wikipedia - WIKI_GIFT_DE Dataset

In [14]:
df_WIKI_GIFT_DE = pd.read_excel("../Datasets//WIKI_GIFT_DE.xlsx")
df_WIKI_GIFT_DE = df_WIKI_GIFT_DE[~df_WIKI_GIFT_DE["BOW_description"].isna()]

# Regex

## Text Representation - BERT

In [15]:
df_names = ["WIKI_ESP", "WIKI_DE"]
df_list = [df_WIKI_GIFT_ESP, df_WIKI_GIFT_DE]

corpus = dict()
for df_name, df in zip(df_names, df_list): 
    corpus[df_name] = df["BERT_description"].values

## Split Data

In [16]:
X_train = {}
X_test = {}

y_train = {}
y_test = {}

for df_name, df in zip(df_names, df_list): 
  for focus_name, focus_code in zip(trait_names_cat, traits_cat):
      
    trait_mask = df[focus_code].notna()
    y = {trait: 1*df[trait_mask][focus_code].apply(lambda x: trait == x) for trait in trait_values_dict[focus_code]}

    X_train[df_name, focus_name], X_test[df_name, focus_name], \
    indices_train, indices_test \
    = train_test_split(corpus[df_name][trait_mask], 
                      np.arange(sum(trait_mask)), test_size=0.25, random_state=42)

    y_train[df_name, focus_name, focus_name] = np.zeros(len(X_train[df_name, focus_name]), dtype=int)
    y_test[df_name, focus_name, focus_name] = np.zeros(len(X_test[df_name, focus_name]), dtype=int)

    for i, trait_value in enumerate(trait_values_dict[focus_code]):
      # print(i, trait_value)
      y_train[df_name, focus_name, trait_value] = y[trait_value].values[indices_train]
      y_test[df_name, focus_name, trait_value] = y[trait_value].values[indices_test]

      y_train[df_name, focus_name, focus_name] += y_train[df_name, focus_name, trait_value] * i
      y_test[df_name, focus_name, focus_name] += y_test[df_name, focus_name, trait_value] * i

## Simple Regex Script

In [17]:
model_name = "Regex_Simple"
representation_name = "Regex_Simple"

In [18]:
regex_keywords_dict = {}
for lang in ["ESP", "DE"]:
    for trait_en, trait_esp in zip(trait_values_dict["1.2.1"], trait_values_dict[lang, "1.2.1"]):
        regex_keywords_dict[lang, "1.2.1", trait_en] = [i for i in [trait_esp]]

    # for trait_en, trait_esp in zip(trait_values_dict["1.3.1"], trait_values_dict[lang, "1.3.1"]):
    #     regex_keywords_dict[lang, "1.3.1", trait_en] = [i for i in [trait_esp]]

    # for trait_en, trait_esp in zip(trait_values_dict["1.4.1"], trait_values_dict[lang, "1.4.1"]):
    #     regex_keywords_dict[lang, "1.4.1", trait_en] = [i for i in [trait_esp]]

    # for trait_en, trait_esp in zip(trait_values_dict["2.1.1"], trait_values_dict[lang, "2.1.1"]):
    #     regex_keywords_dict[lang, "2.1.1", trait_en] = [i for i in [trait_esp]]

    for trait_en, trait_esp in zip(trait_values_dict["2.3.1"], trait_values_dict[lang, "2.3.1"]):
        regex_keywords_dict[lang, "2.3.1", trait_en] = [i for i in [trait_esp]]

regex_keywords_dict["DE", "1.2.1", "shrub"] = ["strauch", "busch"]    
# regex_keywords_dict["ESP", "1.3.1", "obligatory"] = ["obligatorio", "epífita"]
# regex_keywords_dict["DE", "1.3.1", "obligatory"] = ["obligatorio", "epiphyt"]
# regex_keywords_dict["ESP", "1.4.1", "obligatory"] = ["obligatorio", "trepadora", "escalada"]
# regex_keywords_dict["DE", "1.4.1", "obligatory"] = ["obligatorisch", "letterpflanze", "kletterer"]

In [19]:
tmp_tmp_list = []
for focus_name, focus_code in zip(trait_names_cat, traits_cat):
    print("Trait:", focus_name)
    for df_name_train, df_train in zip(df_names[:], df_list[:]):
        lang = df_name_train.split("_")[-1]
        print("\tDataset:", df_name_train)

        df_name_test = df_name_train
        df_test = df_train
          
        tmp_list = []

        for i, trait_value in enumerate(trait_values_dict[focus_code]):            
            y_predict_class = np.zeros(y_test[df_name_test, focus_name, trait_value].shape)
            for keyword in regex_keywords_dict[lang, focus_code, trait_value]:
                y_predict_class += pd.Series(X_test[df_name_test, focus_name]).apply(lambda x: keyword in x.split(" "))
            y_predict_class = np.where(y_predict_class>0, 1, 0)
            y_test_class = y_test[df_name_test, focus_name, trait_value]
    
            acc_gift = accuracy_score(y_test_class, y_predict_class)
            prec_gift = precision_score(y_test_class, y_predict_class)
            rec_gift = recall_score(y_test_class, y_predict_class)
            f1_gift = f1_score(y_test_class, y_predict_class)

            results = [acc_gift, prec_gift, rec_gift, f1_gift]
            tmp_list.append([df_name_train, df_name_test, focus_name, trait_value] + results + [model_name, representation_name])

        tmp_list.append([df_name_train, df_name_test, focus_name, focus_name] + list(np.mean(np.array(tmp_list)[:, 4:8].astype(float), axis=0)) + [model_name, representation_name])
   
        tmp_tmp_list.append(tmp_list)

Trait: Growth Form
	Dataset: WIKI_ESP
	Dataset: WIKI_DE
Trait: Life Form
	Dataset: WIKI_ESP
	Dataset: WIKI_DE


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
from itertools import chain
unfolded = list(chain.from_iterable(i if isinstance(i, list) else [i] for i in tmp_tmp_list))
df_results = pd.DataFrame(unfolded, columns=["Train Dataset", "Test Dataset", "Trait", "Trait Value", "Accuracy", "Precision", "Recall", "F1-Score", "Model", "Representation"])
df_results

Unnamed: 0,Train Dataset,Test Dataset,Trait,Trait Value,Accuracy,Precision,Recall,F1-Score,Model,Representation
0,WIKI_ESP,WIKI_ESP,Growth Form,herb,0.555,0.866142,0.204461,0.330827,Regex_Simple,Regex_Simple
1,WIKI_ESP,WIKI_ESP,Growth Form,shrub,0.85,0.609023,0.452514,0.519231,Regex_Simple,Regex_Simple
2,WIKI_ESP,WIKI_ESP,Growth Form,tree,0.717,0.0,0.0,0.0,Regex_Simple,Regex_Simple
3,WIKI_ESP,WIKI_ESP,Growth Form,Growth Form,0.707333,0.491721,0.218992,0.283353,Regex_Simple,Regex_Simple
4,WIKI_DE,WIKI_DE,Growth Form,herb,0.444062,0.909091,0.03012,0.058309,Regex_Simple,Regex_Simple
5,WIKI_DE,WIKI_DE,Growth Form,shrub,0.853701,0.607143,0.495146,0.545455,Regex_Simple,Regex_Simple
6,WIKI_DE,WIKI_DE,Growth Form,tree,0.92599,0.939655,0.751724,0.835249,Regex_Simple,Regex_Simple
7,WIKI_DE,WIKI_DE,Growth Form,Growth Form,0.741251,0.81863,0.425663,0.479671,Regex_Simple,Regex_Simple
8,WIKI_ESP,WIKI_ESP,Life Form,phanerophyte,0.64214,0.0,0.0,0.0,Regex_Simple,Regex_Simple
9,WIKI_ESP,WIKI_ESP,Life Form,chamaephyte,0.892977,0.0,0.0,0.0,Regex_Simple,Regex_Simple


### Save Results

In [21]:
df_results.to_excel("Results//Regex_Simple_Results.xlsx", index=False)