# Libraries & Functions

In [1]:
'''Math & Data Libraries'''
import numpy as np
import pandas as pd

In [13]:
'''ML Libraries'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
model_name = "Logistic Regression"
representation_name = "Bag of Words"

In [4]:
def calculate_scores(y_test, y_pred, average = "binary"):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = average)
    recall = recall_score(y_test, y_pred, average = average)
    f1 = f1_score(y_test, y_pred, average = average)
    return [accuracy, precision, recall, f1]

In [5]:
trait_values_dict = {}
trait_values_dict["1.2.1"] = ["herb", "shrub", "tree"]
# trait_values_dict["1.3.1"] = ["obligatory", "terrestrial"]
# trait_values_dict["1.4.1"] = ["obligatory", "self-supporting"]
# trait_values_dict["2.1.1"] = ["annual", "perennial"]
trait_values_dict["2.3.1"] = ["phanerophyte", "chamaephyte", "hemicryptophyte", "cryptophyte", "therophyte"]

trait_names_cat = ["Growth Form", "Life Form"] #"Epiphyte", "Climber", "Lifecycle", 
traits_cat = ["1.2.1", "2.3.1"] #"1.3.1", "1.4.1", "2.1.1", 

In [11]:
focus_names = ["Growth Form", "Life Form"]
focus_codes = ["1.2.1", "2.3.1"]

# Input Data

In [6]:
raw_datasets = dict()

## POWO Dataset

In [8]:
working_dir = "..//Datasets//" 

df_POWO_Cat =  pd.read_excel(working_dir + "POWO_GIFT.xlsx")
df_POWO_Cat_Preproc = df_POWO_Cat.drop_duplicates(subset = ["BERT_description"])
df_POWO_Cat_Preproc = df_POWO_Cat_Preproc[df_POWO_Cat_Preproc["BERT_description"].apply(lambda x: len(x.split(" ")))>10]
raw_datasets["POWO"] = df_POWO_Cat_Preproc

## WIKI Dataset

In [9]:
def fix_WIKI(name, description):
    for n in name.split(" "):
        description = str(description).replace(n.lower(), "")
    return description.strip()

In [10]:
working_dir = "..//Datasets//" 

df_WIKI_Cat =  pd.read_excel(working_dir + "WIKI_GIFT.xlsx")
df_WIKI_Cat_Preproc = df_WIKI_Cat.drop_duplicates(subset = ["BERT_description"])
df_WIKI_Cat_Preproc["BERT_description"] = df_WIKI_Cat_Preproc[["name", "BERT_description"]].apply(lambda x: fix_WIKI(x[0], x[1]), axis = 1)
df_WIKI_Cat_Preproc = df_WIKI_Cat_Preproc[df_WIKI_Cat_Preproc["BERT_description"].apply(lambda x: len(str(x).split(" ")))>10]
raw_datasets["WIKI"] = df_WIKI_Cat_Preproc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_WIKI_Cat_Preproc["BERT_description"] = df_WIKI_Cat_Preproc[["name", "BERT_description"]].apply(lambda x: fix_WIKI(x[0], x[1]), axis = 1)


## Preprocess Datasets

In [12]:
label_map = {
    "Growth Form": {"herb": 0, "shrub": 1, "tree": 2},
    "Life Form": {"phanerophyte": 0, "chamaephyte": 1, "hemicryptophyte": 2, "cryptophyte": 3, "therophyte": 4},
}


In [21]:
preprocessed_dataset_dict = {}
sample_size = 5000
for focus_name, focus_code in zip(focus_names, focus_codes):
    for dataset_name in list(raw_datasets.keys()):
        labelencoder = LabelEncoder()
        count_vect = CountVectorizer(max_features = 1000)

        dataset_masked = raw_datasets[dataset_name][raw_datasets[dataset_name][focus_code].notna()]
        dataset_masked = dataset_masked[dataset_masked[focus_code].apply(lambda x: x in label_map[focus_name].keys())].sample(sample_size)
        dataset_masked[focus_code + "_encoded"] = labelencoder.fit_transform(dataset_masked[focus_code])

        indices_train, indices_test \
            = train_test_split(dataset_masked.index.values, test_size=0.25, random_state=42)
            
        df_train = dataset_masked.loc[indices_train, ["BERT_description", focus_code + "_encoded"]]
        df_train.columns = ["text", "labels"]
        df_test = dataset_masked.loc[indices_test, ["BERT_description", focus_code + "_encoded"]]
        df_test.columns = ["text", "labels"]
        
        count_vect.fit(df_train["text"].values)

        preprocessed_dataset_dict[dataset_name, focus_name, "train_countVect"] = count_vect.transform(df_train["text"].values).toarray()
        preprocessed_dataset_dict[dataset_name, focus_name, "validation_countVect"] = count_vect.transform(df_test["text"].values).toarray()

        preprocessed_dataset_dict[dataset_name, focus_name, "train"] = df_train
        preprocessed_dataset_dict[dataset_name, focus_name, "validation"] = df_test

# Model Training & Evaluation

## Logistic Regression Model

In [26]:
result_list = []
for dataset_name in list(raw_datasets.keys()):
  for focus_name, focus_code in zip(focus_names, focus_codes):
    print("Dataset:", dataset_name)
    print("\tTrait:", focus_name)

    model = LogisticRegression(max_iter=1000)

    model.fit(preprocessed_dataset_dict[dataset_name, focus_name, "train_countVect"], preprocessed_dataset_dict[dataset_name, focus_name, "train"]["labels"])    
    y_predict = model.predict(preprocessed_dataset_dict[dataset_name, focus_name, "validation_countVect"])

    results = calculate_scores(preprocessed_dataset_dict[dataset_name, focus_name, "validation"]["labels"], y_predict, average = "macro")

    result_list.append([dataset_name, dataset_name, focus_name, focus_name] + results + [model_name, representation_name])
    
    for i, trait_value in enumerate(trait_values_dict[focus_code]):
      print("\t\tTrait Value:", trait_value, i)
      
      y_predict_class = np.zeros(y_predict.shape)
      y_predict_class[y_predict==i] = 1
      
      y_test_class = np.zeros(y_predict.shape)
      y_test_class[preprocessed_dataset_dict[dataset_name, focus_name, "validation"]["labels"]==i] = 1
      
      results = calculate_scores(y_test_class, y_predict_class, average = "binary")

      result_list.append([dataset_name, dataset_name, focus_name, trait_value] + results + [model_name, representation_name])

df_results = pd.DataFrame(result_list, columns=["Train Dataset", "Test Dataset", "Trait", "Trait Value", "Accuracy", "Precision", "Recall", "F1-Score", "Model", "Representation"])


Dataset: POWO
	Trait: Growth Form
		Trait Value: herb 0
		Trait Value: shrub 1
		Trait Value: tree 2
Dataset: POWO
	Trait: Life Form
		Trait Value: phanerophyte 0
		Trait Value: chamaephyte 1
		Trait Value: hemicryptophyte 2
		Trait Value: cryptophyte 3
		Trait Value: therophyte 4
Dataset: WIKI
	Trait: Growth Form
		Trait Value: herb 0
		Trait Value: shrub 1
		Trait Value: tree 2
Dataset: WIKI
	Trait: Life Form
		Trait Value: phanerophyte 0
		Trait Value: chamaephyte 1
		Trait Value: hemicryptophyte 2
		Trait Value: cryptophyte 3
		Trait Value: therophyte 4


In [27]:
df_results

Unnamed: 0,Train Dataset,Test Dataset,Trait,Trait Value,Accuracy,Precision,Recall,F1-Score,Model,Representation
0,POWO,POWO,Growth Form,Growth Form,0.884,0.795085,0.790811,0.792755,Logistic Regression,Bag of Words
1,POWO,POWO,Growth Form,herb,0.9176,0.94354,0.944565,0.944052,Logistic Regression,Bag of Words
2,POWO,POWO,Growth Form,shrub,0.8944,0.591716,0.613497,0.60241,Logistic Regression,Bag of Words
3,POWO,POWO,Growth Form,tree,0.956,0.85,0.814371,0.831804,Logistic Regression,Bag of Words
4,POWO,POWO,Life Form,Life Form,0.8664,0.828133,0.812781,0.819388,Logistic Regression,Bag of Words
5,POWO,POWO,Life Form,phanerophyte,0.9536,0.651685,0.682353,0.666667,Logistic Regression,Bag of Words
6,POWO,POWO,Life Form,chamaephyte,0.9496,0.806723,0.705882,0.752941,Logistic Regression,Bag of Words
7,POWO,POWO,Life Form,hemicryptophyte,0.9232,0.883549,0.935421,0.908745,Logistic Regression,Bag of Words
8,POWO,POWO,Life Form,cryptophyte,0.9656,0.91954,0.916031,0.917782,Logistic Regression,Bag of Words
9,POWO,POWO,Life Form,therophyte,0.9408,0.879167,0.824219,0.850806,Logistic Regression,Bag of Words


### Save Results

In [28]:
df_results.to_excel("Results//LR_Bow_Results.xlsx", index=False)