# Libraries & Functions

In [1]:
'''Math & Data Libraries'''
import numpy as np
import pandas as pd

In [2]:
'''ML Libraries'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
model_name = "Logistic Regression"
representation_name = "Bag of Words"

In [4]:
def calculate_scores(y_test, y_pred, average = "binary"):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = average)
    recall = recall_score(y_test, y_pred, average = average)
    f1 = f1_score(y_test, y_pred, average = average)
    return [accuracy, precision, recall, f1]

In [16]:
trait_values_dict = {}
trait_values_dict["1.2.1"] = ["herb", "shrub", "tree"]
trait_values_dict["1.3.1"] = ["obligatory", "terrestrial"]
trait_values_dict["1.4.1"] = ["obligatory", "self-supporting"]
trait_values_dict["2.1.1"] = ["annual", "perennial"]
trait_values_dict["2.3.1"] = ["phanerophyte", "chamaephyte", "hemicryptophyte", "cryptophyte", "therophyte"]

trait_names_cat = ["Growth Form", "Epiphyte", "Climber", "Lifecycle", "Life Form"]
traits_cat = ["1.2.1", "1.3.1", "1.4.1", "2.1.1", "2.3.1"]

# Input Data

## Plants of the World Online - POWO GIFT

In [17]:
df_POWO_GIFT = pd.read_excel("..//Data//Final Databases//POWO_GIFT.xlsx")

## Wikipedia - WIKI GIFT

In [18]:
df_WIKI_GIFT = pd.read_excel("..//Data//Final Databases//WIKI_GIFT.xlsx")
df_WIKI_GIFT = df_WIKI_GIFT[~df_WIKI_GIFT["BOW_description"].isna()]

# Model Training & Evaluation

## Text Representation - Bag of Words

In [19]:
countVect_datasets = dict()
for df_name, df in zip(["POWO", "WIKI"], [df_POWO_GIFT, df_WIKI_GIFT]): 
    corpus = df["BOW_description"].values
    count_vect = CountVectorizer(max_features = 1000)
    count_vect.fit(corpus)

    countVect_datasets[df_name] = count_vect.transform(corpus).toarray()

## Split Data

In [20]:
X_train = {}
X_test = {}

y_train = {}
y_test = {}

for df_name, df in zip(["POWO", "WIKI"], [df_POWO_GIFT, df_WIKI_GIFT]): 
  for focus_name, focus_code in zip(trait_names_cat, traits_cat):
      
    trait_mask = df[focus_code].notna()
    # y = {trait: 1*df[mask_dict[df_name, focus_name]][focus_code].apply(lambda x: trait in x) for trait in trait_values_dict[focus_code]}
    y = {trait: 1*df[trait_mask][focus_code].apply(lambda x: trait == x) for trait in trait_values_dict[focus_code]}

    X_train[df_name, focus_name], X_test[df_name, focus_name], \
    indices_train, indices_test \
    = train_test_split(countVect_datasets[df_name][trait_mask], 
                      np.arange(sum(trait_mask)), test_size=0.25, random_state=42)

    y_train[df_name, focus_name, focus_name] = np.zeros(len(X_train[df_name, focus_name]), dtype=int)
    y_test[df_name, focus_name, focus_name] = np.zeros(len(X_test[df_name, focus_name]), dtype=int)

    for i, trait_value in enumerate(trait_values_dict[focus_code]):
      # print(i, trait_value)
      y_train[df_name, focus_name, trait_value] = y[trait_value].values[indices_train]
      y_test[df_name, focus_name, trait_value] = y[trait_value].values[indices_test]

      y_train[df_name, focus_name, focus_name] += y_train[df_name, focus_name, trait_value] * i
      y_test[df_name, focus_name, focus_name] += y_test[df_name, focus_name, trait_value] * i

## Logistic Regression Model

In [21]:
result_list = []
for df_name, df in zip(["POWO", "WIKI"], [df_POWO_GIFT, df_WIKI_GIFT]): 
  for focus_name, focus_code in zip(trait_names_cat, traits_cat):
    print("Dataset:", df_name)
    print("\tTrait:", focus_name)

    model = LogisticRegression(max_iter=1000)

    model.fit(X_train[df_name, focus_name], y_train[df_name, focus_name, focus_name])    
    y_predict = model.predict(X_test[df_name, focus_name])

    results = calculate_scores(y_test[df_name, focus_name, focus_name], y_predict, average = "macro")

    result_list.append([df_name, df_name, focus_name, focus_name] + results + [model_name, representation_name])
    
    for i, trait_value in enumerate(trait_values_dict[focus_code]):
      print("\t\tTrait Value:", trait_value, i)
      
      y_predict_class = np.zeros(y_predict.shape)
      y_predict_class[y_predict==i] = 1
      
      y_test_class = np.zeros(y_predict.shape)
      y_test_class[y_test[df_name, focus_name, focus_name]==i] = 1
      
      results = calculate_scores(y_test[df_name, focus_name, trait_value], y_predict_class, average = "binary")

      result_list.append([df_name, df_name, focus_name, trait_value] + results + [model_name, representation_name])

df_results = pd.DataFrame(result_list, columns=["Train Dataset", "Test Dataset", "Trait", "Trait Value", "Accuracy", "Precision", "Recall", "F1-Score", "Model", "Representation"])


Dataset: POWO
	Trait: Growth Form
		Trait Value: herb 0
		Trait Value: shrub 1
		Trait Value: tree 2
Dataset: POWO
	Trait: Epiphyte
		Trait Value: obligatory 0
		Trait Value: terrestrial 1
Dataset: POWO
	Trait: Climber
		Trait Value: obligatory 0
		Trait Value: self-supporting 1
Dataset: POWO
	Trait: Lifecycle
		Trait Value: annual 0
		Trait Value: perennial 1
Dataset: POWO
	Trait: Life Form
		Trait Value: phanerophyte 0
		Trait Value: chamaephyte 1
		Trait Value: hemicryptophyte 2
		Trait Value: cryptophyte 3
		Trait Value: therophyte 4
Dataset: WIKI
	Trait: Growth Form
		Trait Value: herb 0
		Trait Value: shrub 1
		Trait Value: tree 2
Dataset: WIKI
	Trait: Epiphyte
		Trait Value: obligatory 0
		Trait Value: terrestrial 1
Dataset: WIKI
	Trait: Climber
		Trait Value: obligatory 0
		Trait Value: self-supporting 1
Dataset: WIKI
	Trait: Lifecycle
		Trait Value: annual 0
		Trait Value: perennial 1
Dataset: WIKI
	Trait: Life Form
		Trait Value: phanerophyte 0
		Trait Value: chamaephyte 1
		

In [22]:
df_results

Unnamed: 0,Train Dataset,Test Dataset,Trait,Trait Value,Accuracy,Precision,Recall,F1-Score,Model,Representation
0,POWO,POWO,Growth Form,Growth Form,0.905881,0.870886,0.871747,0.870791,Logistic Regression,Bag of Words
1,POWO,POWO,Growth Form,herb,0.942835,0.951145,0.952673,0.951908,Logistic Regression,Bag of Words
2,POWO,POWO,Growth Form,shrub,0.9123,0.731002,0.783217,0.756209,Logistic Regression,Bag of Words
3,POWO,POWO,Growth Form,tree,0.956974,0.929929,0.880435,0.904505,Logistic Regression,Bag of Words
4,POWO,POWO,Epiphyte,Epiphyte,0.968663,0.931377,0.900446,0.915151,Logistic Regression,Bag of Words
5,POWO,POWO,Epiphyte,obligatory,0.968032,0.834703,0.840074,0.83738,Logistic Regression,Bag of Words
6,POWO,POWO,Epiphyte,terrestrial,0.968663,0.977822,0.987291,0.982534,Logistic Regression,Bag of Words
7,POWO,POWO,Climber,Climber,0.952464,0.886591,0.831004,0.856012,Logistic Regression,Bag of Words
8,POWO,POWO,Climber,obligatory,0.953692,0.801268,0.690346,0.741683,Logistic Regression,Bag of Words
9,POWO,POWO,Climber,self-supporting,0.952464,0.96557,0.982292,0.973859,Logistic Regression,Bag of Words


### Save Results

In [23]:
df_results.to_excel("..//Data//Results//LR_Bow_Results.xlsx", index=False)