# Libraries & Functions

In [1]:
'''Math & Data Libraries'''
import numpy as np
import pandas as pd

In [2]:
'''ML Libraries'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Labels appended to every score row; they fill the "Model" and
# "Representation" columns of the results table built later in this notebook.
model_name = "Logistic Regression"
representation_name = "Bag of Words"

In [4]:
def calculate_scores(y_test, y_pred, average = "binary"):
    """Score predictions against the ground truth with four standard metrics.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    average : str, default "binary"
        Averaging mode forwarded unchanged to precision, recall and F1
        (accuracy takes no averaging argument).

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]`` in exactly that order.
    """
    # The three metrics that share the `average` keyword, in output order.
    averaged_metrics = (precision_score, recall_score, f1_score)

    scores = [accuracy_score(y_test, y_pred)]
    for metric in averaged_metrics:
        scores.append(metric(y_test, y_pred, average = average))
    return scores

In [5]:
# Categorical traits modelled in this notebook: trait code -> admissible values.
# Currently disabled traits, kept for reference (names paired with codes by
# their order in the original commented-out lists):
#   "1.3.1" (Epiphyte):  ["obligatory", "terrestrial"]
#   "1.4.1" (Climber):   ["obligatory", "self-supporting"]
#   "2.1.1" (Lifecycle): ["annual", "perennial"]
trait_values_dict = {
    "1.2.1": ["herb", "shrub", "tree"],
    "2.3.1": [
        "phanerophyte",
        "chamaephyte",
        "hemicryptophyte",
        "cryptophyte",
        "therophyte",
    ],
}

# Human-readable trait names and their codes, listed in matching order.
trait_names_cat = ["Growth Form", "Life Form"]
traits_cat = ["1.2.1", "2.3.1"]

# Input Data

## Spanish Wikipedia - WIKI_GIFT_ESP Dataset

In [6]:
# Load the Spanish dataset and keep only rows that have a bag-of-words
# description; rows with a missing "BOW_description" are dropped.
df_WIKI_GIFT_ESP = (
    pd.read_excel("../Datasets//WIKI_GIFT_ESP.xlsx")
    .dropna(subset=["BOW_description"])
)

## German Wikipedia - WIKI_GIFT_DE Dataset

In [7]:
# Load the German dataset and keep only rows that have a bag-of-words
# description; rows with a missing "BOW_description" are dropped.
df_WIKI_GIFT_DE = (
    pd.read_excel("../Datasets//WIKI_GIFT_DE.xlsx")
    .dropna(subset=["BOW_description"])
)

# Model Training & Evaluation

## Text Representation - Bag of Words

In [8]:
# Dense bag-of-words matrix per dataset, keyed by dataset name.
# Each dataset gets its own vectorizer restricted to 1000 features.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split performed later, so test documents influence which
# features are kept - confirm this is acceptable for the experiment.
countVect_datasets = {}
for df_name, df in {"WIKI_ESP": df_WIKI_GIFT_ESP, "WIKI_DE": df_WIKI_GIFT_DE}.items():
    corpus = df["BOW_description"].values
    count_vect = CountVectorizer(max_features = 1000)
    # Learn the vocabulary and build the document-term counts in one pass.
    countVect_datasets[df_name] = count_vect.fit_transform(corpus).toarray()

## Split Data

In [9]:
# Train/test features keyed by (dataset name, trait name).
X_train = {}
X_test = {}

# Train/test labels keyed by (dataset name, trait name, target), where target is
#   - the trait name itself -> multiclass label (index into the trait's value list)
#   - a single trait value  -> binary indicator for that value
y_train = {}
y_test = {}

for df_name, df in zip(["WIKI_ESP", "WIKI_DE"], [df_WIKI_GIFT_ESP, df_WIKI_GIFT_DE]):
    for focus_name, focus_code in zip(trait_names_cat, traits_cat):
        trait_values = trait_values_dict[focus_code]

        # Keep only rows labelled with one of the modelled values. A plain
        # not-null mask would also keep rows carrying any other value; all of
        # their indicators are 0, so the multiclass label built below would
        # silently become class 0. `isin` is also False for missing values.
        trait_mask = df[focus_code].isin(trait_values)
        labels = df.loc[trait_mask, focus_code]

        # One 0/1 indicator vector per trait value, aligned with the masked rows.
        y = {trait: (labels == trait).to_numpy().astype(int) for trait in trait_values}

        # Split the features together with positional indices so the very same
        # partition can be applied to every label vector afterwards.
        X_train[df_name, focus_name], X_test[df_name, focus_name], \
        indices_train, indices_test \
        = train_test_split(countVect_datasets[df_name][trait_mask.to_numpy()],
                           np.arange(int(trait_mask.sum())),
                           test_size=0.25, random_state=42)

        y_train[df_name, focus_name, focus_name] = np.zeros(len(indices_train), dtype=int)
        y_test[df_name, focus_name, focus_name] = np.zeros(len(indices_test), dtype=int)

        for i, trait_value in enumerate(trait_values):
            y_train[df_name, focus_name, trait_value] = y[trait_value][indices_train]
            y_test[df_name, focus_name, trait_value] = y[trait_value][indices_test]

            # Each kept row matches exactly one value, so summing i * indicator
            # yields that value's position in the list as the class label.
            y_train[df_name, focus_name, focus_name] += i * y_train[df_name, focus_name, trait_value]
            y_test[df_name, focus_name, focus_name] += i * y_test[df_name, focus_name, trait_value]

## Logistic Regression Model

In [10]:
# One score row per (dataset, trait) for the multiclass model, followed by one
# row per trait value scored one-vs-rest from the same multiclass predictions.
result_list = []
for df_name in ["WIKI_ESP", "WIKI_DE"]:
    for focus_name, focus_code in zip(trait_names_cat, traits_cat):
        print("Dataset:", df_name)
        print("\tTrait:", focus_name)

        # A single multiclass model per dataset/trait pair.
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train[df_name, focus_name], y_train[df_name, focus_name, focus_name])
        y_predict = model.predict(X_test[df_name, focus_name])

        # Overall score for the trait, macro-averaged over its values.
        results = calculate_scores(y_test[df_name, focus_name, focus_name], y_predict, average = "macro")
        # Train and test dataset are the same here, hence df_name appears twice.
        result_list.append([df_name, df_name, focus_name, focus_name] + results + [model_name, representation_name])

        for i, trait_value in enumerate(trait_values_dict[focus_code]):
            print("\t\tTrait Value:", trait_value, i)

            # Class label i is the position of trait_value in the trait's value
            # list, so "predicted class i" is the one-vs-rest prediction for it.
            y_predict_class = (y_predict == i).astype(int)

            results = calculate_scores(y_test[df_name, focus_name, trait_value], y_predict_class, average = "binary")
            result_list.append([df_name, df_name, focus_name, trait_value] + results + [model_name, representation_name])

df_results = pd.DataFrame(result_list, columns=["Train Dataset", "Test Dataset", "Trait", "Trait Value", "Accuracy", "Precision", "Recall", "F1-Score", "Model", "Representation"])


Dataset: WIKI_ESP
	Trait: Growth Form
		Trait Value: herb 0
		Trait Value: shrub 1
		Trait Value: tree 2
Dataset: WIKI_ESP
	Trait: Life Form
		Trait Value: phanerophyte 0
		Trait Value: chamaephyte 1
		Trait Value: hemicryptophyte 2
		Trait Value: cryptophyte 3
		Trait Value: therophyte 4
Dataset: WIKI_DE
	Trait: Growth Form
		Trait Value: herb 0
		Trait Value: shrub 1
		Trait Value: tree 2
Dataset: WIKI_DE
	Trait: Life Form
		Trait Value: phanerophyte 0
		Trait Value: chamaephyte 1
		Trait Value: hemicryptophyte 2
		Trait Value: cryptophyte 3
		Trait Value: therophyte 4


In [11]:
# Display the collected score table as this cell's output.
df_results

Unnamed: 0,Train Dataset,Test Dataset,Trait,Trait Value,Accuracy,Precision,Recall,F1-Score,Model,Representation
0,WIKI_ESP,WIKI_ESP,Growth Form,Growth Form,0.807,0.756534,0.760734,0.758564,Logistic Regression,Bag of Words
1,WIKI_ESP,WIKI_ESP,Growth Form,herb,0.863,0.879017,0.864312,0.871603,Logistic Regression,Bag of Words
2,WIKI_ESP,WIKI_ESP,Growth Form,shrub,0.841,0.554348,0.569832,0.561983,Logistic Regression,Bag of Words
3,WIKI_ESP,WIKI_ESP,Growth Form,tree,0.91,0.836237,0.848057,0.842105,Logistic Regression,Bag of Words
4,WIKI_ESP,WIKI_ESP,Life Form,Life Form,0.673913,0.622649,0.610743,0.615316,Logistic Regression,Bag of Words
5,WIKI_ESP,WIKI_ESP,Life Form,phanerophyte,0.876254,0.794118,0.883178,0.836283,Logistic Regression,Bag of Words
6,WIKI_ESP,WIKI_ESP,Life Form,chamaephyte,0.879599,0.428571,0.375,0.4,Logistic Regression,Bag of Words
7,WIKI_ESP,WIKI_ESP,Life Form,hemicryptophyte,0.824415,0.580357,0.528455,0.553191,Logistic Regression,Bag of Words
8,WIKI_ESP,WIKI_ESP,Life Form,cryptophyte,0.899666,0.682927,0.622222,0.651163,Logistic Regression,Bag of Words
9,WIKI_ESP,WIKI_ESP,Life Form,therophyte,0.867893,0.627273,0.64486,0.635945,Logistic Regression,Bag of Words


### Save Results

In [12]:
# Write the score table to an Excel workbook without the row index.
# NOTE(review): presumably the "Results" folder must already exist relative to
# the working directory - confirm before running on a fresh checkout.
df_results.to_excel("Results//LR_Bow_Results.xlsx", index=False)