**Notebook to preprocess the initial scraped Plants of the World Online and Wikipedia descriptions.**

This is done by removing unrelated information and processing the text using NLP and the following pipeline:
* Remove unnecessary information such as description categories that are not relevant to the task
* Remove artifacts & accents from text
* Detect non-English descriptions and translate to English
* Preprocess by lowercasing and tokenizing, optionally removing numbers, punctuation and stopwords

# Libraries & Functions

In [1]:
'''Math & Data Libraries'''
import numpy as np
import pandas as pd

In [2]:
''' Miscellaneous Libraries'''
from tqdm import tqdm

In [4]:
'''NLP Libraries'''
import nltk
from nltk.corpus import stopwords
import string

from langdetect import detect
from deep_translator import GoogleTranslator, DeeplTranslator, batch_detection

import unidecode
import unicodedata

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## NLP Functions

In [5]:
def remove_accents(text):
    """
    Remove accents from text. 
    ---
    Parameters
    ----------
    text : str
        text from which the accents should be removed

    Returns
    -------
    text_removed_accents : str
        text with removed accents 
    """
    return ''.join(x for x in unicodedata.normalize('NFKD', text) if x in string.ascii_letters or x == " " or x in string.digits or x in string.punctuation)

def remove_artifacts(text):
    """
    Remove artifacts from text. 
    ---
    Parameters
    ----------
    text : str
        text from which artifacts should be removed

    Returns
    -------
    text_removed_artifacts : str
        text with removed artifacts
    """
    return str(text).replace("nbsp", "").replace("<p>", "").replace("</p>", "").replace("<i>", "").replace("</i>", "").replace("<b>", "").replace("</b>", "").replace("&","").replace("_x000D_", "").replace("\"", "").replace("\'", "").replace("<em>", "").replace("</em>", "").replace("<br>", "").replace("</br>", "")

def preprocess_description(text, remove_numbers = True, remove_stopwords = False):
    """
    Preprocess text with the following steps:
        1. Lowercasing
        2. Removal of digits (if remove_numbers = True)
        3. Removal of punctuation (if remove_numbers = True)
        4. Removal of stopwords (if remove_stopwords = True) 
        5. Tokenization and join to fix issues with blank space
    ---
    Parameters
    ----------
    text : str
        text from which artifacts should be removed
    remove_numbers : boolean
        whether to remove or keep digits in the text
    remove_stopwords : boolean
        whether to remove or stopwords in the text

    Returns
    -------
    preprocessed_text : str
        preprocessed_text
    """
    text = text.lower()
    if remove_numbers:
        text = text.translate(str.maketrans('', '', string.digits))
        text = text.translate(str.maketrans('', '', string.punctuation))
       
    tokens = nltk.word_tokenize(text)
    text = " ".join(tokens)

    if remove_numbers:
        tokens_long = []
        for token in tokens:
            if len(token)>1:
                tokens_long.append(token)
        text = ' '.join(tokens_long)

    if remove_stopwords:
        stopwords_ = stopwords.words('english')
        tokens = nltk.word_tokenize(text)
        tokens_stop_words = []
        for token in tokens:
            if token not in stopwords_:
                tokens_stop_words.append(token)
        text = ' '.join(tokens_stop_words)

    return text

In [6]:
def detect_language(text):
    """
    Detect language of the text through the use of term search. 
    ---
    Parameters
    ----------
    text : str
        text from which the accents should be removed

    Returns
    -------
    lang : str
        code for the detected language. One of "en", "es", "fr", "la", "de" 
    """
    for term in latin_terms:
        if(term.lower() in text.lower().translate(str.maketrans('', '', string.punctuation))):
            return "la"
    for term in spanish_terms:
        if(term.lower() in text.lower().translate(str.maketrans('', '', string.punctuation))):
            return "es"
    for term in french_terms:
        if(term.lower() in text.lower().translate(str.maketrans('', '', string.punctuation))):
            return "fr"
    for term in german_terms:
        if(term.lower() in text.lower().translate(str.maketrans('', '', string.punctuation))):
            return "de"
    return "en"

## Variables

In [7]:
"""Terms used to detect non-English descriptions."""
spanish_terms = ["matorrales", "Prados", "bipartidos", "hierba", "arbol", "arbusto", "trepadora", "epifita", "parasita", "acuatica", "solitaria"]
french_terms = ["poussant", " ou "]
german_terms = ["Pflanze", " lang ", "kriechend", "sehr",]
latin_terms = ["contractis", "inflorescentiis", "longitudine", "ceteris", " longis ", "habitu ", "staminibus", "curvato", " ab ", "duplo", " nova ", "haec", " Planta ", "Planta minor", " epiphytica", "ad margines", "varietate", " magna ", "plerumque", "temporaliter", "dormiens", " Herba ", "oblonga", "Internodia"]

# Input Data

## Plants of the World Online

In [2]:
df_POWO = pd.read_excel("..//Data//Initial Databases//POWO_orig_descriptions.xlsx")

In [3]:
df_POWO

Unnamed: 0,POWO_id,description,source,name,authors,i,ID,fqId,created,modified,language,creator
0,morphologyLeaf,Leaves anisophyllous; lamina 12 – 45 × 6 – 15&...,"Kelbessa, E. 2009. Three new species of Acanth...",Acanthopale aethiogermanica,Ensermu,1,77098516-1,urn:lsid:ipni.org:names:77098516-1,,,,
1,conservation,Acanthopaleaethio-germanica is a widespread ta...,"Kelbessa, E. 2009. Three new species of Acanth...",Acanthopale aethiogermanica,Ensermu,1,77098516-1,urn:lsid:ipni.org:names:77098516-1,,,,
2,morphologyReproductiveFruit,"Capsule 14 – 16 × 5.6 – 6.5&nbsp;mm, glabrous,...","Kelbessa, E. 2009. Three new species of Acanth...",Acanthopale aethiogermanica,Ensermu,1,77098516-1,urn:lsid:ipni.org:names:77098516-1,,,,
3,morphologyReproductiveInflorescenceSpikelet,"Spikes axillary, with (1 –) 2 – 3 (– 4) flower...","Kelbessa, E. 2009. Three new species of Acanth...",Acanthopale aethiogermanica,Ensermu,1,77098516-1,urn:lsid:ipni.org:names:77098516-1,,,,
4,morphologyReproductiveFlowerGynoeciumOvary,"Ovary 3.7 – 4.5&nbsp;mm long, partially enclos...","Kelbessa, E. 2009. Three new species of Acanth...",Acanthopale aethiogermanica,Ensermu,1,77098516-1,urn:lsid:ipni.org:names:77098516-1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
412279,morphologyReproductiveFlower,"Flowers white, very fragrant.","Solanaceae, H. heine. Flora of West Tropical A...",Datura candida,Saff.,433345,76934-2,urn:lsid:ipni.org:names:76934-2,,,,
412280,morphologyGeneralHabit,Young branches and leaves rusty; leaves slight...,"Apocynaceae, E.A. Omino. Flora of Tropical Eas...",Beaumontia grandiflora,Wall.,19449,77539-1,urn:lsid:ipni.org:names:77539-1,,,,
412281,distribution,Colombia,"Bernal, R., Gradstein, S.R. & Celis, M. (eds.)...",Puya pastoensis,André,433583,124523-1,urn:lsid:ipni.org:names:124523-1,,,,
412282,morphologyGeneralHabit,"Young plants to 3 m high, with branches at fir...","M. Thulin et al. Flora of Somalia, Vol. 1-4 [u...",Euphorbia robecchii,Pax,129543,348073-1,urn:lsid:ipni.org:names:348073-1,,,,


In [10]:
print("Initial Number of Categories:", df_POWO["POWO_id"].nunique())

Initial Number of Categories: 260


### Filter out unnecessary categories
Conservation, Cytology, Distribution, Figure, Reference, Type, Vernacular, Diagnostic, Ecology

In [11]:
unnecessary_categories = ["conservation", "cytology", "distribution", "figure", "reference", "type", "vernacular", "diagnostic", "ecology"]

In [12]:
category_mask = np.zeros(df_POWO.shape[0], dtype=bool)
for un_cat in unnecessary_categories:
    category_mask += df_POWO["POWO_id"]==un_cat
    print("Category:", un_cat, "Filtered rows:", np.sum(df_POWO["POWO_id"]==un_cat))

print()
print("Total filtered out rows:", np.sum(category_mask))

Category: conservation Filtered rows: 30953
Category: cytology Filtered rows: 59
Category: distribution Filtered rows: 57788
Category: figure Filtered rows: 2871
Category: reference Filtered rows: 9524
Category: type Filtered rows: 3419
Category: vernacular Filtered rows: 5655
Category: diagnostic Filtered rows: 1271
Category: ecology Filtered rows: 12490

Total filtered out rows: 124030


In [14]:
df_POWO = df_POWO[~category_mask]

In [15]:
df_POWO

Unnamed: 0,POWO_id,description,source,name,authors,i,ID,fqId,created,modified,language,creator
0,morphologyLeaf,Leaves anisophyllous; lamina 12 – 45 × 6 – 15&...,"Kelbessa, E. 2009. Three new species of Acanth...",Acanthopale aethiogermanica,Ensermu,1,77098516-1,urn:lsid:ipni.org:names:77098516-1,,,,
2,morphologyReproductiveFruit,"Capsule 14 – 16 × 5.6 – 6.5&nbsp;mm, glabrous,...","Kelbessa, E. 2009. Three new species of Acanth...",Acanthopale aethiogermanica,Ensermu,1,77098516-1,urn:lsid:ipni.org:names:77098516-1,,,,
3,morphologyReproductiveInflorescenceSpikelet,"Spikes axillary, with (1 –) 2 – 3 (– 4) flower...","Kelbessa, E. 2009. Three new species of Acanth...",Acanthopale aethiogermanica,Ensermu,1,77098516-1,urn:lsid:ipni.org:names:77098516-1,,,,
4,morphologyReproductiveFlowerGynoeciumOvary,"Ovary 3.7 – 4.5&nbsp;mm long, partially enclos...","Kelbessa, E. 2009. Three new species of Acanth...",Acanthopale aethiogermanica,Ensermu,1,77098516-1,urn:lsid:ipni.org:names:77098516-1,,,,
5,morphologyReproductiveInflorescenceBract,"Bracts 2, linear to linear-oblanceolate, 8 – 1...","Kelbessa, E. 2009. Three new species of Acanth...",Acanthopale aethiogermanica,Ensermu,1,77098516-1,urn:lsid:ipni.org:names:77098516-1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
412274,morphologyGeneralHabit,Woody-based Thesium-like herb with several ere...,"Rubiaceae, B. Verdcourt. Flora Zambesiaca 5:1....",Manostachya staelioides,(K.Schum.) Bremek.,302653,755812-1,urn:lsid:ipni.org:names:755812-1,,,,
412278,morphologyGeneralHabit,"Woody-stemmed herb, 4 ft. high","Papilionaceae, Hutchinson and Dalziel. Flora o...",Indigofera megacephala,J.B.Gillett,143886,499640-1,urn:lsid:ipni.org:names:499640-1,,,,
412279,morphologyReproductiveFlower,"Flowers white, very fragrant.","Solanaceae, H. heine. Flora of West Tropical A...",Datura candida,Saff.,433345,76934-2,urn:lsid:ipni.org:names:76934-2,,,,
412280,morphologyGeneralHabit,Young branches and leaves rusty; leaves slight...,"Apocynaceae, E.A. Omino. Flora of Tropical Eas...",Beaumontia grandiflora,Wall.,19449,77539-1,urn:lsid:ipni.org:names:77539-1,,,,


## Wikipedia

In [16]:
df_WIKI = pd.read_excel("..//Data//Initial Databases//WIKI_orig_descriptions.xlsx")

In [17]:
df_WIKI

Unnamed: 0,WIKI_id,description,source,name,Date Retrieved,Binomial Name
0,Summary,Aa achalensis is a species of orchid in the ge...,"Schltr., 1920",Aa achalensis,01/07/2022,Aa achalensis
1,Summary,Aa argyrolepis is an orchid in the genus Aa. ...,"Rchb.f., 1854",Aa argyrolepis,01/07/2022,Aa argyrolepis
2,References,"\nReichenbach, H.G. (1854) Xenia Orchidacea 1:...","Rchb.f., 1854",Aa argyrolepis,01/07/2022,Aa argyrolepis
3,Summary,Aa aurantiaca is a species of orchid in the ge...,D. Trujillo (2011)[1],Aa aurantiaca,01/07/2022,Aa aurantiaca
4,Summary,Aa calceata is a species of orchid in the genu...,"Schltr., 1912",Aa calceata,01/07/2022,Aa calceata
...,...,...,...,...,...,...
194989,Distribution,"Native to West Tropical Africa, found in Niger...",(Pax) Mildbr.,Zygotritonia bongensis,09/07/2022,Zygotritonia bongensis
194990,Summary,Zyzyxia is a genus of tropical shrubs in the f...,"(H.Robinson) Strother, 1991",Zyzyxia lundellii,09/07/2022,Zyzyxia lundellii
194991,Description and distribution,Zyzyxia is a shrub that grows to 3 meters tall...,"(H.Robinson) Strother, 1991",Zyzyxia lundellii,09/07/2022,Zyzyxia lundellii
194992,Naming,"Around 1990, John L. Strother was revising the...","(H.Robinson) Strother, 1991",Zyzyxia lundellii,09/07/2022,Zyzyxia lundellii


In [18]:
print("Initial Number of Categories:", df_WIKI["WIKI_id"].nunique())

Initial Number of Categories: 7903


# Preprocessing

## 1. Remove artifacts and accents from text
Such as $<p>, \&nbsp, <i>, <b>, <br>, <em>, _x000D_, ", ', $

In [19]:
df_POWO.loc[:, "prep_description_1"] = df_POWO["description"].apply(remove_artifacts)
df_POWO.loc[:, "prep_description_1"] = df_POWO["prep_description_1"].apply(unidecode.unidecode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_POWO.loc[:, "prep_description_1"] = df_POWO["description"].apply(remove_artifacts)


In [20]:
df_WIKI.loc[:, "prep_description_1"] = df_WIKI["description"].apply(remove_artifacts)
df_WIKI.loc[:, "prep_description_1"] = df_WIKI["prep_description_1"].apply(unidecode.unidecode)

## 2. Translate Descriptions to English
Only done for Plants of the World Online as in the Wikipedia dataset the English language websites are the only ones that were scraped.

### Detect Description Language

In [21]:
df_POWO["Language"] = df_POWO["prep_description_1"].apply(detect_language) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_POWO["Language"] = df_POWO["prep_description_1"].apply(detect_language)


In [22]:
df_POWO["Language"].value_counts()

Language
en    268220
es     19945
la        80
de         7
fr         2
Name: count, dtype: int64

In [23]:
df_WIKI["Language"] = "en" # As we know that all Wikipedia descriptions are in English, we just set the language flag to "en"

## Translate Non-English Descriptions
Keep note that this uses the GoogleTranslator API to translate the detected descriptions to English and thus has a limit on the amount of descriptions that can be translated in a defined time period.

In [27]:
translated_list = []

language_mask = df_POWO["Language"]!="en" # We apply a mask so that we only translate the non-English descriptions
N_non_en = np.sum(language_mask)

# The translations are split into batches of size 100 for more efficient use of the translation
for i in tqdm(range(N_non_en//100 + 1)):
    i_min = 100*i
    i_max = min(100*(i+1), N_non_en)
    values_to_translate = list(df_POWO[language_mask]["prep_description_1"].values[i_min: i_max])
    translations = GoogleTranslator("auto", 'en').translate_batch(values_to_translate)
    translated_list.append(translations)

translations = [element for sublist in translated_list if len(sublist)>0 for element in sublist] # Transform the results from form (N_batches, Batch_size) to (N_descriptions)

df_POWO.loc[:, "translated_prep_description_1"] = df_POWO["prep_description_1"]
df_POWO.loc[language_mask, "translated_prep_description_1"] = translations

In [28]:
df_WIKI["translated_prep_description_1"] = df_WIKI["prep_description_1"] # As we don't do any translations for the English Wikipedia descriptions, we just copy the preprocessed descriptions to have the same data as the POWO dataset

## 3. Remove digits, punctuation, lowercase & tokenize
The final preprocessing pipeline results in three descriptions for the various models. 
- Categorical ML Models (BOW_description): Used for the bag of words logistic regression model, thus we apply the base preprocessing and remove stop words and digits.
- Categorical DL Models (BERT_description): Used for the sequence classification transformer models, thus we apply the base preprocessing and remove digits from the text.
- Numerical DL Models (QA_description): Used for question answering transformer models, thus we only apply the base preprocessing.

In [29]:
df_POWO.loc[:, "QA_description"] = df_POWO["translated_prep_description_1"].apply(lambda x: preprocess_description(x, False, False))
df_POWO.loc[:, "BERT_description"] = df_POWO["translated_prep_description_1"].apply(lambda x: preprocess_description(x, True, False))
df_POWO.loc[:, "BOW_description"] = df_POWO["translated_prep_description_1"].apply(lambda x: preprocess_description(x, True, True))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_POWO.loc[:, "QA_description"] = df_POWO["translated_prep_description_1"].apply(lambda x: preprocess_description(x, False, False))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_POWO.loc[:, "BERT_description"] = df_POWO["translated_prep_description_1"].apply(lambda x: preprocess_description(x, True, False))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/sta

In [30]:
df_WIKI.loc[:, "QA_description"] = df_WIKI["translated_prep_description_1"].apply(lambda x: preprocess_description(x, False, False))
df_WIKI.loc[:, "BERT_description"] = df_WIKI["translated_prep_description_1"].apply(lambda x: preprocess_description(x, True, False))
df_WIKI.loc[:, "BOW_description"] = df_WIKI["translated_prep_description_1"].apply(lambda x: preprocess_description(x, True, True))

# Save Data

In [642]:
df_POWO.to_excel("..//Data//Preprocessed Databases//POWO_preprocessed_descriptions.xlsx", index = False)
df_WIKI.to_excel("..//Data//Preprocessed Databases//WIKI_preprocessed_descriptions.xlsx", index = False)