In [1]:
import pandas as pd
import numpy as np
import collections
import json
import re
from tqdm import tqdm
import pickle

# Hook tqdm into pandas (adds the `progress_*` variants of apply/map).
# NOTE(review): none of the cells visible here call a `progress_*` method.
tqdm.pandas()

In [2]:
# Directory the two similarity tables are read from.
root = "../../../data/interim/"

# Each table uses its first CSV column as the row index and gets a "SIM"
# column recording which similarity measure produced it.
df_bert = pd.read_csv(
    f"{root}DF_GoogleForms_BERT_AllData.csv", index_col=0
).assign(SIM="Bert")

df_jacc = pd.read_csv(
    f"{root}DF_GoogleForms_Jaccard_AllData.csv", index_col=0
).assign(SIM="Jacc")

In [3]:
# Directory the pickled sentence dictionaries are read from.
root = "../../../data/processed/"

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that were produced by this project.
# Each file is opened in a `with` block so the handle is closed right after
# loading (the previous version left all three files open).
with open(F"{root}Sentences_Pierre.pkl", 'rb') as f:
    sentences_Pierre = pickle.load(f)
# The key views are what `datalist` tests species membership against.
species_Pierre = sentences_Pierre.keys()

with open(F"{root}Sentences_Andrei.pkl", 'rb') as f:
    sentences_Andrei = pickle.load(f)
species_Andrei = sentences_Andrei.keys()

with open(F"{root}Sentences_Kissling.pkl", 'rb') as f:
    sentences_Kissling = pickle.load(f)
species_Kissling = sentences_Kissling.keys()


In [4]:
def datalist(species):
    """Return the dataset label for `species`.

    The module-level collections are checked in a fixed order and the first
    one containing `species` decides the label:
    `species_Pierre` -> "PlantNet", `species_Andrei` -> "Caribbean",
    `species_Kissling` -> "Palm". An empty string is returned when the
    species is in none of them.
    """
    if species in species_Pierre:
        return "PlantNet"
    if species in species_Andrei:
        return "Caribbean"
    if species in species_Kissling:
        return "Palm"
    # Not found in any collection.
    return ""

def ascii_converter(text):
    """Clean mis-decoded characters and stray whitespace from a sentence.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a string (e.g. a NaN marking a
        missing sentence) is returned unchanged.

    Returns
    -------
    object
        The cleaned string, or `text` itself when it is not a string.

    The substitutions run in a deliberate order: character-level repairs
    first, then whitespace collapsing, then removal of the space in front of
    "," and ":". Collapsing whitespace before the later repairs (as the
    previous ordering did) left runs of spaces behind whenever a repair
    inserted a space next to an existing one, and left " ," in place when a
    comma was preceded by more than one whitespace character.
    """
    if not isinstance(text, str):
        return text

    patterns = [
        # The byte sequences below look like UTF-8 text decoded as Latin-1.
        (r"â\x80\x93", " to "),   # en dash used as a range separator
        (r"â\x80\x94", " "),      # em dash
        (r"\xa0", " "),           # non-breaking space
        (r"Â", ""),               # stray lead byte left by the mis-decoding
        (r"\sÃ\x97", ","),        # whitespace + multiplication sign
        # Normalise spacing only after every repair that can add spaces.
        (r"\s+", " "),
        (r"\s([,:])", r"\1"),     # no space before "," or ":"
    ]
    for (pattern, replacement) in patterns:
        text = re.sub(pattern, replacement, text)

    # Drop any leading full stops.
    return text.lstrip(".")

In [5]:
# Stack the BERT and Jaccard tables. Each keeps its own row labels, so a
# label can occur more than once in the combined frame.
df = pd.concat([df_bert, df_jacc])

# Label every row with the dataset its species belongs to ("" when unknown).
df["Dataset"] = df["Species"].apply(datalist)

# Sentence columns whose text is cleaned below.
# Note to self: Could be done using Pandas
columns = ["1", "2", "3", "4", "5"]

# Run the regex clean-up over each sentence column in turn.
for column in columns:
    cleaned = df[column].apply(ascii_converter)
    df.loc[:, column] = cleaned

In [6]:
# Spot-check the cleaned text in column "2" for row label 8699. The label
# occurs twice in `df` (two rows in the output below), so this evaluates to a
# Series rather than a single string.
df.loc[8699, "2"]

8699    Fruit covering: fleshy, 5-winged capsule.
8699                        Fruit length: ½ inch.
Name: 2, dtype: object

In [13]:
# Rows per exported file, number of files per dataset, and the base seed
# (file i is drawn with random_state = SEED_BASE + i).
SAMPLE_SIZE = 20
N_SAMPLE_FILES = 10
SEED_BASE = 333


def _top_quartile_rows(subset):
    """Return the rows of `subset` above its own 75th percentile on either score.

    The cut-offs are computed per column on `subset` itself: rows with
    "Sim Sum" strictly above that column's 75% quantile come first, followed
    by rows with "Jaccard Sum" strictly above that column's 75% quantile.
    `Series.quantile(0.75)` is the value `describe()` reports as "75%".
    """
    bert_cutoff = subset["Sim Sum"].quantile(0.75)
    jacc_cutoff = subset["Jaccard Sum"].quantile(0.75)
    return pd.concat(
        [
            subset[subset["Sim Sum"] > bert_cutoff],
            subset[subset["Jaccard Sum"] > jacc_cutoff],
        ]
    )


# CARIBBEAN
df_Caribbean = df[df["Dataset"] == "Caribbean"]
df_Caribbean_75 = _top_quartile_rows(df_Caribbean)

# PALMS
df_Palm = df[df["Dataset"] == "Palm"]
df_Palm_75 = _top_quartile_rows(df_Palm)

# PLANTNET
df_PlantNet = df[df["Dataset"] == "PlantNet"]
df_PlantNet_75 = _top_quartile_rows(df_PlantNet)


name = "Dataset"
# name = "top_sents_all_AllSentencesAgainstTrait_Random20Subset"
folder = "../../../data/processed/"

# Insertion order fixes the order in which files are written for each i
# (Caribbean, Palm, PlantNet), so `df_sample` ends up holding the last
# PlantNet draw once the loop finishes.
top_rows_by_dataset = {
    "Caribbean": df_Caribbean_75,
    "Palm": df_Palm_75,
    "PlantNet": df_PlantNet_75,
}

for i in range(1, N_SAMPLE_FILES + 1):
    for dataset_label, top_rows in top_rows_by_dataset.items():
        # The same seed is used for all three datasets within one round.
        df_sample = top_rows.sample(n=SAMPLE_SIZE,
                                    random_state=SEED_BASE + i,
                                    replace=False)

        # Tab-separated despite the .csv extension.
        df_sample.to_csv(f"{folder}{name}_{dataset_label}_{i}.csv", sep="\t")


In [None]:
# DISABLED CELL: everything below is commented out and does not run.
# It thresholds df_bert / df_jacc as a whole (not per dataset) and writes
# "top_sents_all_AllSentencesAgainstTrait_Random20Subset_*" files.
# NOTE(review): appears to be an earlier version of the per-dataset export;
# confirm before deleting.

# JACC_Thresh = df_jacc.describe().loc["75%", "Jaccard Sum"]
# BERT_Thresh = df_bert.describe().loc["75%", "Sim Sum"]

# df_bert_sample = df_bert[df_bert["Sim Sum"] > BERT_Thresh]\
#     # .sample(n=20, replace=False, random_state=333)

# df_jacc_sample = df_jacc[df_jacc["Jaccard Sum"] > JACC_Thresh]\
#     # .sample(n=20, replace=False, random_state=333)

# df = pd.concat(
#     [df_bert_sample, df_jacc_sample]
# )

# folder = "../../../data/processed/"

# df.sample(n=20, random_state=123, replace=False)\
#     .to_csv(f"{folder}top_sents_all_AllSentencesAgainstTrait_Random20Subset_demo.csv", sep="\t")

# for i in range(1, 11):
#     df_sample = df.sample(n=20, 
#                         random_state=333 + i, 
#                         replace=False)

#     df_sample.to_csv(f"{folder}top_sents_all_AllSentencesAgainstTrait_Random20Subset_{i}.csv", sep="\t")

In [9]:
# Display whatever `df_sample` currently holds. After the export loop has
# run, that is its final assignment there (the PlantNet draw).
df_sample

Unnamed: 0,Species,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5,GoogleSentence,Sim Sum,SIM,Jaccard Sum,Dataset
3577,Ficus vogelii,crown,crown,"['Regular density of the crown', 'Leaves group...",The stipule of Ficus religiosa.,"The leaves of the plants are simple, entire or...",All plant parts of species belonging to the ge...,The male flowers have a perianth consisting of...,Flowers are found on the same or different pla...,Crown: regular density of the crown,,Jacc,0.027154,PlantNet
7441,Vernonia amygdalina,petiole,petiole,['Petiolated leaf'],Leaves with petioles 0.2 to 1.5 cm.,"Morphology Capitula very numerous in terminal,...","Leaves alternate, simple, stipules absent, pet...","Inflorescence a head, arranged in terminal, co...","Leaves petiolate, elliptic, lanceolate or ovat...",Petiole: petiolated leaf,0.919212,Bert,,PlantNet
4179,Hexalobus monopetalus,vein,vein,['Pinnate venation with secondary veins connec...,The leaves have 6-15 pairs of secondary veins ...,"Leaves alternate, held in one plane, oblong-el...","Petals yellow or cream, thin, subequal, the in...",The margins of the inner petals are curved bac...,"Sepals ovate-elliptic, 4 to 6 mm.",Vein: pinnate venation with secondary veins co...,0.888415,Bert,,PlantNet
271,Acacia mellifera,bark,bark,"['Smooth bark', 'Finely fissured bark']",Bark light to dark grey and longitudinally fis...,The bark is purplish-black on the young stems ...,The bark varies from light to dark grey and is...,,,Bark: smooth bark,,Jacc,0.035969,PlantNet
6930,Tamarindus indica,trunk and root,trunk and root,['Base of trunk straight'],Straight trunk with pendulous branches.,The seed consists of 30% hard brown seed coat ...,A graceful ornamental tree with racemes of pea...,Leaves are alternate and paripinnate-each with...,"Of the five petals, the posterior and lateral ...",Trunk and root: base of trunk straight,,Jacc,0.023115,PlantNet
10,Acacia amythethophylla,leaf margin,leaf margin,['Smooth leaf margin'],"Stipules spinescent, stout, brown, glossy, com...","Pinnae sometimes 20 or more pairs, with 50 or ...","Leaves large, pinnae 7-14 pairs.","Leaves large, 10 to 20 cm.","Leaves large, 10-20 cm.",Leaf margin: smooth leaf margin,0.889487,Bert,,PlantNet
7292,Tinospora bakis,inflorescences,inflorescences,['Rootlet or cluster of flowers or inflorescen...,"Inflorescence an axillary, elongate false race...",Twiner with perennial rootstock.,Liane with branchlets glabrous and verrucose w...,Female flowers in 7 to 12 cm long inflorescenc...,"Woody climber, branches glabrous, warty with l...",Inflorescences: rootlet or cluster of flowers ...,0.919065,Bert,,PlantNet
7305,Trema orientalis,petiole,petiole,['Petiolated leaf'],"Stamens 5, stigmas villous, reflexed in fruits.","The leaves are simple, alternate, stipulate al...","Axillary, pubescent fascicled cymes, sometimes...","Leaves: Ovate-lanceolate, acuminate at apex, r...",Seeds minute with membranous testa.,Petiole: petiolated leaf,0.910026,Bert,,PlantNet
2217,Cordia africana,leaf blade,leaf blade,"['lanceolate blade', 'elliptical blade', 'asym...",The leaf undersides are hairy brown.,The leaf margin is notched smooth or only slig...,"The simple, stiff leathery leaf blades are ova...",,,Leaf blade: lanceolate blade,,Jacc,0.026635,PlantNet
317,Acacia polyacantha ssp. campylacantha,leaf shape,leaf shape,['Bipinnate leaf'],"Leaves: Paripinnate, alternate, axis 4-20 cm l...","Flowers: Bisexual, 4 mm across, white, in leaf...","Fruit/seeds: Pod, stipitate, 8 x 1.5 cm, hairl...","Stipular spines are infra-in leaf-axils, 8-10 ...","Growth Habit - Tree, Shrub.",Leaf shape: bipinnate leaf,0.910576,Bert,,PlantNet


In [10]:
# Re-draw a Caribbean sample for inspection. n=20, replace=False and
# random_state=334 (= 333 + 1) are the arguments the export loop uses for its
# first Caribbean file.
df_Caribbean_75.sample(n=20, random_state=334, replace=False)

Unnamed: 0,Species,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5,GoogleSentence,Sim Sum,SIM,Jaccard Sum,Dataset
8699,Guaiacum sanctum,Fruit type,Fruit type,['capsule'],Fruit color: yellow orange.,"Fruit covering: fleshy, 5-winged capsule.",Fruit length: ½ inch.,Flower color: bluish purple.,"Seeds brown or black, ellipsoid, 10 to 11 mm.",Fruit type: capsule,0.913161,Bert,,Caribbean
8632,Schoepfia schreberi,Fruit shape,Fruit shape,['globose'],"Spurious fruit ellipsoid, 10-13 mm.",Spurious fruit with the drupe enclosed in the ...,,,,Fruit shape: globose,,Jacc,0.031385,Caribbean
8618,Schoepfia schreberi,Leaf upper side,Leaf upper side,['glabrous'],"Leaves alternate, entire, coriaceous.","Leaf blades lanceolate, ovate, or elliptic, 4 ...","Leaves ovate to lanceolate, 4-8 cm.","Spurious fruit ellipsoid, 10-13 mm.","Roots with small white haustoria, to 3 mm across.",Leaf upper side: glabrous,0.942973,Bert,,Caribbean
7927,Bourreria succulenta,Fruit colour,Fruit colour,"['orange, yellow, red']","Fruits: Orange-red to saffron, attractive, les...",The fruit is a.berry that turns orange-red at ...,"Flowers: White, with five white fused petals.",Leaf stems are yellow-green.,"Fruit turning from green to redorange, 5-12 mm...","Fruit colour: orange, yellow, red",0.910854,Bert,,Caribbean
8545,Randia aculeata,Fruit colour,Fruit colour,"['green, black, whitish']",Fruit color: white.,Fruit shape: oval.,Fruit cover: fleshy.,"Berry globose to ovoid, 1-1.2 cm long, leather...",Fruit length:.5 to 1 inch.,"Fruit colour: green, black, whitish",0.950226,Bert,,Caribbean
8660,Guaiacum officinale,Leaf margin,Leaf margin,['entire'],"Leaf Texture: Smooth, veiny undersurface.","Leaf Type: Once Pinnate, Even-Pinnate, Leaflet...","It has obtuse, ovate or oblong and entire marg...",Each leaf has 2-3 pair of smooth and stalkless...,"Guaiacum is a slow growing, broad leaf and sma...",Leaf margin: entire,,Jacc,0.026728,Caribbean
8296,Vachellia tortuosa,Petals / corolla shape,Petals / corolla shape,['tubular'],"Yellow flowers, stamens numerous.","Leaves: compound, 4-8 pairs of segments, 15-20...",The calyx and corolla are usually 4 to 5-lobed.,"Fruit: a slender moniliform, slightly curved.",Stipular spines: may be fused at bases.,Petals / corolla shape: tubular,0.914081,Bert,,Caribbean
8450,Coccoloba uvifera,Petals / corolla shape,Petals / corolla shape,['tepals'],"Flowers: tepals round to broadly elliptic, mar...","Pistillate flowers: tube obpyriform, 12-20, 8-...",Flowers: Inconspicuous white flowers appear on...,"Inflorescences 10-30 cm, puberulent or glabrou...",Morphology Stem: shoot erect.,Petals / corolla shape: tepals,0.930762,Bert,,Caribbean
8529,Randia aculeata,Leaf margin,Leaf margin,['entire'],Leaf venation: pinnate.,Leaf type: simple.,Leaf shape: orbiculate.,Leaf margin: entire.,Leaf color: green.,Leaf margin: entire,,Jacc,0.066667,Caribbean
8260,Pithecellobium unguis-cati,Leaf composition,Leaf composition,"['bi-pinnate, 2 leaflets per jug']",Alternate hairless leaves have petioles 2 to 5...,The leaves are abruptly bipinnate with a singl...,Broadleaved Perennial Seed propagated Tree Woody.,The stems and branches are gray and nearly smo...,The leaflets are 25-56 mm long and 9-32 mm wid...,"Leaf composition: bi-pinnate, 2 leaflets per jug",0.926979,Bert,,Caribbean
