In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn import metrics
import json
import nltk
from nltk.stem.porter import PorterStemmer
# nltk.download('stopwords')
from nltk.corpus import stopwords
import collections
from itertools import groupby
import pickle
import re
import matplotlib.colors as mcolors

### Load the Data

In [95]:
# Folder that holds the externally supplied trait datasets
root = "../../data/external/"

# --- Pierre dataset: the CSV is read with a two-row header -----------------
file_name = root + 'Dataset_Pierre.csv'
df_Pierre = pd.read_csv(file_name, header=[0, 1])
# Drop the first column, then key the rows by the ('Species', 'species') column
df_Pierre = df_Pierre.iloc[:, 1:]
df_Pierre = df_Pierre.set_index(df_Pierre['Species']['species'])

# --- Andrei dataset ---------------------------------------------------------
file_name = root + 'Dataset_Andrei.csv'
df_Andrei = pd.read_csv(file_name)

# One-hot encode everything from the third column onwards so the layout
# resembles the Pierre frame, and key the rows by species again
df_Andrei_dummies = pd.get_dummies(df_Andrei.iloc[:, 2:]).set_index(df_Andrei['Species'])

# Pair every original column name with the dummy columns that contain it;
# the text after the last '_' of the dummy column name becomes the sub level.
# NOTE(review): this is a substring match, so it relies on no column name
# being contained in another column's dummy names - confirm against the CSV.
Andrei_multi_index = [
    (top_index, sub_index.split('_')[-1])
    for top_index in df_Andrei.columns
    for sub_index in df_Andrei_dummies.columns
    if top_index in sub_index
]

# Replace the flat dummy column names with the two-level index
df_Andrei_dummies.columns = pd.MultiIndex.from_tuples(Andrei_multi_index)

# --- Palm dataset (tab separated, Latin-1 encoded) --------------------------
file_name = root + 'Dataset_Kissling.txt'
df_Daniel = pd.read_csv(file_name, sep='\t', encoding='Latin-1')
# Species names of the rows that have no missing value in any column
palm_species = df_Daniel.loc[df_Daniel.notnull().all(axis=1), 'SpecName'].values
df_Daniel = df_Daniel.set_index('SpecName')

In [3]:
# English stop words from NLTK plus a few extra tokens to ignore
sw = list(stopwords.words('english'))
sw.extend(['like', 'color', 'colour', 'a', 'x'])

In [96]:
# Colour vocabulary used to recognise sentences that talk about colour.
# Tableau names have their first four characters removed
# (matplotlib stores them as 'tab:<name>').
colors = [color[4:] for color in mcolors.TABLEAU_COLORS.keys()]
colors.extend(mcolors.CSS4_COLORS.keys())
# '-ish' variants that are not part of the matplotlib tables.
# 'blackish' was previously misspelled as 'backish' (and listed twice).
colors.extend(
    [
        'whitish', 'bluish', 'reddish', 'greenish', 'blackish', 'greyish',
        'purplish', 'yellowish', 'orangish', 'brownish', 'pinkish'
    ]
)

# Trait vocabulary: maps every trait (key) to the keywords that indicate
# the trait is being described in a sentence.
# Fixes compared to the previous version:
#   * 'Petiole': a missing comma merged 'Winged' and 'Wings' into the single
#     string 'WingedWings', so neither keyword could ever match.
#   * 'Trunk' / 'Root': 'Silt' corrected to 'Stilt' (the Pierre dataset column
#     is 'Stilt roots').
traits = {
    'Life Form':
    [
        'Tree', 'Shrub', 'Bush', 'Ficus', 'Strangler', 'Liana', 'Parasitic', 'Palm', 'Herbaceous'
    ],
    'Trunk':
    [
        'Trunk', 'Straight', 'Flared', 'Foothills', 'Stilt', 'Aerial'
    ],
    'Root':
    [
        'Root', 'Straight', 'Flared', 'Foothills', 'Stilt', 'Aerial'
    ],
    'Latex':
    [
        'Latex'
    ],
    'Phyllotaxis': # Leaf Position
    [
        'Phyllotaxis', 'Alternate', 'Whorled', 'Whorls', 'Opposite'
    ],
    'Leaf Composition':
    [
        'Palmate', 'Pinnate', 'Entire', 'Bi-pinnate'
    ],
    'Crown':
    [
        'Crown'
    ],
    'Stem':
    [
        'Stem', 'Circular', 'Square'
    ],
    'Bark':
    [
        'Bark'
    ],
    'Bark Colour':
    [
        'Bark'
    ],
    'Leaf Shape':
    [
        'Simple', 'Bifoliate', 'Trifoliate', 'Digitized', 'Paripinnate', 'Unipinnate', 'Imperipinnate',
        'Alternate', 'Bipinnate', 'Pinnate', 'Elliptic', 'Elongate', 'Ovate', 'Round', 'Obovate', 'Lanceolate',
        'Kidney-shaped', 'Heart-shaped', 'Spathulate'
    ],
    'Petiole':
    [
        'Petiole', 'Sessile', 'Petiolated', 'Canaliculate', 'Glands', 'Glandular',
        'Winged', 'Wings', 'Hairs', 'Hair', 'Translucent'
    ],
    'Leaf Colour':
    [
        'Leaf Colour', 'Leaf Color'
    ],
    'Leaf Blade':
    [
        'Leaf Blade', 'Linear', 'Lanceolate', 'Elliptical', 'Obovate', 'Obtriangular',
        'Obtriangular', 'Asymmetrical', 'Orbicular', 'Bilobed', 'Lobed', 'Lobes', 'Lobe'
    ],
    'Leaf Base':
    [
        'Leaf Base', 'Rounded', 'Cordate', 'Glands'
    ],
    'Leaf Margin':
    [
        'Margin', 'Smooth', 'Wavy', 'Crenate', 'Toothed', 'Teeth', 'Crenate', 'Serrate'
    ],
    'Leaf Apex':
    [
        'Apex', 'Acuminate', 'Apiculate', 'Mucronate', 'Rounded', 'Emarginated'
    ],
    'Leaf side':
    [
        'Glabrous', 'Pubescent', 'Salt Crystals', 'Scales', 'Woolly', 'Powdery'
    ],
    'Leaf glands':
    [
        'Glands', 'Gland', 'Translucent'
    ],
    'Rachis':
    [
        'Rachis', 'Winged'
    ],
    'Vein':
    [
        'Vein'
    ],
    'Tendril':
    [
        'Tendril'
    ],
    'Spine':
    [
        'Spine', 'Prickle', 'Spines', 'Prickles'
    ],
    'Thornes':
    [
        'Thorn', 'Thornes'
    ],
    'Blade Colour':
    [
        'Blade'
    ],
    'Fruit':
    [
        'Drupe', 'Berry', 'Capsule', 'Pod', 'Follicle', 'Achene', 'Winged', 'Follicle',
        'Pod', 'Nutlet', 'Fruit'
    ],
    'Fruit Shape':
    [
        'locular', 'Globose', 'Flattened', 'Elongate', 'Obovoid', 'Ovate', 'Twisted',
        'Curved', 'Pyriform', 'Ovoid'
    ],
    'Fruit Colour':
    [
        'Fruit'
    ],
    'Inflorescences':
    [
        'Inflorescences', 'Inflorescence', 'Sessile', 'Panicle', 'Flower head', 'Cyme', 'Glomerule',
        'Fascicle', 'Umbel', 'Corymb', 'Rootlet', 'Spike', 'Dichasium', 'Fascicle',
        'Globose', 'Raceme', 'Fascicle', 'Umbel'
    ],
    'Sexuality':
    [
        'Sexuality', 'Axillary', 'Terminal'
    ],
    'Flower Colour':
    [
        'Flower colour', 'Flower color', 'Flower', 'Flowers'
    ],
    'Flower Shape':
    [
        'Flower shape', 'Petalled', 'Petal', 'Petals', 'Tubular', 'Apetal', 'Butterfly-shaped', 'Shaped', 'Flower', 'Flowers'
    ],
    'Sepal Shape':
    [
        'Sepal', 'Sepals', 'Connate'
    ],
    'Petal Shape':
    [
        'Petal', 'Petals', 'Tepals', 'Tepal', 'Tubular'
    ],
    'Aril Colour':
    [
        'Aril'
    ],
    'Seed Colour':
    [
        'Seed',
    ]
}

# Persist the trait dictionary and the colour list as JSON files
for json_path, payload in (
    ('../../data/supportive/traits_Pierre_and_Andrei.json', traits),
    ('../../data/supportive/colour_list.json', colors),
):
    with open(json_path, 'w') as f:
        json.dump(payload, f)

# Flat vocabulary: the trait names as written, followed by every keyword in lower case
traits_list = [
    *traits.keys(),
    *(trait.lower() for lst in traits.values() for trait in lst),
]

### Helper Functions

In [5]:
# English stop words from NLTK plus a few extra tokens to ignore
sw = [*stopwords.words('english'), 'like', 'color', 'colour', 'a', 'x']

In [97]:
def corresponding_keys(val, dictionary):
    """Find every key whose value collection contains ``val``.

    Args:
        val (string): item to look for inside the dictionary values
        dictionary (dict): mapping whose values are lists of strings

    Returns:
        list: the matching keys, in the dictionary's iteration order
              (empty when nothing matches)
    """
    return [key for key, values in dictionary.items() if val in values]
    
def jaccard_similarity(A, B):
    """Calculates the Jaccard similarity of two collections.

    The similarity is ``|A & B| / |A | B|``.

    Args:
        A (iterable): first collection (converted to a set)
        B (iterable): second collection (converted to a set)

    Returns:
        float: value between 0.0 and 1.0. Two empty collections are
               treated as identical and give 1.0 (previously this case
               raised ZeroDivisionError).
    """
    # Accept any iterable, not only sets
    A = set(A)
    B = set(B)

    union = A | B
    if not union:
        # Both inputs are empty: avoid dividing by zero
        return 1.0

    return len(A & B) / len(union)


def similarity(groundtruth, pred):
    """Calculates which fraction of the ground truth is present in the prediction.

    The result is ``|groundtruth & pred| / |groundtruth|``.

    Args:
        groundtruth (iterable): the expected items (converted to a set)
        pred (iterable): the predicted items (converted to a set)

    Returns:
        float: value between 0.0 and 1.0. An empty ground truth gives 0.0
               (previously this case raised ZeroDivisionError).
    """
    # Accept any iterable, not only sets
    groundtruth = set(groundtruth)
    pred = set(pred)

    if not groundtruth:
        # Nothing to recover: avoid dividing by zero
        return 0.0

    # Only the ground truth is used as denominator (not the union)
    return len(groundtruth & pred) / len(groundtruth)

## Retokenize the Data

In [98]:
def _split_on_species(triples):
    """Flatten one entry of the triples file and cut it at every 'species' token.

    Args:
        triples (list): list of token lists

    Returns:
        list: one token list per run of consecutive non-'species' tokens,
              each starting with a re-inserted leading 'species' token
    """
    tokens = [token for triple in triples for token in triple]
    return [
        ['species', *group]
        for is_content, group in groupby(tokens, lambda token: token != 'species')
        if is_content
    ]


# Sentences per species, collected from both triple files
species_datalist = collections.defaultdict(list)

root = "../../data/processed/"

# Both files are processed identically; Pierre first, then Andrei
for file_name in (root + 'Triples_Pierre.txt', root + 'Triples_Andrei.txt'):

    with open(file_name) as f:
        json_data = json.load(f)

    for species_name, triple_groups in json_data.items():
        for triples in triple_groups:
            # Append one by one so that a species without any sentence
            # does not get an (empty) entry in the defaultdict
            for sentence in _split_on_species(triples):
                species_datalist[species_name].append(sentence)


## Match Data

In [99]:
# Number of distinct top-level column groups in the Pierre frame.
# The value is not shown because this is not the last expression of the cell.
len(df_Pierre.columns.get_level_values(0).unique())

# List every (group, trait) column key
for column_key in df_Pierre.columns:
    print(column_key)

('Species', 'species')
('plant type', 'Tree')
('plant type', 'Shrub')
('plant type', 'Bush')
('plant type', 'Ficus strangler')
('plant type', 'Liana')
('plant type', 'Parasitic')
('plant type', 'Palm tree')
('plant type', 'Herbaceous')
('phyllotaxis', 'Phyllotaxis alternate')
('phyllotaxis', 'Opposite phyllotaxis')
('phyllotaxis', 'Phyllotaxis whorled')
('trunk and root', 'Base of trunk straight')
('trunk and root', 'Base of trunk flared')
('trunk and root', 'Foothills')
('trunk and root', 'Stilt roots')
('trunk and root', 'Aerial roots')
('latex', 'No exudate')
('latex', 'Clear latex')
('latex', 'Clear gum or resin')
('latex', 'White latex')
('latex', 'Yellow latex')
('latex', 'Dark gum or resin')
('crown', 'Regular density of the crown')
('crown', 'Leaves grouped at the top of the axes')
('crown', 'Horizontally spreading crowns')
('crown', 'drooping axles')
('crown', 'Stepped crown')
('crown', 'Candelabra-shaped crown')
('stem shape', 'Circular stem section')
('stem shape', 'Square s

In [43]:
# Display the trait names defined in the `traits` dictionary
traits.keys()

dict_keys(['Life Form', 'Trunk', 'Root', 'Latex', 'Phyllotaxis', 'Leaf Composition', 'Crown', 'Stem', 'Bark', 'Bark Colour', 'Leaf Shape', 'Petiole', 'Leaf Colour', 'Leaf Blade', 'Leaf Base', 'Leaf Margin', 'Leaf Apex', 'Leaf side', 'Leaf glands', 'Rachis', 'Vein', 'Tendril', 'Spine', 'Thornes', 'Blade Colour', 'Fruit', 'Fruit Shape', 'Inflorescences', 'Sexuality', 'Flower Colour', 'Flower Shape', 'Sepal Shape', 'Petal Shape', 'Aril Colour', 'Seed Colour'])

In [100]:
def match_trait_against_DF(trait, df, which_df):
    """Translate a trait name into the column label(s) of a dataset.

    Args:
        trait (string): The trait name to translate
        df (DataFrame): The DataFrame belonging to `which_df`
                        (not used by the lookup itself)
        which_df (string): Which of the three DataFrames to use.
                           Choices are ['Pierre', 'Andrei', 'Kissling']

    Raises:
        ValueError: If a DF other than ['Pierre', 'Andrei', 'Kissling']
                    is used.

    Returns:
        string or list: the column label for the trait. Traits without a
            special case map to the lower-cased trait name. 'Phyllotaxis'
            for 'Andrei' maps to a list of three labels. For 'Kissling'
            an empty string is returned.
    """

    # Reject unknown dataset names up front
    df_choices = ['Pierre', 'Andrei', 'Kissling']
    if which_df not in df_choices:
        raise ValueError(f'Choose one of {df_choices}')

    # Special cases per dataset; built per call so the list value is never shared
    pierre_columns = {
        'Life Form': 'plant type',
        'Trunk': 'trunk and root',
        'Root': 'trunk and root',
        'Stem': 'stem shape',
        'Bark Colour': 'bark color',
        'Blade Colour': 'blade color',
        'Flower Colour': 'flower color',
    }
    andrei_columns = {
        'Phyllotaxis': ['Leaf position', 'Leaf upper side', 'Leaf lower side'],
        'Spine': 'Thorns/spines',
        'Thornes': 'Thorns/spines',
        'Fruit': 'Fruit type',
    }

    if which_df == 'Pierre':
        return pierre_columns.get(trait, trait.lower())
    if which_df == 'Andrei':
        return andrei_columns.get(trait, trait.lower())

    # 'Kissling' has no mapping
    return ''

In [101]:
# For every species: trait name -> sentences that mention the trait
species_traits = {}

# The lookup sets do not change while looping, so build them once
trait_vocabulary = set(traits_list)
colour_vocabulary = set(colors)

for species, sentences in tqdm(species_datalist.items()):

    senteces_matches = collections.defaultdict(list)

    for sentence in sentences:

        tokens = set(sentence)
        matches = tokens & trait_vocabulary
        if not matches:
            # No trait keyword in this sentence
            continue

        mentions_colour = bool(colour_vocabulary & tokens)

        for match in matches:
            for corresponding_trait in corresponding_keys(match.capitalize(), traits):
                # A sentence that contains a colour word is only filed
                # under the '... Colour' traits
                if mentions_colour and 'Colour' not in corresponding_trait.split(' '):
                    continue
                senteces_matches[corresponding_trait].append(sentence)

    species_traits[species] = senteces_matches

100%|██████████| 401/401 [00:00<00:00, 436.40it/s]


In [61]:
# Sanity check: is this example species listed in the Pierre species column?
'Acacia amythethophylla' in df_Pierre['Species']['species'].values

True

In [94]:
# Select the Pierre row(s) of one example species and list every column key
df_subset = df_Pierre.loc[df_Pierre.index == 'Acacia amythethophylla']
for column_key in df_subset.columns:
    print(column_key)

('Species', 'species')
('plant type', 'Tree')
('plant type', 'Shrub')
('plant type', 'Bush')
('plant type', 'Ficus strangler')
('plant type', 'Liana')
('plant type', 'Parasitic')
('plant type', 'Palm tree')
('plant type', 'Herbaceous')
('phyllotaxis', 'Phyllotaxis alternate')
('phyllotaxis', 'Opposite phyllotaxis')
('phyllotaxis', 'Phyllotaxis whorled')
('trunk and root', 'Base of trunk straight')
('trunk and root', 'Base of trunk flared')
('trunk and root', 'Foothills')
('trunk and root', 'Stilt roots')
('trunk and root', 'Aerial roots')
('latex', 'No exudate')
('latex', 'Clear latex')
('latex', 'Clear gum or resin')
('latex', 'White latex')
('latex', 'Yellow latex')
('latex', 'Dark gum or resin')
('crown', 'Regular density of the crown')
('crown', 'Leaves grouped at the top of the axes')
('crown', 'Horizontally spreading crowns')
('crown', 'drooping axles')
('crown', 'Stepped crown')
('crown', 'Candelabra-shaped crown')
('stem shape', 'Circular stem section')
('stem shape', 'Square s

In [102]:
# Look up, for every extracted trait of a species, the matching columns in
# the dataset (Andrei or Pierre) that contains the species.
for idx, species in enumerate(species_traits.keys()):

    # Debug limiter: only the first species is inspected
    if idx >= 1:
        continue

    # Pick the dataset that knows this species (Andrei takes precedence)
    df_name = ''
    if species in df_Andrei_dummies.index:
        df_name = 'Andrei'
        df_select = df_Andrei_dummies
    elif species in df_Pierre.index:
        df_name = 'Pierre'
        df_select = df_Pierre
    else:
        # No dataset to compare against. Previously execution continued and
        # match_trait_against_DF raised ValueError for the empty dataset name.
        print(species, 'Missing?', df_name)
        continue

    # The rows of this species are the same for every trait
    species_rows = df_select[df_select.index == species]

    for trait in species_traits[species]:

        # Translate the trait name once and reuse the result
        df_trait = match_trait_against_DF(trait, df_select, df_name)
        print(df_name, df_trait)

        # Get present subtraits
        try:
            df_subset = species_rows[df_trait]
            # Keep only the sub-columns with at least one truthy value
            present_traits = df_subset.loc[:, df_subset.any()].columns
        except KeyError:
            # Only a missing column is expected here; any other error
            # should surface instead of being silently swallowed
            print(df_trait, 'Trait not present in DF')
            continue


Pierre bark
Pierre bark color
Pierre trunk and root
Pierre stem shape
Pierre trunk and root
Pierre seed colour
seed colour Trait not present in DF
Pierre leaf shape
Pierre leaf margin
Pierre fruit
Pierre fruit colour
fruit colour Trait not present in DF
Pierre flower color
Pierre sexuality
Pierre inflorescences
Pierre flower shape
