In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn import metrics
import json
import nltk
from nltk.stem.porter import PorterStemmer
# nltk.download('stopwords')
from nltk.corpus import stopwords
import collections
from itertools import groupby
import pickle
import re

### Load the Data

In [2]:
root = "../data/external/"

# Pierre: the first two rows form a two-level column header; the first
# column is dropped right after reading.
file_name = root + 'Dataset_Pierre.csv'
df_Pierre = pd.read_csv(file_name, header=[0, 1]).iloc[:, 1:]

# Andrei: plain CSV with a single header row.
file_name = root + 'Dataset_Andrei.csv'
df_Andrei = pd.read_csv(file_name)

# Palms (Kissling): tab-separated text file read as Latin-1.
file_name = root + 'Dataset_Kissling.txt'
df_Daniel = pd.read_csv(file_name, sep='\t', encoding='Latin-1')

# Names of the species whose row has no missing value in any column.
# Taken before 'SpecName' is moved into the index.
palm_species = df_Daniel.dropna()['SpecName'].values

# Index the palm table by species name.
df_Daniel = df_Daniel.set_index('SpecName')

In [3]:
# NLTK's English stop words plus a few extra tokens that should be ignored
# when matching trait words.
extra_stopwords = ['like', 'color', 'colour', 'a', 'x']
sw = list(stopwords.words('english')) + extra_stopwords

### Helper Functions

In [4]:
# NOTE(review): this cell rebuilds exactly the same `sw` list as the cell
# before it; one of the two could be removed.
sw = list(stopwords.words('english'))
sw.extend(('like', 'color', 'colour', 'a', 'x'))

In [5]:
def jaccard_similarity(A, B):
    """Calculate the Jaccard similarity of two collections.

    Both arguments are converted to sets first, so duplicates are ignored
    and any iterable of hashable items is accepted.

    Args:
        A (Iterable): First collection.
        B (Iterable): Second collection.

    Returns:
        float: Size of the intersection divided by the size of the union,
        in the range 0.0 - 1.0. Returns 0.0 when both collections are
        empty (the ratio is undefined there; 0.0 means "no overlap").
    """

    # Create sets just in case
    A = set(A)
    B = set(B)

    union = A | B

    # Two empty inputs have an empty union; avoid dividing by zero
    if not union:
        return 0.0

    return len(A & B) / len(union)


def similarity(groundtruth, pred):
    """Calculate which fraction of the ground truth is covered by the prediction.

    Both arguments are converted to sets first, so duplicates are ignored.
    Unlike the Jaccard similarity, the denominator is the ground-truth set
    alone, so extra items in ``pred`` do not lower the score.

    Args:
        groundtruth (Iterable): Reference items.
        pred (Iterable): Predicted items.

    Returns:
        float: Number of ground-truth items also present in ``pred``
        divided by the number of ground-truth items, in the range
        0.0 - 1.0. Returns 0.0 when the ground truth is empty.
    """

    # Create sets just in case
    groundtruth = set(groundtruth)
    pred = set(pred)

    # An empty ground truth would make the ratio undefined
    if not groundtruth:
        return 0.0

    # Items of the ground truth that were also predicted
    overlap = groundtruth & pred

    return len(overlap) / len(groundtruth)

## Part Pierre

In [6]:
root = "../data/processed/"
file_name = root + 'Triples_Pierre.txt'

# The file is parsed as JSON; its top-level keys are used as the species list.
with open(file_name) as triples_file:
    data = json.load(triples_file)

species = list(data)

In [7]:
# Main traits: the unique level-0 column labels, skipping the first one
traits_main_Pierre = df_Pierre.columns.get_level_values(0).unique()[1:]

# Maps a main trait (and each of its single words) to its sub-trait words
traits_sub_Pierre = {}

for main_trait in traits_main_Pierre:

    # Collect the lower-cased words of every sub-column label under this
    # main trait, leaving out stop words
    vocabulary = set()
    for label in df_Pierre.xs(main_trait, axis=1).columns:
        vocabulary.update(word for word in label.lower().split() if word not in sw)
    sub_traits = list(vocabulary)

    # Register every word of the main trait as a key of its own. Each word is
    # removed from the running sub-trait list before that list is stored.
    # NOTE(review): a word shared by several main traits keeps only the list
    # of the last main trait processed, because the entry is reassigned.
    for main_traits_split in main_trait.split():
        sub_traits = [word for word in sub_traits if word != main_traits_split]
        traits_sub_Pierre[main_traits_split] = sub_traits

    # Also register the full, unsplit main trait (just in case)
    traits_sub_Pierre[main_trait] = sub_traits

In [8]:
candidate_dict_pierre = collections.defaultdict(list)

# The trait keys do not change during the scan, so build them once
main_traits = list(traits_sub_Pierre.keys())
main_trait_keys = set(main_traits)

for spss in tqdm(species[0:]):

    for lst in data[spss]:

        # Flatten list of lists
        flat_list = [token for part in lst for token in part]

        # Re-split entries that contain several 'species' markers: every run
        # of non-'species' tokens becomes one sentence
        for is_content, chunk in groupby(flat_list, lambda token: token != 'species'):
            if not is_content:
                continue

            sentence = ['species'] + list(chunk)
            sentence_tokens = set(sentence)

            # Match the main traits
            main_trait_match = main_trait_keys & sentence_tokens

            if not main_trait_match:
                # Nothing matched: pad both rows with an empty candidate
                candidate_dict_pierre[(spss, 'Main Trait')].append((0, []))
                candidate_dict_pierre[(spss, 'Sub Trait')].append((0, []))
                continue

            j_sim = jaccard_similarity(main_trait_match, sentence_tokens)
            candidate_dict_pierre[(spss, 'Main Trait')].append((j_sim, sentence))

            # Match the sub traits of every matched main trait
            for trait_match in main_trait_match:
                sub_trait_match = set(traits_sub_Pierre[trait_match]) & sentence_tokens

                if sub_trait_match:
                    # Score all matched main traits plus this trait's sub-trait hits
                    j_sim = jaccard_similarity(main_trait_match | sub_trait_match, sentence_tokens)
                    candidate_dict_pierre[(spss, 'Sub Trait')].append((j_sim, sentence))
                else:
                    candidate_dict_pierre[(spss, 'Sub Trait')].append((0, []))

100%|██████████| 360/360 [00:00<00:00, 493.29it/s]


In [9]:
candidate_dict_pierre_sorted = collections.defaultdict(list)

# Rank the candidates of every (species, trait level) key, best score first.
# `list.sort()` sorts in place and returns None, so its return value must not
# be stored; the sorted list itself is copied into the "_sorted" dict.
# The loop variable is deliberately not called `species`, so the species list
# built when the triples were loaded is not overwritten.
for candidate_key, candidates in candidate_dict_pierre.items():
    candidates.sort(reverse=True)
    candidate_dict_pierre_sorted[candidate_key] = list(candidates)

# One row per (species, trait level); the columns are the ranked candidates
df_Pierre_own = pd.DataFrame.from_dict(candidate_dict_pierre_sorted, orient='index')
df_Pierre_own.index = pd.MultiIndex.from_tuples(df_Pierre_own.index)

In [10]:
# Show the candidate table (one row per species / trait level) as the cell output
df_Pierre_own

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,9,...,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014
Acacia amythethophylla,Main Trait,"(0.5, [species, stem, stem, stem, stem, vs, vs...","(0.5, [species, stem, stem, stem, stem, vs, vs...","(0.5, [species, stem, stem, bark, bark, stem-b...","(0.5, [species, stem, stem, bark, bark, stem-b...","(0.4, [species, trunk, trunk, bark, bark, trun...","(0.3333333333333333, [species, stem, stem, ste...","(0.3333333333333333, [species, root, root, roo...","(0.2857142857142857, [species, trunk, trunk, b...","(0.2857142857142857, [species, root, root, bar...","(0.25, [species, stem, stem, stem, stem, peak,...",...,,,,,,,,,,
Acacia amythethophylla,Sub Trait,"(0.16666666666666666, [species, roots, roots, ...","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])",...,,,,,,,,,,
Acacia ataxacantha,Main Trait,"(0.6, [species, plant, plant, type, type, plan...","(0.6, [species, plant, plant, type, type, plan...","(0.5, [species, plant, plant, type, type, plan...","(0.3333333333333333, [species, plant, plant, p...","(0.3333333333333333, [species, plant, plant, p...","(0.3333333333333333, [species, plant, plant, p...","(0.3333333333333333, [species, plant, plant, p...","(0.3333333333333333, [species, plant, plant, p...","(0.3333333333333333, [species, petiole, petiol...","(0.2857142857142857, [species, pinnae, pinnae,...",...,,,,,,,,,,
Acacia ataxacantha,Sub Trait,"(0.8333333333333334, [species, plant, plant, t...","(0.8333333333333334, [species, plant, plant, t...","(0.8333333333333334, [species, plant, plant, t...","(0.8, [species, plant, plant, type, type, plan...","(0.8, [species, plant, plant, type, type, plan...","(0.8, [species, plant, plant, type, type, plan...","(0.2857142857142857, [species, roots, roots, r...","(0.15, [species, plant, plant, link, link, pla...","(0, [])","(0, [])",...,,,,,,,,,,
Acacia dudgeoni,Main Trait,"(0.5, [species, stem, stem, bark, bark, stem b...","(0.3333333333333333, [species, stem, stem, ste...","(0.3333333333333333, [species, stem, stem, bar...","(0.3333333333333333, [species, plant, plant, p...","(0.3333333333333333, [species, leaf, leaf, lea...","(0.25, [species, stems, stems, stem, stem, gro...","(0.25, [species, plant, plant, plant, plant, w...","(0.25, [species, plant, plant, plant, plant, t...","(0.25, [species, plant, plant, plant, plant, p...","(0.25, [species, plant, plant, development, de...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ziziphus mauritiana,Sub Trait,"(0.5555555555555556, [species, leaf, leaf, sha...","(0.5, [species, roots, roots, root, root, usef...","(0.4, [species, roots, roots, root, root of z....","(0.2857142857142857, [species, roots, roots, r...","(0.2857142857142857, [species, plants, plants,...","(0.25, [species, plant, plant, plant, plant, p...","(0, [])","(0, [])","(0, [])","(0, [])",...,,,,,,,,,,
Ziziphus mucronata,Main Trait,"(0.75, [species, plant, plant, type, type, pla...","(0.6, [species, plant, plant, type, type, plan...","(0.5, [species, stem, stem, bark, bark, stem b...","(0.5, [species, plants, plants, plant, plant, ...","(0.5, [species, plant])","(0.5, [species, plant])","(0.4, [species, stem, stem, bark, bark, stem b...","(0.3333333333333333, [species, stem, stem, ste...","(0.3333333333333333, [species, plant, plant, p...","(0.3333333333333333, [species, plant, plant, p...",...,,,,,,,,,,
Ziziphus mucronata,Sub Trait,"(0.8, [species, plant, plant, type, type, plan...","(0.8, [species, plant, plant, type, type, plan...","(0.8, [species, plant, plant, type, type, plan...","(0.5, [species, roots, roots, root, root, bake])","(0.2857142857142857, [species, roots, roots, r...","(0.2857142857142857, [species, roots, roots, r...","(0.15, [species, plant, plant, link, link, pla...","(0, [])","(0, [])","(0, [])",...,,,,,,,,,,
Ziziphus spina-christi,Main Trait,"(0.75, [species, leaf, leaf, shape, shape, lea...","(0.75, [species, leaf, leaf, blade, blade, lea...","(0.6666666666666666, [species, plant, plant, p...","(0.6, [species, plant, plant, type, type, plan...","(0.5, [species, stem, stem, bark, bark, stem b...","(0.5, [species, stem, stem, bark, bark, stem b...","(0.5, [species, stem, stem, bark, bark, stem b...","(0.5, [species, stem, stem, bark, bark, stem b...","(0.5, [species, stem, stem, bark, bark, stem b...","(0.5, [species, stem, stem, bark, bark, stem b...",...,,,,,,,,,,


## Part Andrei

In [11]:
# Location of the processed triples file for the Andrei dataset
root = "../data/processed/"
file_name = root + 'Triples_Andrei.txt'

# The file is parsed as JSON
with open(file_name) as triples_file:
    json_data = json.load(triples_file)

# The top-level keys are the species
species = list(json_data)

In [12]:
# Get dummies to match DF Pierre: encode every column after the first two
trait_columns = df_Andrei.iloc[:, 2:]
df_Andrei_dummies = pd.get_dummies(trait_columns)
# Set species back
df_Andrei_dummies = df_Andrei_dummies.set_index(df_Andrei['Species'])

# Create the (trait, value) tuple list for the column MultiIndex.
# The tuples are built by walking the dummy columns in their own order, so
# there is exactly one tuple per column and the two always stay aligned.
# A dummy column belongs to the trait whose name it equals (column passed
# through unchanged) or whose name plus '_' it starts with; a plain substring
# test could match several traits, and splitting on the last '_' would
# truncate values that themselves contain an underscore.
source_traits = list(trait_columns.columns)
Andrei_multi_index = []
for dummy_column in df_Andrei_dummies.columns:
    owners = [
        trait for trait in source_traits
        if dummy_column == trait or dummy_column.startswith(trait + '_')
    ]
    if not owners:
        raise ValueError(
            'Cannot map dummy column {!r} back to a trait column'.format(dummy_column)
        )

    # Prefer the longest trait name when one trait name prefixes another
    top_index = max(owners, key=len)
    if dummy_column == top_index:
        sub_index = top_index
    else:
        sub_index = dummy_column[len(top_index) + 1:]

    Andrei_multi_index.append((top_index, sub_index))

# Set multi index
df_Andrei_dummies.columns = pd.MultiIndex.from_tuples(Andrei_multi_index)

In [13]:
# Main traits: the column labels of df_Andrei after the first two columns
traits_main_Andrei = df_Andrei.columns.get_level_values(0).unique()[2:]

# Maps a main trait (and each of its single words) to its sub-trait words
traits_sub_Andrei = {}

for main_trait in traits_main_Andrei:

    # Lower-cased words of all value labels under this trait, minus stop words
    value_labels = df_Andrei_dummies.xs(main_trait, axis=1).columns
    words = {word for label in value_labels for word in label.lower().split()}
    sub_traits = list(words.difference(sw))

    # Every word of the main trait becomes a key of its own; the word is
    # removed from the running sub-trait list before that list is stored.
    # NOTE(review): a word shared by several main traits keeps only the list
    # of the last main trait processed, because the entry is reassigned.
    for main_traits_split in main_trait.split():
        sub_traits = list(set(sub_traits).difference([main_traits_split]))
        traits_sub_Andrei[main_traits_split] = sub_traits

    # Also register the full, unsplit main trait (just in case)
    traits_sub_Andrei[main_trait] = sub_traits

In [14]:
candidate_dict_andrei = collections.defaultdict(list)

# The trait keys stay fixed while scanning, so compute them a single time
main_traits = list(traits_sub_Andrei.keys())
known_main_traits = set(main_traits)

for spss in tqdm(species[0:]):
    main_key = (spss, 'Main Trait')
    sub_key = (spss, 'Sub Trait')

    for lst in json_data[spss]:

        # Flatten list of lists
        flat_list = [word for group in lst for word in group]

        # Cut the token stream at every 'species' marker; each remaining run
        # of tokens is handled as a sentence of its own
        runs = groupby(flat_list, lambda word: word != 'species')
        for keep, run in runs:
            if not keep:
                continue

            sentence = ['species']
            sentence.extend(run)
            pred = set(sentence)

            # Match the main traits
            main_trait_match = known_main_traits & pred

            if main_trait_match:
                j_sim = jaccard_similarity(main_trait_match, pred)
                candidate_dict_andrei[main_key].append((j_sim, sentence))

                # Match the sub traits of each matched main trait
                for trait_match in main_trait_match:
                    sub_trait_match = set(traits_sub_Andrei[trait_match]) & pred

                    if not sub_trait_match:
                        candidate_dict_andrei[sub_key].append((0, []))
                        continue

                    # Score all matched main traits plus this trait's sub-trait hits
                    gt = main_trait_match | sub_trait_match
                    j_sim = jaccard_similarity(gt, pred)
                    candidate_dict_andrei[sub_key].append((j_sim, sentence))
            else:
                # Nothing matched: pad both rows with an empty candidate
                candidate_dict_andrei[main_key].append((0, []))
                candidate_dict_andrei[sub_key].append((0, []))

100%|██████████| 42/42 [00:00<00:00, 1170.76it/s]


In [15]:
candidate_dict_andrei_sorted = collections.defaultdict(list)

# Rank the candidates of every (species, trait level) key, best score first.
# `list.sort()` sorts in place and returns None, so its return value must not
# be stored; the sorted list itself is copied into the "_sorted" dict.
# The loop variable is deliberately not called `species`, so the species list
# built when the triples were loaded is not overwritten.
for candidate_key, candidates in candidate_dict_andrei.items():
    candidates.sort(reverse=True)
    candidate_dict_andrei_sorted[candidate_key] = list(candidates)

# One row per (species, trait level); the columns are the ranked candidates
df_Andrei_own = pd.DataFrame.from_dict(candidate_dict_andrei_sorted, orient='index')
df_Andrei_own.index = pd.MultiIndex.from_tuples(df_Andrei_own.index)

In [16]:
# NOTE(review): the previous cell already assigns this MultiIndex to
# df_Andrei_own, so this statement repeats the same conversion.
df_Andrei_own.index = pd.MultiIndex.from_tuples(df_Andrei_own.index)

In [17]:
# Show the candidate table (one row per species / trait level) as the cell output
df_Andrei_own

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,9,...,737,738,739,740,741,742,743,744,745,746
Avicennia germinans,Main Trait,"(0.25, [species, glands, glands, gland, gland,...","(0.2, [species, plant, plant, form, form, plan...","(0.16666666666666666, [species, plant, plant, ...","(0.14285714285714285, [species, seeds, seeds, ...","(0.1111111111111111, [species, gametophyte, ga...","(0.1, [species, calyx, calyx, calyx, calyx, sh...","(0.05263157894736842, [species, corolla, corol...","(0, [])","(0, [])","(0, [])",...,,,,,,,,,,
Avicennia germinans,Sub Trait,"(0.5, [species, glands, glands, gland, gland, ...","(0.3, [species, calyx, calyx, calyx, calyx, sh...","(0.2857142857142857, [species, seeds, seeds, s...","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])",...,,,,,,,,,,
Metopium brownei,Main Trait,"(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])",...,,,,,,,,,,
Metopium brownei,Sub Trait,"(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])",...,,,,,,,,,,
Handroanthus billbergii,Main Trait,"(0.3333333333333333, [species, corolla, coroll...","(0.25, [species, calyx, calyx, calyx, calyx, d...","(0.16666666666666666, [species, plant, plant, ...","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Jacquinia arborea,Sub Trait,"(0.375, [species, corolla, corolla, corolla, c...","(0.23076923076923078, [species, calyx, calyx, ...","(0.21428571428571427, [species, calyx, calyx, ...","(0.18181818181818182, [species, corolla, corol...","(0.16666666666666666, [species, flowers, flowe...","(0.15384615384615385, [species, flowers, flowe...","(0.14634146341463414, [species, calyx, calyx, ...","(0.13636363636363635, [species, corolla, corol...","(0.1111111111111111, [species, corolla, coroll...","(0.0975609756097561, [species, calyx, calyx, c...",...,,,,,,,,,,
Guaiacum officinale,Main Trait,"(0.3333333333333333, [species, calyx, calyx, c...","(0.25, [species, plant, plant, type, type, pla...","(0.25, [species, flower, flower, flower, flowe...","(0.2, [species, plant, plant, form, form, plan...","(0.16666666666666666, [species, plant, plant, ...","(0.16666666666666666, [species, plant, plant, ...","(0.16666666666666666, [species, fruits, fruits...","(0.16666666666666666, [species, calyx, calyx, ...","(0.1111111111111111, [species, leaf, leaf, typ...","(0.1111111111111111, [species, gametophyte, ga...",...,,,,,,,,,,
Guaiacum officinale,Sub Trait,"(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])","(0, [])",...,,,,,,,,,,
Guaiacum sanctum,Main Trait,"(0.2, [species, plant, plant, form, form, plan...","(0.2, [species, leaf, leaf, type, type, leaf t...","(0.16666666666666666, [species, plant, plant, ...","(0.16666666666666666, [species, leaf, leaf, sh...","(0.16666666666666666, [species, crown, crown, ...","(0.16666666666666666, [species, calyx, calyx, ...","(0.14285714285714285, [species, leaf, leaf, ty...","(0.125, [species, corolla, corolla, corolla, c...","(0.1111111111111111, [species, gametophyte, ga...","(0, [])",...,,,,,,,,,,


## Part Kissling