In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn import metrics
import json
import nltk
from nltk.stem.porter import PorterStemmer
# nltk.download('stopwords')
from nltk.corpus import stopwords
import collections
from itertools import groupby
import pickle
import re

### Load the Data

In [2]:
root = "../data/external/"

# Data Pierre
file_name = root + 'Dataset_Pierre.csv'
df_Pierre = pd.read_csv(file_name, header=[0, 1]) 
df_Pierre = df_Pierre.iloc[: , 1:]

# Data Andrei
file_name = root + 'Dataset_Andrei.csv'
df_Andrei = pd.read_csv(file_name)

# Data Palms
file_name = root + 'Dataset_Kissling.txt'
df_Daniel = pd.read_csv(file_name,
                 sep='\t', encoding='Latin-1')
palm_species = df_Daniel[~df_Daniel.isnull().any(axis=1)]['SpecName'].values
df_Daniel.set_index('SpecName', inplace=True)

# # Values Pierre
# print('Pierre')
# print(df_Pierre.columns.get_level_values(0).unique())

# # Values df_Andrei
# print('Andrei')
# print(df_Andrei.columns.unique())

# # Values df_Andrei
# print('Daniel')
# print(df_Daniel.columns.unique())

In [3]:
sw = list(stopwords.words('english'))
sw.append('like')
sw.append('color')
sw.append('colour')
sw.append('a')
sw.append('x')

### Helper Functions

In [4]:
sw = list(stopwords.words('english'))
sw.append('like')
sw.append('color')
sw.append('colour')
sw.append('a')
sw.append('x')

In [5]:
def jaccard_similarity(A, B):
    """Calculates the Jaccard similarity two sets.

    Args:
        A (Set): Set A
        B (Set): Set B

    Returns:
        Integer: 0.00 - 1.00
    """

    # Create sets just in case
    A = set(A)
    B = set(B)
    
    # Get intersection of two sets
    nominator = A.intersection(B)

    # Find union of two sets
    denominator = A.union(B)

    # Take the ratio of sizes
    similarity = len(nominator)/len(denominator)

    return similarity


def similarity(groundtruth, pred):
    """Calculates the normal similarity between two sets.

    Args:
        A (Set): Set A
        B (Set): Set B

    Returns:
        Integer: 0.00 - 1.00
    """

    # Create sets just in case
    groundtruth = set(groundtruth)
    pred = set(pred)

    # Find intersection of two sets
    nominator = groundtruth.intersection(pred)

    # Find union of two sets
    denominator = groundtruth

    # Take the ratio of sizes
    similarity = len(nominator)/(len(denominator))

    return similarity

## Part of Pierre

In [6]:
root = "../data/processed/"
file_name = root + 'Triples_Pierre.txt'

with open(file_name) as f:
   data = json.load(f)

# species = list(data.keys())
species = df_Pierre.xs('species', level=1, axis=1)['Species'].values

In [7]:
main_traits_Pierre = [
    ('plant type', 'type'), 
    ('phyllotaxis'),
    ('trunk, root'), 
    ('latex'),
    ('crown'),
    ('stem shape', 'stem'), 
    ('bark'), 
    ('bark color', 'bark colour'), 
    ('leaf shape', 'shape', 'leaf'), 
    ('petiole'),
    ('leaf blade', 'blade'), 
    ('leaf margin', 'margin'), 
    ('leaf base', 'base'), 
    ('leaf apex', 'apex'), 
    ('vein'),
    ('tendril'), 
    ('spine'), 
    ('blade color', 'blade'), 
    ('fruit'), 
    ('XXX'), 
    ('inflorescences'),
    ('sexuality', 'sex'), 
    ('flower color', 'flower', 'flower colour', 'color', 'colour'), 
    ('flower shape', 'flower', 'shape')]

In [9]:
# Main Traits 
traits_main_Pierre = df_Pierre.columns.get_level_values(0).unique()[1:]

# Init dict
traits_dict_Pierre = {}

# Extract sub traits per main trait
for main_trait in traits_main_Pierre:

    # Slice dataframe
    sub_traits = list(df_Pierre.xs(main_trait, axis=1).columns)
    sub_traits = [item.lower().split() for item in sub_traits]
    sub_traits = list(set([item for sublist in sub_traits for item in sublist if item not in sw]))
    
    # # Split main traits
    main_traits_split =  main_trait.split()
    main_traits_split.insert(0, main_trait)

    # Add to dict
    traits_dict_Pierre[tuple(set(main_traits_split))] = sub_traits

In [11]:
candidate_list_pierre = []
top_k = 3

for species_ID, spss in enumerate(tqdm(species[0:])):

    # Missing 1 species?
    try:

        for main_trait, main_trait_orig in zip(traits_dict_Pierre.keys(), traits_main_Pierre):
        # for main_trait in main_traits_Pierre:    
            
            # Init variables
            matches_main_traits = []
            matches_sub_traits = []

            # Get present subtraits
            indices = df_Pierre[main_trait_orig].iloc[species_ID].values
            traits = list(df_Pierre.xs(main_trait_orig, axis=1).columns)
            sub_traits = [t.lower() for i, t in zip(indices, traits) if i == 1]
            sub_traits_flat = [item for sublist in sub_traits for item in sublist.split(' ')]

            for idx, lst in enumerate(data[spss]):

                # Flatten list of lists
                flat_list = [item for sublist in lst for item in sublist]
                # Retokens some sentence with multiple 'species'
                flat_list_multiple_sents = [list(v) for k, v in groupby(flat_list, lambda x: x != 'species') if k]
                
                for sentence in flat_list_multiple_sents:
                
                    # Insert species
                    sentence.insert(0, 'species')

                    # Match the main traits
                    main_trait_match = set(main_trait) & set(sentence)
                    gt = set(main_trait_match)
                    pred = set(sentence)
                    j_sim = jaccard_similarity(gt, pred)

                    if j_sim:
                        #print(main_trait, main_trait_match, sentence)
                        matches_main_traits.append((j_sim, sentence))

                        # Match the main traits
                        sub_trait_match = set(sub_traits_flat) & set(sentence)
                        gt = set(sub_trait_match)
                        pred = set(sentence)
                        j_sim = jaccard_similarity(gt, pred)

                        if j_sim:
                            matches_sub_traits.append((j_sim, sentence))

                        else:
                            matches_sub_traits.append((0, []))

                    else:
                        matches_main_traits.append((0, []))
                        matches_sub_traits.append((0, []))

            matches_main_traits.sort(reverse=True)
            matches_sub_traits.sort(reverse=True)
            for k, mmt in enumerate(matches_main_traits[0:top_k]):
                # print(main_trait, mmt)
                candidate_list_pierre.append((spss, main_trait, 'Main', k + 1, mmt[1]))
            for k, mmt in enumerate(matches_sub_traits[0:top_k]):
                # print(sub_traits, mmt)
                candidate_list_pierre.append((spss, sub_traits, 'Sub', k + 1, mmt[1]))
    # Missing 1 species?
    except:
        continue



100%|██████████| 361/361 [00:19<00:00, 18.76it/s]


In [12]:
df_Pierre_traits = pd.DataFrame(candidate_list_pierre, columns=['Species', 'Traits', 'Trait Type', 'Top K', 'Sentence'])
df_Pierre_traits

Unnamed: 0,Species,Traits,Trait Type,Top K,Sentence
0,Acacia amythethophylla,"(type, plant type, plant)",Main,1,"[species, plants, plants, plant, plant, useful]"
1,Acacia amythethophylla,"(type, plant type, plant)",Main,2,"[species, plant, plant, use, use, plant use]"
2,Acacia amythethophylla,"(type, plant type, plant)",Main,3,"[species, plant, plant, sugar, sugar, plant su..."
3,Acacia amythethophylla,"[tree, shrub]",Sub,1,[]
4,Acacia amythethophylla,"[tree, shrub]",Sub,2,[]
...,...,...,...,...,...
51835,Ziziphus spina-christi,"(shape, flower shape, flower)",Main,2,"[species, leaf, leaf, shape, shape, leaf shape]"
51836,Ziziphus spina-christi,"(shape, flower shape, flower)",Main,3,"[species, flowers, flowers, flower, flower of ..."
51837,Ziziphus spina-christi,[five-petalled flower],Sub,1,"[species, plant, plant, plant, plant, flower]"
51838,Ziziphus spina-christi,[five-petalled flower],Sub,2,"[species, flowers, flowers, flower, flower of ..."


## Part Andrei

In [28]:
# Open triples
root = "../data/processed/"
file_name = root + 'Triples_Andrei.txt'

with open(file_name) as f:
   json_data = json.load(f)

# Get species
species = list(json_data.keys())

In [14]:
# Get Dummies to match DF Pierre
df_Andrei_dummies = pd.get_dummies(df_Andrei.iloc[:, 2:])
# Set species back
df_Andrei_dummies = df_Andrei_dummies.set_index(df_Andrei['Species'])

# Create tuple list for multi index
Andrei_multi_index = []
for top_index in df_Andrei.columns:
    for sub_index in df_Andrei_dummies.columns:
        if top_index in sub_index:

            sub_index = sub_index.split('_')[-1]
            Andrei_multi_index.append((top_index, sub_index))

# Set Mutli index
df_Andrei_dummies.columns = pd.MultiIndex.from_tuples(Andrei_multi_index)

In [16]:
# Main Traits 
traits_main_Andrei = df_Andrei.columns.get_level_values(0).unique()[2:]

# Init dict
traits_dict_Andrei = {}

# Extract sub traits per main trait
for main_trait in traits_main_Andrei:

    # Slice dataframe
    sub_traits = list(df_Andrei_dummies.xs(main_trait, axis=1).columns)
    sub_traits = [item.lower().split() for item in sub_traits]
    sub_traits = list(set([item for sublist in sub_traits for item in sublist if item not in sw]))
    
    # Split main traits
    for main_traits_split in main_trait.split():

        # Remove main from sub
        sub_traits = list(set(sub_traits) - set([main_traits_split]))
        # Append traits to dict
        traits_dict_Andrei[main_traits_split] = sub_traits
        
    # Original main trais (Just in case)
    traits_dict_Andrei[main_trait] = sub_traits

In [25]:
df_Andrei_dummies['Life form'].iloc[0].values

array([0, 1], dtype=uint8)

In [29]:
candidate_list_Andrei = []
top_k = 3

for species_ID, spss in tqdm(enumerate(species[0:])):
 
    for main_trait, main_trait_orig in zip(traits_dict_Andrei.keys(), traits_main_Andrei):
    # for main_trait in main_traits_Pierre:    
        
        # Init variables
        matches_main_traits = []
        matches_sub_traits = []

        # Get present subtraits
        indices = df_Andrei_dummies[main_trait_orig].iloc[species_ID].values
        traits = list(df_Andrei_dummies.xs(main_trait_orig, axis=1).columns)
        sub_traits = [t.lower() for i, t in zip(indices, traits) if i == 1]
        sub_traits_flat = [item for sublist in sub_traits for item in sublist.split(' ')]

        for idx, lst in enumerate(json_data[spss]):

            # Flatten list of lists
            flat_list = [item for sublist in lst for item in sublist]
            # Retokens some sentence with multiple 'species'
            flat_list_multiple_sents = [list(v) for k, v in groupby(flat_list, lambda x: x != 'species') if k]
            
            for sentence in flat_list_multiple_sents:
            
                # Insert species
                sentence.insert(0, 'species')

                # Match the main traits
                main_trait_match = set(main_trait) & set(sentence)
                gt = set(main_trait_match)
                pred = set(sentence)
                j_sim = jaccard_similarity(gt, pred)

                if j_sim:
                    #print(main_trait, main_trait_match, sentence)
                    matches_main_traits.append((j_sim, sentence))

                    # Match the main traits
                    sub_trait_match = set(sub_traits_flat) & set(sentence)
                    gt = set(sub_trait_match)
                    pred = set(sentence)
                    j_sim = jaccard_similarity(gt, pred)

                    if j_sim:
                        matches_sub_traits.append((j_sim, sentence))

                    else:
                        matches_sub_traits.append((0, []))

                else:
                    matches_main_traits.append((0, []))
                    matches_sub_traits.append((0, []))

        matches_main_traits.sort(reverse=True)
        matches_sub_traits.sort(reverse=True)
        for k, mmt in enumerate(matches_main_traits[0:top_k]):
            # print(main_trait, mmt)
            candidate_list_Andrei.append((spss, main_trait, 'Main', k + 1, mmt[1]))
        for k, mmt in enumerate(matches_sub_traits[0:top_k]):
            # print(sub_traits, mmt)
            candidate_list_Andrei.append((spss, sub_traits, 'Sub', k + 1, mmt[1]))




42it [00:01, 32.52it/s]


In [30]:
df_Andrei_traits = pd.DataFrame(candidate_list_Andrei, columns=['Species', 'Traits', 'Trait Type', 'Top K', 'Sentence'])
df_Andrei_traits

Unnamed: 0,Species,Traits,Trait Type,Top K,Sentence
0,Avicennia germinans,Life,Main,1,"[species, capsule, capsule, capsule, capsule, ..."
1,Avicennia germinans,Life,Main,2,"[species, fruit, fruit, fruit, fruit, capsule,..."
2,Avicennia germinans,Life,Main,3,[]
3,Avicennia germinans,[tree],Sub,1,[]
4,Avicennia germinans,[tree],Sub,2,[]
...,...,...,...,...,...
5995,Guaiacum sanctum,Inflorescence,Main,2,[]
5996,Guaiacum sanctum,Inflorescence,Main,3,[]
5997,Guaiacum sanctum,[brown],Sub,1,[]
5998,Guaiacum sanctum,[brown],Sub,2,[]


## Part Kissling

### Helper Functions

In [None]:
def knowledge_graph_subset(species, parts, kn_cleaned):

    # Init variables
    baseparts, traits, source, relation, target, correct_parts = ([] for i in range(6))

    # Extract the data
    for (sub, rel, obj) in kn_cleaned:
        #print((sub, rel, obj))
        if sub == 'species':
            sub = species
        source.append(sub), relation.append(rel), target.append(obj), 
        if rel == 'has_main_part':
            baseparts.append(obj)
        if rel == 'has_sub_part':
            traits.append(obj)

    # Fit data into DF
    kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relation})
    if parts:
        if type(parts) != list:
            parts = [parts]
        relations = [f'{part.lower()} temp' for part in parts]
        #relations += ['has_main_part', 'has_sub_part']
    
    values = list(kg_df[kg_df['edge'].isin(relations)]['source'].values)
    values += list(kg_df[kg_df['edge'].isin(relations)]['target'].values) 
    
    return list(set(values))

def possible_parts(species, data):
    return [obj for (sub, rel, obj) in data[species] if rel=='has_main_part']

### Load Data

In [None]:
# Open triples
root = "../data/processed/"
file_name = root + 'Triples_Kissling.pkl'
data = pickle.load(open(file_name, 'rb'))

### Additional Data

In [None]:
part_dict = {
    'stem':
    ['stem', 'trunk', 'plant',],
    'leaf':
    ['leaf', 'leaflet', 'leaves'],
    'fruit':
    ['fruit'],
    'petiole':
    ['petiole'],
    'branch':
    ['branch']
            }

fruitshapes = list(df_Daniel['FruitShape'].unique())
fruitshapes += ['fusiform']

colors = []
for row in df_Daniel['MainFruitColors']:
    try:
        for color in row.split(';'):
            colors.append(color.strip())
    except:
        continue
    

colors = list(set(colors))

multi_index_part = [
     'name', 'name', 'name', 'name',
     'stem', 'stem', 'stem', 'stem', 'stem',
     'leaf', 
     'stem', 'stem', 'stem',
     'leaf', 'leaf', 'branch',
     'petiole', 
     'fruit', 'fruit', 'fruit', 'fruit', 'fruit', 'fruit', 'fruit', 'fruit', 'fruit', 'fruit', 'fruit', 
]

multi_index_binary = [
     'no', 'no', 'no', 'no',
     'yes', 'yes', 'yes', 'yes', 'yes',
     'yes', 
     'no', 'no', 'no',
     'no', 'no', 'no',
     'no', 
     'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 
]

multi_index_int  = [
     'no', 'no', 'no', 'no',
     'yes', 'yes', 'yes', 'yes', 'yes',
     'yes', 
     'yes', 'yes', 'no',
     'yes', 'yes', 'yes',
     'yes', 
     'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'no', 'no', 'no', 
]

df_Daniel.columns = pd.MultiIndex.from_arrays([df_Daniel.columns, multi_index_part, multi_index_binary, multi_index_int])

### Mimick the original DF

In [None]:
palm_information_dict = collections.defaultdict(list)


for palm in tqdm(list(data.keys())[0:]):
    for part in list(part_dict.keys())[0:]:

        # Get palm subset
        # Part
        df_part = df_Daniel.xs(part, axis=1, level=1)
        if part == 'stem':
            # Binary?
            df_part_binary = df_part.xs('yes', axis=1, level=1)
            series_binaries = df_part_binary.loc[palm]
        # Binary?
        df_part_nonbinary = df_part.xs('no', axis=1, level=1)
        if part == 'fruit':
            df_part_nonbinary_strings = df_part_nonbinary.xs('no', axis=1, level=1)
            series_strings = df_part_nonbinary_strings.loc[palm]
            
            #print(series_strings)
        # Integer?
        df_part_nonbinary_integers = df_part_nonbinary.xs('yes', axis=1, level=1)
        # Palmseries
        series_integers = df_part_nonbinary_integers.loc[palm]
        
        # Get kn subset
        graph_array = knowledge_graph_subset(palm, part_dict[part], data[palm])
        if part == 'stem':
            for (name, _), elem in series_binaries.iteritems():
                name = re.sub(r'Stem', '', name)
                name = name.lower()
                if name in graph_array:
                    palm_information_dict[palm].append(1)
                    #print(f'{name} == {elem} == {1.0}')
                else:
                    palm_information_dict[palm].append(0)
                    #print(f'{name} == {elem} == {0.0}')
                    
        if part == 'fruit':
            #print(series_strings)
            for name, elem in series_strings.iteritems():
                if name == 'FruitSizeCategorical':
                    #print(f'{name} SKIPPED')
                    continue
                elif name == 'FruitShape':
                    shape = list(set(fruitshapes) & set(graph_array))
                    if shape:
                        palm_information_dict[palm].append(shape[0])
                        #print(f'{name} == {elem} ==  {shape[0]}')
                    else:
                        palm_information_dict[palm].append(np.NaN)
                        #print(f'{name} == {elem} == NaN')
                elif name == 'FruitColorDescription':
                    #print(f'{name} SKIPPED')
                    continue
                elif name == 'MainFruitColors':
                    #colors = [c.strip() for c in elem.split(';')]
                    found_colors = list(set(colors) & set(graph_array))
                    palm_information_dict[palm].append(found_colors)
                    #print(f'{name} == {colors} ==  {found_colors}')
                elif name == 'Conspicuousness':
                    #print(f'{name} SKIPPED')
                    continue
            
        graph_ints = []
        # Get the ints
        for elem in graph_array:
            try:
                graph_ints.append(float(elem))
            except:
                continue
        #print(graph_ints)
        for name, elem in series_integers.iteritems():
            if type(elem) != str:
                try:
                    closest = min(graph_ints, key=lambda x:abs(x - elem))
                except:
                    closest = np.NaN
                palm_information_dict[palm].append(closest)
                #print(f'{name} == {elem} == {closest}')

# Rename the columns
df_Daniel_own = pd.DataFrame.from_dict(palm_information_dict, orient='index')
df_Daniel_own.columns = [
    'Climbing', 'Acaulescent', 'Erect', 'StemSolitary', 'StemArmed',
    'MaxStemHeight_m', 'MaxStemDia_cm', 'MaxLeafNumber', 'Max_Blade_Length_m',
    'FruitShape', 'MainFruitColors', 
    'AverageFruitLength_cm', 'MinFruitLength_cm', 'MaxFruitLength_cm',
    'AverageFruitWidth_cm', 'MinFruitWidth_cm', 'MaxFruitWidth_cm',
    'Max_Petiole_length_m',
    'Max_Rachis_Length_m',
]

In [None]:
df_Daniel_own

## Exports

In [None]:
folder = "../data/processed/"
df_Andrei_own.to_csv(f'{folder}top_sents_Andrei.csv')
df_Pierre_traits.to_csv(f'{folder}top_sents_Pierre.csv')
# df_Daniel_own.to_csv(f'{folder}top_sents_Andrei.csv')