In [13]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn import metrics
import json
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet as wn
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
import collections
from itertools import groupby
import pickle
import re
import matplotlib.colors as mcolors
import warnings
import requests
import csv
from sklearn.preprocessing import MultiLabelBinarizer
from functools import reduce
from dotenv import load_dotenv

In [14]:
load_dotenv()

True

### Global Variables & Functions

#### Variables

In [15]:
nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}

with open('../../data/supportive/colour_list.json', 'r') as f:
  colors = json.load(f)

with open('../../data/supportive/trait_dictionary.json', 'r') as f:
  traits_dict = json.load(f)
traits_list = list(traits_dict.keys())
traits_list += [trait.lower() for lst in list(traits_dict.values()) for trait in lst]

#### Functions

In [16]:
def corresponding_keys(val, dictionary):
    """returns the corresponding key of a single value 
    assuming the values are lists.

    Args:
        val (string): string present in the dict
        dictionary (dict): dict with lists of stings as values

    Returns:
        list: list of matching keys
    """
    # Init list
    keys = []
    # Search the dict
    for k, v in dictionary.items():
        if val in v:
            keys.append(k)
    return keys
    
def jaccard_similarity(A, B):
    """Calculates the Jaccard similarity two sets.

    Args:
        A (Set): Set A
        B (Set): Set B

    Returns:
        Integer: 0.00 - 1.00
    """

    # Create sets just in case
    A = set(A.lower().split())
    B = set(B.lower().split())
    
    # Get intersection of two sets
    nominator = A.intersection(B)

    # Find union of two sets
    denominator = A.union(B)

    # Take the ratio of sizes
    similarity = len(nominator)/len(denominator)

    return similarity


def similarity(groundtruth, pred):
    """Calculates the normal similarity between two sets.

    Args:
        A (Set): Set A
        B (Set): Set B

    Returns:
        Integer: 0.00 - 1.00
    """

    # Create sets just in case
    groundtruth = set(groundtruth)
    pred = set(pred)

    # Find intersection of two sets
    nominator = groundtruth.intersection(pred)

    # Find union of two sets
    denominator = groundtruth

    # Take the ratio of sizes
    similarity = len(nominator)/(len(denominator))

    return similarity

def resentesize(lst):

    sentence = ''

    # Loop every 2 items 
    for item1, item2 in zip(lst[::2], lst[1::2]):
        
        # Set verb
        verb = 'has'
        chunk = item2.split(' ')[-1]
        if chunk not in nouns or chunk in colors:
            verb = 'is'

        sentence += f'{item1.capitalize()} {verb} {item2}. '
    
    return sentence


def get_wiki_main_image(title):
    url = 'https://en.wikipedia.org/w/api.php'
    data = {
        'action' :'query',
        'format' : 'json',
        'formatversion' : 2,
        'prop' : 'pageimages|pageterms',
        'piprop' : 'original',
        'titles' : title
    }
    response = requests.get(url, data)
    json_data = json.loads(response.text)
    return json_data['query']['pages'][0]['original']['source'] if len(json_data['query']['pages']) >0 else 'Not found'


### Load GT Data

In [17]:
root = "../../data/external/"

#### Data Andrei

In [18]:
file_name = root + 'Dataset_Andrei.csv'
df_Andrei = pd.read_csv(file_name)

# Get Dummies to match DF Pierre
df_Andrei_dummies = pd.get_dummies(df_Andrei.iloc[:, 2:])
# Set species back
df_Andrei_dummies = df_Andrei_dummies.set_index(df_Andrei['Species'])

# Create tuple list for multi index
Andrei_multi_index = []
for top_index in df_Andrei.columns:
    for sub_index in df_Andrei_dummies.columns:
        if top_index in sub_index:

            sub_index = sub_index.split('_')[-1]
            Andrei_multi_index.append((top_index, sub_index))

# Set Mutli index
df_Andrei_dummies.columns = pd.MultiIndex.from_tuples(Andrei_multi_index)
df_Andrei_dummies

Unnamed: 0_level_0,Life form,Life form,Leaf position,Leaf position,Leaf position,Leaf position,Leaf position,Leaf composition,Leaf composition,Leaf composition,...,Aril colour,Aril colour,Aril colour,Aril colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour
Unnamed: 0_level_1,liana,tree,alternate,"alternate, opposite",opposite,"opposite, whorls of 3","opposite, whorls of 3, alternate",3 palmate,3-5 palmate,3-5 pinnate,...,orange,red,white,yellow-geen,black,brown,green,grey,white,whitish
Species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Avicennia germinans,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Metopium brownei,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Handroanthus billbergii,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
Bourreria succulenta,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bursera karsteniana,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bursera simaruba,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bursera tomentosa,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cynophalla flexuosa,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
Cynophalla hastata,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
Quadrella indica,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


#### Data Pierre

In [20]:
file_name = root + 'Dataset_Pierre.csv'
df_Pierre = pd.read_csv(file_name, header=[0, 1]) 
df_Pierre = df_Pierre.iloc[: , 1:]
df_Pierre = df_Pierre.set_index(df_Pierre['Species']['species'])
df_Pierre = df_Pierre.drop(columns=['Species', 'XXX'])

  df_Pierre = df_Pierre.drop(columns=['Species', 'XXX'])


#### Data Palms

In [21]:
file_name = root + 'Dataset_Kissling.txt'
df_Daniel = pd.read_csv(file_name,
                 sep='\t', encoding='Latin-1')
palm_species = df_Daniel[~df_Daniel.isnull().any(axis=1)]['SpecName'].values
df_Daniel.set_index('SpecName', inplace=True)

### RESHAPE DATA FOR PALMS

# Exclude string types
df_Daniels_int = df_Daniel.select_dtypes(exclude=[object])
df_Daniels_str = df_Daniel.select_dtypes(include=[object])
# Drop numbers
df_Daniels_semi_ints = df_Daniels_int.loc[:, df_Daniels_int.max() <= 3]
# Merge again
df_Daniel_edit = pd.merge(df_Daniels_str, df_Daniels_semi_ints, left_index=True, right_index=True)


# Real numbers:
df_Daniels_real_ints = df_Daniels_int.loc[:, df_Daniels_int.max() >= 3]
columns = [
    ("Measurement", "Maximum Stem Height in Meters"),
    ("Measurement", "Maximum Stem Diameter in Centimeters"),
    ("Measurement", "Maximum Leaf Number"),
    ("Measurement", "Maximum Leaf Blade Length in Meters"),
    ("Measurement", "Maximum Rachis Length in Meters"),
    ("Measurement", "Maximum Petiole Length in Meters"),
    ("Measurement", "Average Fruit Length in Centimeters"),
    ("Measurement", "Minimum Fruit Length in Centimeters"),
    ("Measurement", "Maximum Fruit Length in Centimeters"),
    ("Measurement", "Average Fruit Width in Centimeters"),
    ("Measurement", "Minimum Fruit Width in Centimeters"),
    ("Measurement", "Maximum Fruit Width in Centimeters"),
]

df_Daniels_real_ints.columns = pd.MultiIndex.from_tuples(columns)
df_Daniels_real_ints

# Get colors as lst of lsts 
FruitColorDescription_colors_lst = []

for palm_colors in df_Daniels_str['FruitColorDescription'].values:
    if type(palm_colors) == str:
        #print(type(colors))
        palm_colors = re.split(r'; |to | |-', palm_colors)

        #print(palm_colors)
        FruitColorDescription_colors_lst.append([color for color in palm_colors if color in colors])
    else:
        FruitColorDescription_colors_lst.append([])

MainFruitColors_colors_lst = []

for palm_colors in df_Daniels_str['MainFruitColors'].values:
    if type(palm_colors) == str:
        #print(type(colors))
        palm_colors = re.split(r'; |to | |-', palm_colors)

        #print(palm_colors)
        MainFruitColors_colors_lst.append([color for color in palm_colors if color in colors])
    else:
        MainFruitColors_colors_lst.append([])

# Init SKlearn MLB
mlb = MultiLabelBinarizer()

# Create dummies for color columns
df_FruitColorDescription = pd.DataFrame(
    {
        'FruitColorDescription': FruitColorDescription_colors_lst
    }, columns=['FruitColorDescription'])

s = df_FruitColorDescription['FruitColorDescription']
df_FruitColorDescription = pd.DataFrame(mlb.fit_transform(s),columns=mlb.classes_, index=df_Daniel.index)

# Multiindex columns
columns = [('Fruit Colour Description', column) for column in df_FruitColorDescription.columns]
df_FruitColorDescription.columns = pd.MultiIndex.from_tuples(columns)

# Create dummies for color columns
df_MainFruitColors = pd.DataFrame(
    {
        'MainFruitColors': MainFruitColors_colors_lst
    }, columns=['MainFruitColors'])

s = df_MainFruitColors['MainFruitColors']
df_MainFruitColors = pd.DataFrame(mlb.fit_transform(s),columns=mlb.classes_, index=df_Daniel.index)

# Multiindex columns
columns = [('Fruit Colour', column) for column in df_MainFruitColors.columns]
df_MainFruitColors.columns = pd.MultiIndex.from_tuples(columns)

df_Daniels_str_non_color = df_Daniels_str[['UnderstoreyCanopy', 'FruitSizeCategorical', 'FruitShape', 'Conspicuousness']]

# df_Daniels_str_non_color.columns = pd.MultiIndex.from_tuples(
#     [
#         ('Crown', 'UnderstoreyCanopy'),
#         ('Fruit Size', 'FruitSizeCategorical'),
#         ('Fruit Shape', 'FruitShape'),
#         ('Conspicuousness', 'Conspicuousness'),
#     ]
# )

df_Daniels_str_non_color_dummies = pd.get_dummies(df_Daniels_str_non_color)
columns = []
for column in df_Daniels_str_non_color_dummies.columns:
    level0, level1 = column.split('_')
    if level0 == 'UnderstoreyCanopy':
        level0 = 'Crown'
    elif level0 == 'FruitSizeCategorical':
        level0 = 'Fruit Size'
    elif level0 == 'FruitShape':
        level0 = 'Fruit Shape'
    elif level0 == 'Conspicuousness':
        level0 = 'Conspicuousness'
    columns.append((level0, level1))
    
df_Daniels_str_non_color_dummies.columns = pd.MultiIndex.from_tuples(columns)

### JOIN ALL DATA
data_frames = [df_FruitColorDescription, df_MainFruitColors, df_Daniels_str_non_color_dummies, df_Daniels_real_ints]
df_Daniel_merged = pd.concat(data_frames, axis=1)
df_Daniel_merged

Unnamed: 0_level_0,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,...,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement
Unnamed: 0_level_1,black,blue,bluish,brown,brownish,chocolate,coral,crimson,darkgreen,green,...,Maximum Leaf Number,Maximum Leaf Blade Length,Maximum Rachis Length,Maximum Petiole Length,Average Fruit Length,Minimum Fruit Length,Maximum Fruit Length,Average Fruit Width,Minimum Fruit Width,Maximum Fruit Width
SpecName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Acanthophoenix crinita,1,0,0,0,0,0,0,0,0,0,...,15.0,2.30,,,0.65,0.6,0.7,0.50,,
Acanthophoenix rousselii,1,0,0,0,0,0,0,0,0,0,...,,3.00,,,2.00,,,0.80,,
Acanthophoenix rubra,1,0,0,0,0,0,0,0,0,0,...,20.0,3.10,3.0,,1.00,,,0.70,,
Acoelorrhaphe wrightii,1,0,0,1,0,0,0,0,0,0,...,25.0,1.30,0.7,0.65,0.70,,,0.70,0.5,0.9
Acrocomia aculeata,0,0,0,0,0,0,0,0,0,1,...,30.0,3.50,2.5,,4.25,3.5,5.0,4.60,3.8,5.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wettinia quinaria,0,0,0,1,0,0,0,0,0,1,...,6.0,5.50,3.2,1.35,2.75,2.0,3.5,2.00,1.5,2.5
Wettinia radiata,0,0,0,0,0,0,0,0,0,0,...,6.0,4.33,3.4,,4.00,3.0,4.0,2.00,,
Wettinia verruculosa,0,0,0,1,0,0,0,0,0,1,...,6.0,6.00,3.7,2.40,2.50,,,1.50,,
Wodyetia bifurcata,0,0,0,0,0,0,0,0,0,0,...,10.0,3.20,2.3,1.02,5.75,5.0,6.5,4.35,2.7,6.0


## Prediction Data

### Load Data

In [22]:
root = "../../data/processed/"
sentences_all = {}

f = open(F"{root}Sentences_Pierre.pkl", 'rb')
sentences_Pierre = pickle.load(f)
sentences_all |= sentences_Pierre 

f = open(F"{root}Sentences_Andrei.pkl", 'rb')
sentences_Andrei = pickle.load(f)
sentences_all |= sentences_Andrei 

f = open(F"{root}Sentences_Kissling.pkl", 'rb')
sentences_Kissling = pickle.load(f)
sentences_all |= sentences_Kissling 

# Drop duplicates
for species, sentences in sentences_all.items():
    sentences_all[species] = list(set(sentences))


In [41]:
sent1 = 'The species is a small shrub'
sent2 = 'life form or shrub'

sent1.split(' ')
jaccard_similarity(sent1.split(' '), sent2.split(' '))

0.1111111111111111

### Match Data

In [36]:
k= 5
google_form_lst = []

for idx, (species, sentences) in enumerate(tqdm(sentences_all.items())):

    # if idx < 400:
    #     continue
    if species != 'Dypsis thiryana':
        continue
    
    df_select = [df_Andrei_dummies, df_Pierre, df_Daniel_merged]
    if species in df_Andrei_dummies.index:
        df_select = df_select[0]
    elif species in df_Pierre.index:
        df_select = df_select[1]
    elif species in df_Daniel_merged.index:
        df_select = df_select[2]
    else:
        raise ValueError

    for gt_main_trait in df_select.columns.get_level_values(0).unique():
        if gt_main_trait == 'Measurement':
            df_subset = df_select[df_select.index == species][gt_main_trait]
            # Not really efficient, use PD?
    
            subtraits = list(df_subset.columns)
            values = df_subset.values[0]
            for sub_trait, value in zip(subtraits, values):

                df_sent  = F"{sub_trait} {value}"

                top_list = []

                for sentence in sentences:
                    gt_sim = jaccard_similarity(sentence, df_sent)
                    top_list.append((gt_sim, sentence))

                top_list.sort(reverse=True)
                top_k_list = [sentence for (_, sentence) in top_list[0:k]]

                google_form_lst.append((species, gt_main_trait, sub_trait, [df_sent], *top_k_list, df_sent.capitalize()))
        else:
            df_subset = df_select[df_select.index == species][gt_main_trait]
            present_traits = df_subset.loc[:, df_subset.any()].columns.values
            df_sent  = ' '.join(gt_main_trait + ' ' + present_traits)

            # print(df_sent)

            top_list = []

            for sentence in sentences:
                gt_sim = jaccard_similarity(sentence, df_sent)
                top_list.append((gt_sim, sentence))

            top_list.sort(reverse=True)

            print(top_list)

            top_k_list = [sentence for (_, sentence) in top_list[0:k]]

            GoogleSent = gt_main_trait + ': ' + ', '.join(list(present_traits))

            google_form_lst.append((species, gt_main_trait, gt_main_trait, list(present_traits), *top_k_list, GoogleSent.capitalize()))

100%|██████████| 647/647 [00:00<00:00, 20737.86it/s]

[(0.5, 'It is about 40cms tall and is more glossy green than it appears in the photos.'), (0.5, 'It has bifurcated , hard to tell from pics.'), (0.5, 'Clustering palm in tufts of 2-4.'), (0.48148148148148145, 'The from is extremely thin, almost less than a pencil, and the leaves are also very widely spaced and much thinner than an lutescens.'), (0.48, 'Clustering palm, in tufts of 2-4.'), (0.48, '.The larger one is soon to have a pair of inflorescence hanging off it as the second begins to emerge.'), (0.4782608695652174, 'A bit concerned as it has a yellowish look'), (0.44, 'This is an extremely thin palm with extremely thin fronds.'), (0.42105263157894735, 'Groundcovers.'), (0.4166666666666667, 'Leaves. pinnate with two or more leaflets.'), (0.39285714285714285, 'Pinnate leaves, leaflets unequally jagged and toothed at apex.'), (0.391304347826087, 'Neither one showing sign of new spear.'), (0.2777777777777778, 'Present.'), (0.2222222222222222, 'Clear.'), (0.1111111111111111, 'Rank: SE




### Create Subset

In [11]:
# Google forms
df_google = pd.DataFrame(google_form_lst, columns=['Species', 'Main Trait', 'GT Main Trait', 'GT Sub Traits', '1', '2', '3', '4', '5', 'GoogleSentence'])
df_google

Unnamed: 0,Species,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5,GoogleSentence
0,Acacia amythethophylla,plant type,plant type,"[Tree, Shrub]",Shrub or tree to 7 m. Thorns in pairs at nodes...,The branch bark is apparently 1.5% approx.,"Perennial, Not climbing, Shrub/Tree.",Flower-heads abundant golden yellow.,"Inflorescence large, terminal, much-branched, ...","Plant type: tree, shrub"
1,Acacia amythethophylla,phyllotaxis,phyllotaxis,[Phyllotaxis alternate],The branch bark is apparently 1.5% approx.,"Pinnae sometimes 20 or more pairs, with 50 or ...","Inflorescence large, terminal, much-branched, ...",Acacia confusa - Stem Smaller peaks suspected ...,Flower-heads abundant golden yellow.,Phyllotaxis: phyllotaxis alternate
2,Acacia amythethophylla,trunk and root,trunk and root,[Base of trunk straight],"Fruits flat, reddish-brown, remaining on the t...","Fruit flat, dehiscent.",Young plants 6-8 months old from seed.,Shrub or tree to 7 m. Thorns in pairs at nodes...,Flower-heads abundant golden yellow.,Trunk and root: base of trunk straight
3,Acacia amythethophylla,latex,latex,[No exudate],Flower-heads abundant golden yellow.,"Fruit flat, dehiscent.",Young plants 6-8 months old from seed.,"Perennial, Not climbing, Shrub/Tree.","Fruits flat, reddish-brown, remaining on the t...",Latex: no exudate
4,Acacia amythethophylla,crown,crown,[Horizontally spreading crowns],"Inflorescence large, terminal, much-branched, ...","Stipules spinescent, stout, brown, glossy, com...","Flowers orange or yellow, strongly and sweetly...",Flower-heads abundant golden yellow.,Acacia confusa - Stem Smaller peaks suspected ...,Crown: horizontally spreading crowns
...,...,...,...,...,...,...,...,...,...,...
13681,Wodyetia bifurcata,Measurement,Minimum Fruit Length,[Minimum Fruit Length 5.0],Mature height averages 25 feet.,Immature green fruit of the Foxtail Palm.,Fruit showing remains of style.,More than 15 m.- Less than 15 m. Thorns.,Fruits : 2 inches long.,Minimum fruit length 5.0
13682,Wodyetia bifurcata,Measurement,Maximum Fruit Length,[Maximum Fruit Length 6.5],Immature green fruit of the Foxtail Palm.,Foxtail Palm Tree: Large Multi Stem.,Orange fallen fruit below the Foxtail Palm.,Mature height averages 25 feet.,Fruit showing remains of style.,Maximum fruit length 6.5
13683,Wodyetia bifurcata,Measurement,Average Fruit Width,[Average Fruit Width 4.35],Mature height averages 25 feet.,Washingtonia robusta Agave americana Scroll to...,Leaf color is green to dull green with no silv...,Fruit on ground below Foxtail.,"Leaves: Pinnate , reduplicate , arching, to 3 ...",Average fruit width 4.35
13684,Wodyetia bifurcata,Measurement,Minimum Fruit Width,[Minimum Fruit Width 2.7],Mature red fruit on Wodyetia.,"Fruit is 2.inches long, deep orange to deep re...",Palm Fruits: Red when ripe.,Fruits : 2 inches long.,"Fruits are 2 inches long, egg shaped, and oran...",Minimum fruit width 2.7


In [12]:
for i in range(10):
    df_google_subset = df_google\
                            .dropna()\
                            .sample(n=20, axis=0, random_state=333+i)

    df_google_subset.to_csv(f'{root}top_sents_all_AllSentencesAgainstTrait_Random20Subset_{i+1}.csv', sep='\t')



# df_google_subset = df_google\
#                     .dropna()\
#                     .sample(n=20, axis=0, random_state=333)\

# df_google_subset