In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn import metrics
import json
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet as wn
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
import collections
from itertools import groupby
import pickle
import re
import matplotlib.colors as mcolors
import warnings
import requests
import csv

### Load the Data

In [2]:
root = "../../data/external/"

# Data Pierre
file_name = root + 'Dataset_Pierre.csv'
df_Pierre = pd.read_csv(file_name, header=[0, 1]) 
df_Pierre = df_Pierre.iloc[: , 1:]
df_Pierre = df_Pierre.set_index(df_Pierre['Species']['species'])

# Data Andrei
file_name = root + 'Dataset_Andrei.csv'
df_Andrei = pd.read_csv(file_name)

# Get Dummies to match DF Pierre
df_Andrei_dummies = pd.get_dummies(df_Andrei.iloc[:, 2:])
# Set species back
df_Andrei_dummies = df_Andrei_dummies.set_index(df_Andrei['Species'])

# Create tuple list for multi index
Andrei_multi_index = []
for top_index in df_Andrei.columns:
    for sub_index in df_Andrei_dummies.columns:
        if top_index in sub_index:

            sub_index = sub_index.split('_')[-1]
            Andrei_multi_index.append((top_index, sub_index))

# Set Mutli index
df_Andrei_dummies.columns = pd.MultiIndex.from_tuples(Andrei_multi_index)

# Data Palms
file_name = root + 'Dataset_Kissling.txt'
df_Daniel = pd.read_csv(file_name,
                 sep='\t', encoding='Latin-1')
palm_species = df_Daniel[~df_Daniel.isnull().any(axis=1)]['SpecName'].values
df_Daniel.set_index('SpecName', inplace=True)

### Helper Functions

In [3]:
def jaccard_similarity(A, B):
    """Calculates the Jaccard similarity two sets.

    Args:
        A (Set): Set A
        B (Set): Set B

    Returns:
        Integer: 0.00 - 1.00
    """

    # Create sets just in case
    A = set(A)
    B = set(B)
    
    # Get intersection of two sets
    nominator = A.intersection(B)

    # Find union of two sets
    denominator = A.union(B)

    # Take the ratio of sizes
    similarity = len(nominator)/len(denominator)

    return similarity

nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}
colors = [color[4:] for color in mcolors.TABLEAU_COLORS.keys()]
colors.extend([color for color in mcolors.CSS4_COLORS.keys()])
colors.extend(
    [
        'whitish', 'bluish', 'reddish', 'greenish', 'backish', 'greyish',
        'backish', 'purplish', 'yellowish', 'orangish', 'brownish', 'pinkish'
    ]
)

def resentesize(lst):

    sentence = ''

    # Loop every 2 items 
    for item1, item2 in zip(lst[::2], lst[1::2]):
        
        # Set verb
        verb = 'has'
        chunk = item2.split(' ')[-1]
        if chunk not in nouns or chunk in colors:
            verb = 'is'

        sentence += f'{item1.capitalize()} {verb} {item2}. '
    
    return sentence


def get_wiki_main_image(title):
    url = 'https://en.wikipedia.org/w/api.php'
    data = {
        'action' :'query',
        'format' : 'json',
        'formatversion' : 2,
        'prop' : 'pageimages|pageterms',
        'piprop' : 'original',
        'titles' : title
    }
    response = requests.get(url, data)
    json_data = json.loads(response.text)
    return json_data['query']['pages'][0]['original']['source'] if len(json_data['query']['pages']) >0 else 'Not found'


## Retokize the Data

In [4]:
# Init empty dict for retok
species_datalist = collections.defaultdict(list)

root = "../../data/processed/"
file_name = root + 'Triples_Pierre.txt'

with open(file_name) as f:
   json_data = json.load(f)

for idx, (k, lst) in enumerate(json_data.items()):
    for l in lst:

        # Flatten list of lists
        flat_list = [item for sublist in l for item in sublist]
        # Retokens some sentence with multiple 'species'
        flat_list_multiple_sents = [list(v) for k, v in groupby(flat_list, lambda x: x != 'species') if k]
        
        for sentence in flat_list_multiple_sents:
            # Insert species
            sentence.insert(0, 'species')
            species_datalist[k].append(sentence)

file_name = root + 'Triples_Andrei.txt'

with open(file_name) as f:
   json_data = json.load(f)

for idx, (k, lst) in enumerate(json_data.items()):
    for l in lst:

        # Flatten list of lists
        flat_list = [item for sublist in l for item in sublist]
        # Retokens some sentence with multiple 'species'
        flat_list_multiple_sents = [list(v) for k, v in groupby(flat_list, lambda x: x != 'species') if k]
        
        for sentence in flat_list_multiple_sents:
            # Insert species
            sentence.insert(0, 'species')
            species_datalist[k].append(sentence)


## Match Data

In [5]:
k= 5
google_form_lst = []

for idx, (species, sentences) in enumerate(tqdm(species_datalist.items())):

    # if idx >= 1:
    #     continue
    
    df_select = [df_Andrei_dummies, df_Pierre]
    if species in df_Andrei_dummies.index:
        df_select = df_select[0]
    elif species in df_Pierre.index:
        df_select = df_select[1]
    else:
        raise ValueError

    for gt_main_trait in df_select.columns.get_level_values(0).unique():
        df_subset = df_select[df_select.index == species][gt_main_trait]
        present_traits = df_subset.loc[:, df_subset.any()].columns.values
        df_sent  = ' '.join(gt_main_trait + ' ' + present_traits)

        top_list = []

        for sentence in sentences:
            # sentence = ' '.join(sentence)
            sentence = resentesize(sentence)
            gt_sim = jaccard_similarity(sentence, df_sent)
            top_list.append((gt_sim, sentence))

        top_list.sort(reverse=True)
        top_k_list = [sentence for (_, sentence) in top_list[0:k]]

        google_form_lst.append((species, gt_main_trait, gt_main_trait, list(present_traits), *top_k_list))

100%|██████████| 401/401 [00:41<00:00,  9.66it/s]


In [6]:
# Google forms
df_google = pd.DataFrame(google_form_lst, columns=['Species', 'Main Trait', 'GT Main Trait', 'GT Sub Traits', '1', '2', '3', '4', '5'])
df_google

Unnamed: 0,Species,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5
0,Acacia amythethophylla,Species,Species,[species],Species is seeds. Seeds has seed. Seed has ceb...,Species has seed. Seed has pod. Pod has seed p...,Species has seed. Seed has dale. Dale has seed...,Species is seeds. Seeds has seed. Seed has sto...,Species is pinnae. Pinnae is pinnae. Pinnae is...
1,Acacia amythethophylla,plant type,plant type,"[Tree, Shrub]",Species has trunk. Trunk has bark. Bark has tr...,Species is soils. Soils has soil. Soil is poor...,Species has plant. Plant has sugar. Sugar has ...,Species has trunk. Trunk has bark. Bark has tr...,Species has fruit. Fruit has fruit. Fruit has ...
2,Acacia amythethophylla,phyllotaxis,phyllotaxis,[Phyllotaxis alternate],Species is soils. Soils has soil. Soil is poor...,Species is plants. Plants has plant. Plant has...,Species has plant. Plant has plant. Plant has ...,Species has plant. Plant has analysis. Analysi...,Species has plant. Plant has analysis. Analysi...
3,Acacia amythethophylla,trunk and root,trunk and root,[Base of trunk straight],Species has stem. Stem has bark. Bark has stem...,Species has trunk. Trunk has bark. Bark has tr...,Species has root. Root has bark. Bark has root...,Species is fruits. Fruits has fruit. Fruit is ...,Species has stem. Stem has bark. Bark has stem...
4,Acacia amythethophylla,latex,latex,[No exudate],Species has plant. Plant has plant. Plant has ...,Species is stipules. Stipules has stipule. Sti...,Species has plant. Plant has plant. Plant has ...,Species has stem. Stem has extraction. Extract...,Species has stem. Stem has bark. Bark has stem...
...,...,...,...,...,...,...,...,...,...
9978,Guaiacum sanctum,Fruit type,Fruit type,[capsule],Species is fruits. Fruits has palm. Palm has f...,Species is flowers. Flowers has flower. Flower...,Species has soil. Soil has soil. Soil has drou...,Species is fruits. Fruits has fruit. Fruit is ...,Species is tissues. Tissues has tissue. Tissue...
9979,Guaiacum sanctum,Fruit shape,Fruit shape,"[3-5-locular, globose]",Species is flowers. Flowers has flower. Flower...,Species is flowers. Flowers has flower. Flower...,Species is flowers. Flowers has flower. Flower...,Species is flowers. Flowers has flower. Flower...,Species is flowers. Flowers has flower. Flower...
9980,Guaiacum sanctum,Fruit colour,Fruit colour,"[yellow, orange]",Species has fruit. Fruit has color. Color has ...,Species is flowers. Flowers has flower. Flower...,Species is flowers. Flowers has flower. Flower...,Species has gametophyte. Gametophyte has gener...,Species has fruit. Fruit has fruit. Fruit has ...
9981,Guaiacum sanctum,Aril colour,Aril colour,[red],Species is plants. Plants has plant. Plant is ...,Species is plants. Plants has plant. Plant is ...,Species has soil. Soil has soil. Soil has drou...,Species is soils. Soils has soil. Soil has dra...,Species has soil. Soil has soil. Soil has drain.


In [7]:
df_google.to_csv(f'{root}top_sents_all_AllSentencesAgainstTrait.csv')

In [8]:
df_google_subset = df_google\
                    .dropna()\
                    .sample(n=20, axis=0, random_state=123)\

df_google_subset

Unnamed: 0,Species,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5
9864,Casearia tremula,Leaf position,Leaf position,[alternate],Species has plant. Plant has plant. Plant has ...,Species is plants. Plants has plant. Plant of ...,Species is seeds. Seeds has seed. Seed is angl...,Species is plants. Plants has plant. Plant has...,Species has style. Style has style. Style is s...
9427,Erythroxylum havanense,Fruit shape,Fruit shape,[elongate],Species is plants. Plants has plant. Plant is ...,Species is plants. Plants has plant. Plant has...,Species is plants. Plants has plant. Plant and...,Species is plants. Plants has plant. Plant and...,Species is plants. Plants has plant. Plant is ...
3594,Fadogia erythrophloea,fruit,fruit,"[Berry, or berry-like fruit]",Species is flowers. Flowers has flower. Flower...,Species is flowers. Flowers has flower. Flower...,Species is buds. Buds has bud. Bud is peduncul...,Species is fruits. Fruits has fruit. Fruit has...,Species is plants. Plants has plant. Plant is ...
7511,Securidaca longipedunculata,leaf margin,leaf margin,[Smooth leaf margin],Species is plants. Plants has plant. Plant of ...,Species has plant. Plant has form. Form has gr...,Species has fruit. Fruit has fruit. Fruit has ...,Species has plant. Plant has plant. Plant is f...,Species is plants. Plants has plant. Plant of ...
2968,Detarium microcarpum,blade color,blade color,[Leaf blade discoloured],Species has seed. Seed has flour. Flour has se...,Species is seeds. Seeds has flour. Flour has s...,Species has seed. Seed has flour. Flour has se...,Species has seed. Seed has flour. Flour has se...,Species has seed. Seed has flour. Flour has se...
2136,Cola cordifolia,leaf blade,leaf blade,"[orbicular blade, lobed blade]",Species has flower. Flower is colour. Colour i...,Species has fruit. Fruit has fruit. Fruit has ...,Species is plants. Plants has plant. Plant has...,Species has fruit. Fruit has fruit. Fruit is s...,Species has fruit. Fruit has fruit. Fruit is r...
3250,Erythrina sigmoidea,Species,Species,[species],Species is cells. Cells has cell. Cell has cem...,Species is cells. Cells has line. Line has cel...,Species has seed. Seed has seed. Seed has coll...,Species has cell. Cell has line. Line has cell...,Species is soils. Soils has soil. Soil is rocky.
8902,Ziziphus abyssinica,trunk and root,trunk and root,[Base of trunk straight],Species has root. Root has bark. Bark has root...,Species has soil. Soil has soil. Soil has deep...,Species is plants. Plants has plant. Plant has...,Species has root. Root has bark. Bark has root...,Species is fruits. Fruits has fruit. Fruit has...
7434,Sarcocephalus latifolius,petiole,petiole,[Petiolated leaf],Species has plant. Plant has plant. Plant has ...,Species has plant. Plant has plant. Plant is t...,Species has plant. Plant has plant. Plant is t...,Species is plants. Plants has plant. Plant is ...,Species is tissues. Tissues has tissue. Tissue...
9497,Libidibia coriaria,Stamen number,Stamen number,[10],Species is plants. Plants has plant. Plant has...,Species is plants. Plants has plant. Plant is ...,Species is plants. Plants has plant. Plant and...,Species is plants. Plants has plant. Plant and...,Species is petals. Petals has petal. Petal is ...


In [9]:
random_sample_species = df_google_subset['Species'].values

images_links = {}
for species in tqdm(random_sample_species):
    species_ = species.replace(' ', '_')
    try:
        img_url = get_wiki_main_image(species_)
        if img_url[-3:] != 'jpg':
            img_url = 'https://upload.wikimedia.org/wikipedia/commons/1/14/No_Image_Available.jpg'
        # if img_url[-3:].isupper():
        #     img_url = 'https://upload.wikimedia.org/wikipedia/commons/1/14/No_Image_Available.jpg'
        images_links[species] = img_url
    except:
        images_links[species] = 'https://upload.wikimedia.org/wikipedia/commons/1/14/No_Image_Available.jpg'

# df_google_subset = df_google[df_google['Species'].isin(random_sample_species)]
df_google_subset = df_google_subset.set_index('Species')
df_google_subset["URL"] = pd.Series(images_links)
df_google_subset.to_csv(f'{root}top_sents_all_AllSentencesAgainstTrait_Random20Subset.csv', sep='\t')#, quoting=csv.QUOTE_ALL)

100%|██████████| 20/20 [00:03<00:00,  5.61it/s]


In [10]:
df_google_subset

Unnamed: 0_level_0,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5,URL
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Casearia tremula,Leaf position,Leaf position,[alternate],Species has plant. Plant has plant. Plant has ...,Species is plants. Plants has plant. Plant of ...,Species is seeds. Seeds has seed. Seed is angl...,Species is plants. Plants has plant. Plant has...,Species has style. Style has style. Style is s...,https://upload.wikimedia.org/wikipedia/commons...
Erythroxylum havanense,Fruit shape,Fruit shape,[elongate],Species is plants. Plants has plant. Plant is ...,Species is plants. Plants has plant. Plant has...,Species is plants. Plants has plant. Plant and...,Species is plants. Plants has plant. Plant and...,Species is plants. Plants has plant. Plant is ...,https://upload.wikimedia.org/wikipedia/commons...
Fadogia erythrophloea,fruit,fruit,"[Berry, or berry-like fruit]",Species is flowers. Flowers has flower. Flower...,Species is flowers. Flowers has flower. Flower...,Species is buds. Buds has bud. Bud is peduncul...,Species is fruits. Fruits has fruit. Fruit has...,Species is plants. Plants has plant. Plant is ...,https://upload.wikimedia.org/wikipedia/commons...
Securidaca longipedunculata,leaf margin,leaf margin,[Smooth leaf margin],Species is plants. Plants has plant. Plant of ...,Species has plant. Plant has form. Form has gr...,Species has fruit. Fruit has fruit. Fruit has ...,Species has plant. Plant has plant. Plant is f...,Species is plants. Plants has plant. Plant of ...,https://upload.wikimedia.org/wikipedia/commons...
Detarium microcarpum,blade color,blade color,[Leaf blade discoloured],Species has seed. Seed has flour. Flour has se...,Species is seeds. Seeds has flour. Flour has s...,Species has seed. Seed has flour. Flour has se...,Species has seed. Seed has flour. Flour has se...,Species has seed. Seed has flour. Flour has se...,https://upload.wikimedia.org/wikipedia/commons...
Cola cordifolia,leaf blade,leaf blade,"[orbicular blade, lobed blade]",Species has flower. Flower is colour. Colour i...,Species has fruit. Fruit has fruit. Fruit has ...,Species is plants. Plants has plant. Plant has...,Species has fruit. Fruit has fruit. Fruit is s...,Species has fruit. Fruit has fruit. Fruit is r...,https://upload.wikimedia.org/wikipedia/commons...
Erythrina sigmoidea,Species,Species,[species],Species is cells. Cells has cell. Cell has cem...,Species is cells. Cells has line. Line has cel...,Species has seed. Seed has seed. Seed has coll...,Species has cell. Cell has line. Line has cell...,Species is soils. Soils has soil. Soil is rocky.,https://upload.wikimedia.org/wikipedia/commons...
Ziziphus abyssinica,trunk and root,trunk and root,[Base of trunk straight],Species has root. Root has bark. Bark has root...,Species has soil. Soil has soil. Soil has deep...,Species is plants. Plants has plant. Plant has...,Species has root. Root has bark. Bark has root...,Species is fruits. Fruits has fruit. Fruit has...,https://upload.wikimedia.org/wikipedia/commons...
Sarcocephalus latifolius,petiole,petiole,[Petiolated leaf],Species has plant. Plant has plant. Plant has ...,Species has plant. Plant has plant. Plant is t...,Species has plant. Plant has plant. Plant is t...,Species is plants. Plants has plant. Plant is ...,Species is tissues. Tissues has tissue. Tissue...,https://upload.wikimedia.org/wikipedia/commons...
Libidibia coriaria,Stamen number,Stamen number,[10],Species is plants. Plants has plant. Plant has...,Species is plants. Plants has plant. Plant is ...,Species is plants. Plants has plant. Plant and...,Species is plants. Plants has plant. Plant and...,Species is petals. Petals has petal. Petal is ...,https://upload.wikimedia.org/wikipedia/commons...
