In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn import metrics
import json
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet as wn
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
import collections
from itertools import groupby
import pickle
import re
import matplotlib.colors as mcolors
import warnings
import requests
import csv
from sklearn.preprocessing import MultiLabelBinarizer
from functools import reduce

### Global Variables & Functions

#### Variables

In [2]:
nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}

colors = [color[4:] for color in mcolors.TABLEAU_COLORS.keys()]
colors.extend([color for color in mcolors.CSS4_COLORS.keys()])
colors.extend(
    [
        'whitish', 'bluish', 'reddish', 'greenish', 'backish', 'greyish',
        'backish', 'purplish', 'yellowish', 'orangish', 'brownish', 'pinkish'
    ]
)



traits_dict = {
    'Life Form':
    [
        'Tree', 'Shrub', 'Bush', 'Ficus', 'Strangler', 'Liana', 'Parasitic', 'Palm', 'Herbaceous'
    ],
    'Trunk':
    [
        'Trunk', 'Straight', 'Flared', 'Foothills', 'Silt', 'Aerial'
    ],
    'Root':
    [
        'Root', 'Straight', 'Flared', 'Foothills', 'Silt', 'Aerial'
    ],
    'Latex':
    [
        'Latex'
    ],
    'Phyllotaxis': # Leaf Position
    [
        'Phyllotaxis', 'Alternate', 'Whorled', 'Whorls', 'Opposite'
    ],
    'Leaf Composition':
    [
        'Palmate', 'Pinnate', 'Entire', 'Bi-pinnate'
    ],
    'Crown':
    [
        'Crown'
    ],
    'Stem':
    [
        'Stem', 'Circular', 'Square'
    ],
    'Bark':
    [
        'Bark'
    ],
    'Bark Colour':
    [
        'Bark'
    ],
    'Leaf Shape':
    [
        'Simple', 'Bifoliate', 'Trifoliate', 'Digitized', 'Paripinnate', 'Unipinnate', 'Imperipinnate', 
        'Alternate', 'Bipinnate', 'Pinnate', 'Elliptic', 'Elongate', 'Ovate', 'Round', 'Obovate', 'Lanceolate',
        'Kidney-shaped', 'Heart-shaped', 'Spathulate'
    ],
    'Petiole':
    [
        'Petiole', 'Sessile', 'Petiolated', 'Canaliculate', 'Glands', 'Glandular', 
     'Winged' 'Wings', 'Hairs', 'Hair', 'Translucent'
     ],
    'Leaf Colour':
    [
        'Leaf Colour', 'Leaf Color'
    ],
    'Leaf Blade':
    [
        'Leaf Blade', 'Linear', 'Lanceolate', 'Elliptical', 'Obovate', 'Obtriangular', 
        'Obtriangular', 'Asymmetrical', 'Orbicular', 'Bilobed', 'Lobed', 'Lobes', 'Lobe'
    ],
    'Leaf Base':
    [
        'Leaf Base', 'Rounded', 'Cordate', 'Glands'
    ],
    'Leaf Margin':
    [
        'Margin', 'Smooth', 'Wavy', 'Crenate', 'Toothed', 'Teeth', 'Crenate', 'Serrate'
    ],
    'Leaf Apex':
    [
        'Apex', 'Acuminate', 'Apiculate', 'Mucronate', 'Rounded', 'Emarginated'
    ],
    'Leaf side':
    [
        'Glabrous', 'Pubescent', 'Salt Crystals', 'Scales', 'Woolly', 'Powdery'
    ],
    'Leaf glands':
    [
        'Glands', 'Gland', 'Translucent'
    ],
    'Rachis':
    [
        'Rachis', 'Winged'
    ],
    'Vein':
    [
        'Vein'
    ],
    'Tendril':
    [
        'Tendril'
    ],
    'Spine':
    [
        'Spine', 'Prickle', 'Spines', 'Prickles'
    ],
    'Thornes':
    [
        'Thorn', 'Thornes'
    ],
    'Blade Colour':
    [
        'Blade'
    ],
    'Fruit':
    [
        'Drupe', 'Berry', 'Capsule', 'Pod', 'Follicle', 'Achene', 'Winged', 'Follicle',
        'Pod', 'Nutlet', 'Fruit'
    ],
    'Fruit Shape':
    [
        'locular', 'Globose', 'Flattened', 'Elongate', 'Obovoid', 'Ovate', 'Twisted',
        'Curved', 'Pyriform', 'Ovoid'
    ],
    'Fruit Colour':
    [
        'Fruit'
    ],
    'Inflorescences':
    [
        'Inflorescences', 'Inflorescence', 'Sessile', 'Panicle', 'Flower head', 'Cyme', 'Glomerule', 
        'Fascicle', 'Umbel', 'Corymb', 'Rootlet', 'Spike', 'Dichasium', 'Fascicle',
        'Globose', 'Raceme', 'Fascicle', 'Umbel'
     ],
    'Sexuality':
    [
        'Sexuality', 'Axillary', 'Terminal'
    ],
    'Flower Colour':
    [
        'Flower colour', 'Flower color', 'Flower', 'Flowers'
    ],
    'Flower Shape':
    [
        'Flower shape', 'Petalled', 'Petal', 'Petals', 'Tubular', 'Apetal', 'Butterfly-shaped', 'Shaped', 'Flower', 'Flowers'
    ],
    'Sepal Shape':
    [
        'Sepal', 'Sepals', 'Connate'
    ],
    'Petal Shape':
    [
        'Petal', 'Petals', 'Tepals', 'Tepal', 'Tubular'
    ],
    'Aril Colour':
    [
        'Aril'
    ],
    'Seed Colour':
    [
        'Seed', 
    ],
    'Conspicuousness':
    [
        'Conspicuousness', 'Cryptic'
    ]
}

with open('../../data/supportive/traits_Pierre_and_Andrei.json', 'w') as f:
    json.dump(traits_dict, f)
with open('../../data/supportive/colour_list.json', 'w') as f:
    json.dump(colors, f)

traits_list = list(traits_dict.keys())
traits_list += [trait.lower() for lst in list(traits_dict.values()) for trait in lst]

#### Functions

In [3]:
def corresponding_keys(val, dictionary):
    """returns the corresponding key of a single value 
    assuming the values are lists.

    Args:
        val (string): string present in the dict
        dictionary (dict): dict with lists of stings as values

    Returns:
        list: list of matching keys
    """
    # Init list
    keys = []
    # Search the dict
    for k, v in dictionary.items():
        if val in v:
            keys.append(k)
    return keys
    
def jaccard_similarity(A, B):
    """Calculates the Jaccard similarity two sets.

    Args:
        A (Set): Set A
        B (Set): Set B

    Returns:
        Integer: 0.00 - 1.00
    """

    # Create sets just in case
    A = set(A)
    B = set(B)
    
    # Get intersection of two sets
    nominator = A.intersection(B)

    # Find union of two sets
    denominator = A.union(B)

    # Take the ratio of sizes
    similarity = len(nominator)/len(denominator)

    return similarity


def similarity(groundtruth, pred):
    """Calculates the normal similarity between two sets.

    Args:
        A (Set): Set A
        B (Set): Set B

    Returns:
        Integer: 0.00 - 1.00
    """

    # Create sets just in case
    groundtruth = set(groundtruth)
    pred = set(pred)

    # Find intersection of two sets
    nominator = groundtruth.intersection(pred)

    # Find union of two sets
    denominator = groundtruth

    # Take the ratio of sizes
    similarity = len(nominator)/(len(denominator))

    return similarity

def resentesize(lst):

    sentence = ''

    # Loop every 2 items 
    for item1, item2 in zip(lst[::2], lst[1::2]):
        
        # Set verb
        verb = 'has'
        chunk = item2.split(' ')[-1]
        if chunk not in nouns or chunk in colors:
            verb = 'is'

        sentence += f'{item1.capitalize()} {verb} {item2}. '
    
    return sentence


def get_wiki_main_image(title):
    url = 'https://en.wikipedia.org/w/api.php'
    data = {
        'action' :'query',
        'format' : 'json',
        'formatversion' : 2,
        'prop' : 'pageimages|pageterms',
        'piprop' : 'original',
        'titles' : title
    }
    response = requests.get(url, data)
    json_data = json.loads(response.text)
    return json_data['query']['pages'][0]['original']['source'] if len(json_data['query']['pages']) >0 else 'Not found'


### Load GT Data

In [4]:
root = "../../data/external/"

#### Data Andrei

In [5]:
file_name = root + 'Dataset_Andrei.csv'
df_Andrei = pd.read_csv(file_name)

# Get Dummies to match DF Pierre
df_Andrei_dummies = pd.get_dummies(df_Andrei.iloc[:, 2:])
# Set species back
df_Andrei_dummies = df_Andrei_dummies.set_index(df_Andrei['Species'])

# Create tuple list for multi index
Andrei_multi_index = []
for top_index in df_Andrei.columns:
    for sub_index in df_Andrei_dummies.columns:
        if top_index in sub_index:

            sub_index = sub_index.split('_')[-1]
            Andrei_multi_index.append((top_index, sub_index))

# Set Mutli index
df_Andrei_dummies.columns = pd.MultiIndex.from_tuples(Andrei_multi_index)
df_Andrei_dummies

Unnamed: 0_level_0,Life form,Life form,Leaf position,Leaf position,Leaf position,Leaf position,Leaf position,Leaf composition,Leaf composition,Leaf composition,...,Aril colour,Aril colour,Aril colour,Aril colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour
Unnamed: 0_level_1,liana,tree,alternate,"alternate, opposite",opposite,"opposite, whorls of 3","opposite, whorls of 3, alternate",3 palmate,3-5 palmate,3-5 pinnate,...,orange,red,white,yellow-geen,black,brown,green,grey,white,whitish
Species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Avicennia germinans,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Metopium brownei,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Handroanthus billbergii,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
Bourreria succulenta,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bursera karsteniana,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bursera simaruba,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bursera tomentosa,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cynophalla flexuosa,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
Cynophalla hastata,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
Quadrella indica,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


#### Data Pierre

In [6]:
file_name = root + 'Dataset_Pierre.csv'
df_Pierre = pd.read_csv(file_name, header=[0, 1]) 
df_Pierre = df_Pierre.iloc[: , 1:]
df_Pierre = df_Pierre.set_index(df_Pierre['Species']['species'])
df_Pierre

Unnamed: 0_level_0,Species,plant type,plant type,plant type,plant type,plant type,plant type,plant type,plant type,phyllotaxis,...,flower color,flower color,flower shape,flower shape,flower shape,flower shape,flower shape,flower shape,flower shape,flower shape
Unnamed: 0_level_1,species,Tree,Shrub,Bush,Ficus strangler,Liana,Parasitic,Palm tree,Herbaceous,Phyllotaxis alternate,...,Yellow coloured flower,Green coloured flower,Single-petalled flower,Three-petalled flower,Four-petalled flower,Five-petalled flower,Flower with 6 or more petals,Tubular flower,Butterfly-shaped flower,Apetal flower
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Acacia amythethophylla,Acacia amythethophylla,1,1,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,1,0,0
Acacia ataxacantha,Acacia ataxacantha,1,1,1,0,1,0,0,0,1,...,1,0,0,0,0,1,0,1,0,0
Acacia dudgeoni,Acacia dudgeoni,1,1,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,1,0,0
Acacia ehrenbergiana,Acacia ehrenbergiana,1,1,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,1,0,0
Acacia erythrocalyx,Acacia erythrocalyx,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ziziphus abyssinica,Ziziphus abyssinica,1,1,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
Ziziphus lotus ssp. saharae,Ziziphus lotus ssp. saharae,0,1,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
Ziziphus mauritiana,Ziziphus mauritiana,1,1,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
Ziziphus mucronata,Ziziphus mucronata,1,1,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0


#### Data Palms

In [7]:
file_name = root + 'Dataset_Kissling.txt'
df_Daniel = pd.read_csv(file_name,
                 sep='\t', encoding='Latin-1')
palm_species = df_Daniel[~df_Daniel.isnull().any(axis=1)]['SpecName'].values
df_Daniel.set_index('SpecName', inplace=True)

### RESHAPE DATA FOR PALMS

# Exclude string types
df_Daniels_int = df_Daniel.select_dtypes(exclude=[object])
df_Daniels_str = df_Daniel.select_dtypes(include=[object])
# Drop numbers
df_Daniels_semi_ints = df_Daniels_int.loc[:, df_Daniels_int.max() <= 3]
# Merge again
df_Daniel_edit = pd.merge(df_Daniels_str, df_Daniels_semi_ints, left_index=True, right_index=True)

# Get colors as lst of lsts 
FruitColorDescription_colors_lst = []

for palm_colors in df_Daniels_str['FruitColorDescription'].values:
    if type(palm_colors) == str:
        #print(type(colors))
        palm_colors = re.split(r'; |to | |-', palm_colors)

        #print(palm_colors)
        FruitColorDescription_colors_lst.append([color for color in palm_colors if color in colors])
    else:
        FruitColorDescription_colors_lst.append([])

MainFruitColors_colors_lst = []

for palm_colors in df_Daniels_str['MainFruitColors'].values:
    if type(palm_colors) == str:
        #print(type(colors))
        palm_colors = re.split(r'; |to | |-', palm_colors)

        #print(palm_colors)
        MainFruitColors_colors_lst.append([color for color in palm_colors if color in colors])
    else:
        MainFruitColors_colors_lst.append([])

# Init SKlearn MLB
mlb = MultiLabelBinarizer()

# Create dummies for color columns
df_FruitColorDescription = pd.DataFrame(
    {
        'FruitColorDescription': FruitColorDescription_colors_lst
    }, columns=['FruitColorDescription'])

s = df_FruitColorDescription['FruitColorDescription']
df_FruitColorDescription = pd.DataFrame(mlb.fit_transform(s),columns=mlb.classes_, index=df_Daniel.index)

# Multiindex columns
columns = [('FruitColorDescription', column) for column in df_FruitColorDescription.columns]
df_FruitColorDescription.columns = pd.MultiIndex.from_tuples(columns)

# Create dummies for color columns
df_MainFruitColors = pd.DataFrame(
    {
        'MainFruitColors': MainFruitColors_colors_lst
    }, columns=['MainFruitColors'])

s = df_MainFruitColors['MainFruitColors']
df_MainFruitColors = pd.DataFrame(mlb.fit_transform(s),columns=mlb.classes_, index=df_Daniel.index)

# Multiindex columns
columns = [('Fruit Colour', column) for column in df_MainFruitColors.columns]
df_MainFruitColors.columns = pd.MultiIndex.from_tuples(columns)

df_Daniels_str_non_color = df_Daniels_str[['UnderstoreyCanopy', 'FruitSizeCategorical', 'FruitShape', 'Conspicuousness']]

# df_Daniels_str_non_color.columns = pd.MultiIndex.from_tuples(
#     [
#         ('Crown', 'UnderstoreyCanopy'),
#         ('Fruit Size', 'FruitSizeCategorical'),
#         ('Fruit Shape', 'FruitShape'),
#         ('Conspicuousness', 'Conspicuousness'),
#     ]
# )

df_Daniels_str_non_color_dummies = pd.get_dummies(df_Daniels_str_non_color)
columns = []
for column in df_Daniels_str_non_color_dummies.columns:
    level0, level1 = column.split('_')
    if level0 == 'UnderstoreyCanopy':
        level0 = 'Crown'
    elif level0 == 'FruitSizeCategorical':
        level0 = 'Fruit Size'
    elif level0 == 'FruitShape':
        level0 = 'Fruit Shape'
    elif level0 == 'Conspicuousness':
        level0 = 'Conspicuousness'
    columns.append((level0, level1))
    
df_Daniels_str_non_color_dummies.columns = pd.MultiIndex.from_tuples(columns)

### JOIN ALL DATA
data_frames = [df_FruitColorDescription, df_MainFruitColors, df_Daniels_str_non_color_dummies]
df_Daniel_merged = pd.concat(data_frames, axis=1)
df_Daniel_merged

Unnamed: 0_level_0,FruitColorDescription,FruitColorDescription,FruitColorDescription,FruitColorDescription,FruitColorDescription,FruitColorDescription,FruitColorDescription,FruitColorDescription,FruitColorDescription,FruitColorDescription,...,Fruit Size,Fruit Shape,Fruit Shape,Fruit Shape,Fruit Shape,Fruit Shape,Fruit Shape,Fruit Shape,Conspicuousness,Conspicuousness
Unnamed: 0_level_1,black,blue,bluish,brown,brownish,chocolate,coral,crimson,darkgreen,green,...,small,ellipsoid,elongate,fusiform,globose,ovoid,pyramidal,rounded,conspicuous,cryptic
SpecName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Acanthophoenix crinita,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
Acanthophoenix rousselii,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
Acanthophoenix rubra,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
Acoelorrhaphe wrightii,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
Acrocomia aculeata,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wettinia quinaria,0,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,1
Wettinia radiata,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
Wettinia verruculosa,0,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,1
Wodyetia bifurcata,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


## Prediction Data

### Load Data

In [28]:
root = "../../data/processed/"
sentences_all = {}

f = open(F"{root}Sentences_Pierre.pkl", 'rb')
sentences_Pierre = pickle.load(f)
sentences_all |= sentences_Pierre 

f = open(F"{root}Sentences_Andrei.pkl", 'rb')
sentences_Andrei = pickle.load(f)
sentences_all |= sentences_Andrei 

f = open(F"{root}Sentences_Kissling.pkl", 'rb')
sentences_Kissling = pickle.load(f)
sentences_all |= sentences_Kissling 

# Drop duplicates
for species, sentences in sentences_all.items():
    sentences_all[species] = list(set(sentences))


### Match Data

In [36]:
k= 5
google_form_lst = []

for idx, (species, sentences) in enumerate(tqdm(sentences_all.items())):

    # if idx >= 1:
    #     continue
    
    df_select = [df_Andrei_dummies, df_Pierre, df_Daniel_merged]
    if species in df_Andrei_dummies.index:
        df_select = df_select[0]
    elif species in df_Pierre.index:
        df_select = df_select[1]
    elif species in df_Daniel_merged.index:
        df_select = df_select[2]
    else:
        raise ValueError

    for gt_main_trait in df_select.columns.get_level_values(0).unique():
        df_subset = df_select[df_select.index == species][gt_main_trait]
        present_traits = df_subset.loc[:, df_subset.any()].columns.values
        df_sent  = ' '.join(gt_main_trait + ' ' + present_traits)

        top_list = []

        for sentence in sentences:
            gt_sim = jaccard_similarity(sentence, df_sent)
            top_list.append((gt_sim, sentence))

        top_list.sort(reverse=True)
        top_k_list = [sentence for (_, sentence) in top_list[0:k]]

        google_form_lst.append((species, gt_main_trait, gt_main_trait, list(present_traits), *top_k_list))

100%|██████████| 647/647 [00:27<00:00, 23.15it/s]


### Create Subset

In [37]:
# Google forms
df_google = pd.DataFrame(google_form_lst, columns=['Species', 'Main Trait', 'GT Main Trait', 'GT Sub Traits', '1', '2', '3', '4', '5'])
df_google

Unnamed: 0,Species,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5
0,Acacia amythethophylla,Species,Species,[species],Acacia confusa - Stem Smaller peaks suspected ...,"Stipules spinescent, stout, brown, glossy, com...",Acacia confusa - Root Smaller peaks suspected ...,Shrub or tree to 7 m. Thorns in pairs at nodes...,"Fruit flat, dehiscent."
1,Acacia amythethophylla,plant type,plant type,"[Tree, Shrub]",Shrub or tree to 7 m. Thorns in pairs at nodes...,The branch bark is apparently 1.5% approx.,"Perennial, Not climbing, Shrub/Tree.",Flower-heads abundant golden yellow.,"Inflorescence large, terminal, much-branched, ..."
2,Acacia amythethophylla,phyllotaxis,phyllotaxis,[Phyllotaxis alternate],The branch bark is apparently 1.5% approx.,"Pinnae sometimes 20 or more pairs, with 50 or ...","Inflorescence large, terminal, much-branched, ...",Acacia confusa - Stem Smaller peaks suspected ...,Flower-heads abundant golden yellow.
3,Acacia amythethophylla,trunk and root,trunk and root,[Base of trunk straight],"Fruits flat, reddish-brown, remaining on the t...","Fruit flat, dehiscent.",Young plants 6-8 months old from seed.,Shrub or tree to 7 m. Thorns in pairs at nodes...,Flower-heads abundant golden yellow.
4,Acacia amythethophylla,latex,latex,[No exudate],Flower-heads abundant golden yellow.,"Fruit flat, dehiscent.",Young plants 6-8 months old from seed.,"Perennial, Not climbing, Shrub/Tree.","Fruits flat, reddish-brown, remaining on the t..."
...,...,...,...,...,...,...,...,...,...
11437,Wodyetia bifurcata,Fruit Colour,Fruit Colour,[red],Fruits close up.,Fruit on ground below Foxtail.,Foxtail Palm Tree: Medium.,Fruits : 2 inches long.,"Fruits are 2 inches long, egg shaped, and oran..."
11438,Wodyetia bifurcata,Crown,Crown,[canopy],Capi Nature Row Egg Antraciet.,Leaf scars are regular and closely spaced.,Leaves are very plumose with leaflets coming o...,The crownshaft of the foxtail palm is light to...,Nous contacter.
11439,Wodyetia bifurcata,Fruit Size,Fruit Size,[large],Fruit on ground below Foxtail.,Foxtail Palm Tree: Large Multi Stem.,Immature green fruit of the Foxtail Palm.,Fruit showing remains of style.,Foxtail Palm Tree: Small.
11440,Wodyetia bifurcata,Fruit Shape,Fruit Shape,[globose],Washingtonia robusta Agave americana Scroll to...,Leaflets are thin and about a foot long.,Fruits close up.,"Fruits are 2 inches long, egg shaped, and oran...",The fruits grow plentiful on the stalk.


In [38]:
df_google_subset = df_google\
                    .dropna()\
                    .sample(n=20, axis=0, random_state=123)\

df_google_subset

Unnamed: 0,Species,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5
5891,Manilkara multinervis,tendril,tendril,[Plant without tendrils],Clustered white or brown flowers amongst them.,Leaves clustered towards apices of branchlets.,Clustered white or brown flowers amongst them ...,Mature seed not known.,Young branchlets deep purplish-brown with pale...
10334,Chamaedorea simplex,Crown,Crown,[understorey],"The flowers are produced in inflorescences , t...","General Information Plants small, usually low-...",Flowers sessile or partly enclosed in a cavity...,A small understory palm with simple undivied l...,"Flowers unisexual, sessile, staminate and pist..."
10154,Bactris ptariana,Crown,Crown,[understorey],Groundcovers.,"All species have spiny leaves, the spines are ...","Seed irregularly globular, basally attached, h...","Diminutive to large, solitary or clustered, un...","Stems subterranean and very short, to erect, v..."
5548,Leptadenia pyrotechnica,flower color,flower color,[Yellow coloured flower],The roots reach up to 12 m below the surface.,Leptadenia pyrotechnica flower Flower color ye...,The flowers are fine-haired and have a minimal...,"Branches erect, slender, green when young, bro...",Flowers in small umbellate cymes.
6065,Monotes kerstingii,vein,vein,[Pinnate venation with secondary veins connect...,"Inflorescence an axillary or terminal, short p...",Fruit an ovoid to globose nut,"Seed ovoid, c. 5.5 mm long, brown.",Botany Shrub or small to medium-sized tree...,"Flowers bisexual, regular, 5-merous, pedicel s..."
2522,Commiphora pedunculata,sexuality,sexuality,[Axillary sexuality],Flowers in axillary paniculate cymes up to 5 cm.,Stamen-filaments subterete.,"Petals densely appressed-pilose outside, up to...",Leaves usually crowded at the ends of the yell...,The lateral branchlets occasionally hardening ...
3175,Entada abyssinica,Species,Species,[species],The long spikes of flowers are cream to yellow.,Sperm with apoptotic nucleus characterized by ...,"Small tree, without thorns.",Flowers creamy-white.,A seed sprouting up slowly.
2509,Commiphora pedunculata,leaf shape,leaf shape,[Unipinnate leaf with opposite leaflets],Leaves usually crowded at the ends of the yell...,The lateral branchlets occasionally hardening ...,Flowers in axillary paniculate cymes up to 5 cm.,An Commiphora pedunculata in nahilalakip ha ge...,"Flowers in dense, subglobose, hispid clusters ..."
1292,Boscia angustifolia,spine,spine,[Plant without spines],These leaves are narrow lanceolate or linear ...,"Morphology Flowers greenish, fragrant.","Flowers in dense branched terminal heads, smal...","Twigs glabrous or initially densely pubescent,...","Flowers are greenish to white, terminal or axi..."
5465,Lawsonia inermis,vein,vein,[Pinnate venation with secondary veins connect...,"Lawsonia inermis , commonly called henna, is a...",Terminal panicles of highly fragrant flowers b...,"The fruits are small, brown globose capsule, o...","The fruit is usually a capsule, the testa of a...",The leaves gradually yellow and fall during pr...


### Add Images

In [39]:
random_sample_species = df_google_subset['Species'].values

images_links = {}
for species in tqdm(random_sample_species):
    species_ = species.replace(' ', '_')
    try:
        img_url = get_wiki_main_image(species_)
        if img_url[-3:] != 'jpg':
            img_url = 'https://upload.wikimedia.org/wikipedia/commons/1/14/No_Image_Available.jpg'
        # if img_url[-3:].isupper():
        #     img_url = 'https://upload.wikimedia.org/wikipedia/commons/1/14/No_Image_Available.jpg'
        images_links[species] = img_url
    except:
        images_links[species] = 'https://upload.wikimedia.org/wikipedia/commons/1/14/No_Image_Available.jpg'

# df_google_subset = df_google[df_google['Species'].isin(random_sample_species)]
df_google_subset = df_google_subset.set_index('Species')
df_google_subset["URL"] = pd.Series(images_links)
df_google_subset.to_csv(f'{root}top_sents_all_AllSentencesAgainstTrait_Random20Subset.csv', sep='\t')#, quoting=csv.QUOTE_ALL)

100%|██████████| 20/20 [00:03<00:00,  5.53it/s]


In [40]:
df_google_subset

Unnamed: 0_level_0,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5,URL
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Manilkara multinervis,tendril,tendril,[Plant without tendrils],Clustered white or brown flowers amongst them.,Leaves clustered towards apices of branchlets.,Clustered white or brown flowers amongst them ...,Mature seed not known.,Young branchlets deep purplish-brown with pale...,https://upload.wikimedia.org/wikipedia/commons...
Chamaedorea simplex,Crown,Crown,[understorey],"The flowers are produced in inflorescences , t...","General Information Plants small, usually low-...",Flowers sessile or partly enclosed in a cavity...,A small understory palm with simple undivied l...,"Flowers unisexual, sessile, staminate and pist...",https://upload.wikimedia.org/wikipedia/commons...
Bactris ptariana,Crown,Crown,[understorey],Groundcovers.,"All species have spiny leaves, the spines are ...","Seed irregularly globular, basally attached, h...","Diminutive to large, solitary or clustered, un...","Stems subterranean and very short, to erect, v...",https://upload.wikimedia.org/wikipedia/commons...
Leptadenia pyrotechnica,flower color,flower color,[Yellow coloured flower],The roots reach up to 12 m below the surface.,Leptadenia pyrotechnica flower Flower color ye...,The flowers are fine-haired and have a minimal...,"Branches erect, slender, green when young, bro...",Flowers in small umbellate cymes.,https://upload.wikimedia.org/wikipedia/commons...
Monotes kerstingii,vein,vein,[Pinnate venation with secondary veins connect...,"Inflorescence an axillary or terminal, short p...",Fruit an ovoid to globose nut,"Seed ovoid, c. 5.5 mm long, brown.",Botany Shrub or small to medium-sized tree...,"Flowers bisexual, regular, 5-merous, pedicel s...",https://upload.wikimedia.org/wikipedia/commons...
Commiphora pedunculata,sexuality,sexuality,[Axillary sexuality],Flowers in axillary paniculate cymes up to 5 cm.,Stamen-filaments subterete.,"Petals densely appressed-pilose outside, up to...",Leaves usually crowded at the ends of the yell...,The lateral branchlets occasionally hardening ...,https://upload.wikimedia.org/wikipedia/commons...
Entada abyssinica,Species,Species,[species],The long spikes of flowers are cream to yellow.,Sperm with apoptotic nucleus characterized by ...,"Small tree, without thorns.",Flowers creamy-white.,A seed sprouting up slowly.,https://upload.wikimedia.org/wikipedia/commons...
Commiphora pedunculata,leaf shape,leaf shape,[Unipinnate leaf with opposite leaflets],Leaves usually crowded at the ends of the yell...,The lateral branchlets occasionally hardening ...,Flowers in axillary paniculate cymes up to 5 cm.,An Commiphora pedunculata in nahilalakip ha ge...,"Flowers in dense, subglobose, hispid clusters ...",https://upload.wikimedia.org/wikipedia/commons...
Boscia angustifolia,spine,spine,[Plant without spines],These leaves are narrow lanceolate or linear ...,"Morphology Flowers greenish, fragrant.","Flowers in dense branched terminal heads, smal...","Twigs glabrous or initially densely pubescent,...","Flowers are greenish to white, terminal or axi...",https://upload.wikimedia.org/wikipedia/commons...
Lawsonia inermis,vein,vein,[Pinnate venation with secondary veins connect...,"Lawsonia inermis , commonly called henna, is a...",Terminal panicles of highly fragrant flowers b...,"The fruits are small, brown globose capsule, o...","The fruit is usually a capsule, the testa of a...",The leaves gradually yellow and fall during pr...,https://upload.wikimedia.org/wikipedia/commons...
