In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn import metrics
import json
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet as wn
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
import collections
from itertools import groupby
import pickle
import re
import matplotlib.colors as mcolors
import warnings
import requests
import csv
from sklearn.preprocessing import MultiLabelBinarizer
from functools import reduce
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

### Global Variables & Functions

#### Variables

In [3]:
nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}

with open('../../data/supportive/colour_list.json', 'r') as f:
  colors = json.load(f)

with open('../../data/supportive/trait_dictionary.json', 'r') as f:
  traits_dict = json.load(f)
traits_list = list(traits_dict.keys())
traits_list += [trait.lower() for lst in list(traits_dict.values()) for trait in lst]

#### Functions

In [4]:
def corresponding_keys(val, dictionary):
    """returns the corresponding key of a single value 
    assuming the values are lists.

    Args:
        val (string): string present in the dict
        dictionary (dict): dict with lists of stings as values

    Returns:
        list: list of matching keys
    """
    # Init list
    keys = []
    # Search the dict
    for k, v in dictionary.items():
        if val in v:
            keys.append(k)
    return keys
    
def jaccard_similarity(A, B):
    """Calculates the Jaccard similarity two sets.

    Args:
        A (Set): Set A
        B (Set): Set B

    Returns:
        Integer: 0.00 - 1.00
    """

    # Create sets just in case
    A = set(A)
    B = set(B)
    
    # Get intersection of two sets
    nominator = A.intersection(B)

    # Find union of two sets
    denominator = A.union(B)

    # Take the ratio of sizes
    similarity = len(nominator)/len(denominator)

    return similarity


def similarity(groundtruth, pred):
    """Calculates the normal similarity between two sets.

    Args:
        A (Set): Set A
        B (Set): Set B

    Returns:
        Integer: 0.00 - 1.00
    """

    # Create sets just in case
    groundtruth = set(groundtruth)
    pred = set(pred)

    # Find intersection of two sets
    nominator = groundtruth.intersection(pred)

    # Find union of two sets
    denominator = groundtruth

    # Take the ratio of sizes
    similarity = len(nominator)/(len(denominator))

    return similarity

def resentesize(lst):

    sentence = ''

    # Loop every 2 items 
    for item1, item2 in zip(lst[::2], lst[1::2]):
        
        # Set verb
        verb = 'has'
        chunk = item2.split(' ')[-1]
        if chunk not in nouns or chunk in colors:
            verb = 'is'

        sentence += f'{item1.capitalize()} {verb} {item2}. '
    
    return sentence


def get_wiki_main_image(title):
    url = 'https://en.wikipedia.org/w/api.php'
    data = {
        'action' :'query',
        'format' : 'json',
        'formatversion' : 2,
        'prop' : 'pageimages|pageterms',
        'piprop' : 'original',
        'titles' : title
    }
    response = requests.get(url, data)
    json_data = json.loads(response.text)
    return json_data['query']['pages'][0]['original']['source'] if len(json_data['query']['pages']) >0 else 'Not found'


def match_trait_against_DF(trait, df, which_df):
    """Return the corresponding traits

    Args:
        trait (string): The trait used
        df (DataFrame): The inserted DataFrame
        which_df (string): Which of the three DataFrames to use. 
                           Choices are ['Pierre', 'Andrei', 'Kissling']

    Raises:
        ValueError: If a DF other then ['Pierre', 'Andrei', 'Kissling']
                    is used.

    Returns:
        _type_: _description_
    """

    # Error
    df_choices = ['Pierre', 'Andrei', 'Kissling']
    if which_df not in df_choices:
        raise ValueError(f'Choose one of {df_choices}')

    column = ''
    if which_df == 'Pierre':
        if trait == 'Life Form':
            column = 'plant type'
        elif trait == 'Trunk':
            column = 'trunk and root'
        elif trait == 'Root':
            column = 'trunk and root'
        elif trait == 'Stem':
            column = 'stem shape'
        elif trait == 'Bark Colour':
            column = 'bark color'
        elif trait == 'Blade Colour':
            column = 'blade color'
        elif trait == 'Flower Colour':
            column = 'flower color'
        else:
            column = trait.lower()

    if which_df == 'Andrei':
        if trait == 'Phyllotaxis':
            column = ['Leaf position', 'Leaf upper side', 'Leaf lower side']
        elif trait == 'Spine':
            column = 'Thorns/spines'
        elif trait == 'Thornes':
            column = 'Thorns/spines'
        elif trait == 'Fruit':
            column = 'Fruit type'
        else:
            column = trait.lower()

            
    
    return column


### Load GT Data

In [5]:
root = "../../data/external/"

#### Data Andrei

In [6]:
file_name = root + 'Dataset_Andrei.csv'
df_Andrei = pd.read_csv(file_name)

# Get Dummies to match DF Pierre
df_Andrei_dummies = pd.get_dummies(df_Andrei.iloc[:, 2:])
# Set species back
df_Andrei_dummies = df_Andrei_dummies.set_index(df_Andrei['Species'])

# Create tuple list for multi index
Andrei_multi_index = []
for top_index in df_Andrei.columns:
    for sub_index in df_Andrei_dummies.columns:
        if top_index in sub_index:

            sub_index = sub_index.split('_')[-1]
            Andrei_multi_index.append((top_index, sub_index))

# Set Mutli index
df_Andrei_dummies.columns = pd.MultiIndex.from_tuples(Andrei_multi_index)
df_Andrei_dummies

Unnamed: 0_level_0,Life form,Life form,Leaf position,Leaf position,Leaf position,Leaf position,Leaf position,Leaf composition,Leaf composition,Leaf composition,...,Aril colour,Aril colour,Aril colour,Aril colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour
Unnamed: 0_level_1,liana,tree,alternate,"alternate, opposite",opposite,"opposite, whorls of 3","opposite, whorls of 3, alternate",3 palmate,3-5 palmate,3-5 pinnate,...,orange,red,white,yellow-geen,black,brown,green,grey,white,whitish
Species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Avicennia germinans,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Metopium brownei,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Handroanthus billbergii,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
Bourreria succulenta,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bursera karsteniana,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bursera simaruba,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bursera tomentosa,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cynophalla flexuosa,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
Cynophalla hastata,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
Quadrella indica,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


#### Data Pierre

In [7]:
file_name = root + 'Dataset_Pierre.csv'
df_Pierre = pd.read_csv(file_name, header=[0, 1]) 
df_Pierre = df_Pierre.iloc[: , 1:]
df_Pierre = df_Pierre.set_index(df_Pierre['Species']['species'])
df_Pierre = df_Pierre.drop(columns=['Species', 'XXX'])
for i in df_Pierre:
    print(i)

('plant type', 'Tree')
('plant type', 'Shrub')
('plant type', 'Bush')
('plant type', 'Ficus strangler')
('plant type', 'Liana')
('plant type', 'Parasitic')
('plant type', 'Palm tree')
('plant type', 'Herbaceous')
('phyllotaxis', 'Phyllotaxis alternate')
('phyllotaxis', 'Opposite phyllotaxis')
('phyllotaxis', 'Phyllotaxis whorled')
('trunk and root', 'Base of trunk straight')
('trunk and root', 'Base of trunk flared')
('trunk and root', 'Foothills')
('trunk and root', 'Stilt roots')
('trunk and root', 'Aerial roots')
('latex', 'No exudate')
('latex', 'Clear latex')
('latex', 'Clear gum or resin')
('latex', 'White latex')
('latex', 'Yellow latex')
('latex', 'Dark gum or resin')
('crown', 'Regular density of the crown')
('crown', 'Leaves grouped at the top of the axes')
('crown', 'Horizontally spreading crowns')
('crown', 'drooping axles')
('crown', 'Stepped crown')
('crown', 'Candelabra-shaped crown')
('stem shape', 'Circular stem section')
('stem shape', 'Square stem section')
('bark', 

  df_Pierre = df_Pierre.drop(columns=['Species', 'XXX'])


#### Data Palms

In [8]:
file_name = root + 'Dataset_Kissling.txt'
df_Daniel = pd.read_csv(file_name,
                 sep='\t', encoding='Latin-1')
palm_species = df_Daniel[~df_Daniel.isnull().any(axis=1)]['SpecName'].values
df_Daniel.set_index('SpecName', inplace=True)

### RESHAPE DATA FOR PALMS

# Exclude string types
df_Daniels_int = df_Daniel.select_dtypes(exclude=[object])
df_Daniels_str = df_Daniel.select_dtypes(include=[object])
# Drop numbers
df_Daniels_semi_ints = df_Daniels_int.loc[:, df_Daniels_int.max() <= 3]
# Merge again
df_Daniel_edit = pd.merge(df_Daniels_str, df_Daniels_semi_ints, left_index=True, right_index=True)


# Real numbers:
df_Daniels_real_ints = df_Daniels_int.loc[:, df_Daniels_int.max() >= 3]
columns = [
    ("Measurement", "Maximum Stem Height"),
    ("Measurement", "Maximum Stem Diameter"),
    ("Measurement", "Maximum Leaf Number"),
    ("Measurement", "Maximum Leaf Blade Length"),
    ("Measurement", "Maximum Rachis Length"),
    ("Measurement", "Maximum Petiole Length"),
    ("Measurement", "Average Fruit Length"),
    ("Measurement", "Minimum Fruit Length"),
    ("Measurement", "Maximum Fruit Length"),
    ("Measurement", "Average Fruit Width"),
    ("Measurement", "Minimum Fruit Width"),
    ("Measurement", "Maximum Fruit Width"),
]

df_Daniels_real_ints.columns = pd.MultiIndex.from_tuples(columns)
df_Daniels_real_ints

# Get colors as lst of lsts 
FruitColorDescription_colors_lst = []

for palm_colors in df_Daniels_str['FruitColorDescription'].values:
    if type(palm_colors) == str:
        #print(type(colors))
        palm_colors = re.split(r'; |to | |-', palm_colors)

        #print(palm_colors)
        FruitColorDescription_colors_lst.append([color for color in palm_colors if color in colors])
    else:
        FruitColorDescription_colors_lst.append([])

MainFruitColors_colors_lst = []

for palm_colors in df_Daniels_str['MainFruitColors'].values:
    if type(palm_colors) == str:
        #print(type(colors))
        palm_colors = re.split(r'; |to | |-', palm_colors)

        #print(palm_colors)
        MainFruitColors_colors_lst.append([color for color in palm_colors if color in colors])
    else:
        MainFruitColors_colors_lst.append([])

# Init SKlearn MLB
mlb = MultiLabelBinarizer()

# Create dummies for color columns
df_FruitColorDescription = pd.DataFrame(
    {
        'FruitColorDescription': FruitColorDescription_colors_lst
    }, columns=['FruitColorDescription'])

s = df_FruitColorDescription['FruitColorDescription']
df_FruitColorDescription = pd.DataFrame(mlb.fit_transform(s),columns=mlb.classes_, index=df_Daniel.index)

# Multiindex columns
columns = [('Fruit Colour Description', column) for column in df_FruitColorDescription.columns]
df_FruitColorDescription.columns = pd.MultiIndex.from_tuples(columns)

# Create dummies for color columns
df_MainFruitColors = pd.DataFrame(
    {
        'MainFruitColors': MainFruitColors_colors_lst
    }, columns=['MainFruitColors'])

s = df_MainFruitColors['MainFruitColors']
df_MainFruitColors = pd.DataFrame(mlb.fit_transform(s),columns=mlb.classes_, index=df_Daniel.index)

# Multiindex columns
columns = [('Fruit Colour', column) for column in df_MainFruitColors.columns]
df_MainFruitColors.columns = pd.MultiIndex.from_tuples(columns)

df_Daniels_str_non_color = df_Daniels_str[['UnderstoreyCanopy', 'FruitSizeCategorical', 'FruitShape', 'Conspicuousness']]

# df_Daniels_str_non_color.columns = pd.MultiIndex.from_tuples(
#     [
#         ('Crown', 'UnderstoreyCanopy'),
#         ('Fruit Size', 'FruitSizeCategorical'),
#         ('Fruit Shape', 'FruitShape'),
#         ('Conspicuousness', 'Conspicuousness'),
#     ]
# )

df_Daniels_str_non_color_dummies = pd.get_dummies(df_Daniels_str_non_color)
columns = []
for column in df_Daniels_str_non_color_dummies.columns:
    level0, level1 = column.split('_')
    if level0 == 'UnderstoreyCanopy':
        level0 = 'Crown'
    elif level0 == 'FruitSizeCategorical':
        level0 = 'Fruit Size'
    elif level0 == 'FruitShape':
        level0 = 'Fruit Shape'
    elif level0 == 'Conspicuousness':
        level0 = 'Conspicuousness'
    columns.append((level0, level1))
    
df_Daniels_str_non_color_dummies.columns = pd.MultiIndex.from_tuples(columns)

### JOIN ALL DATA
data_frames = [df_FruitColorDescription, df_MainFruitColors, df_Daniels_str_non_color_dummies, df_Daniels_real_ints]
df_Daniel_merged = pd.concat(data_frames, axis=1)
df_Daniel_merged

Unnamed: 0_level_0,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,Fruit Colour Description,...,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement
Unnamed: 0_level_1,black,blue,bluish,brown,brownish,chocolate,coral,crimson,darkgreen,green,...,Maximum Leaf Number,Maximum Leaf Blade Length,Maximum Rachis Length,Maximum Petiole Length,Average Fruit Length,Minimum Fruit Length,Maximum Fruit Length,Average Fruit Width,Minimum Fruit Width,Maximum Fruit Width
SpecName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Acanthophoenix crinita,1,0,0,0,0,0,0,0,0,0,...,15.0,2.30,,,0.65,0.6,0.7,0.50,,
Acanthophoenix rousselii,1,0,0,0,0,0,0,0,0,0,...,,3.00,,,2.00,,,0.80,,
Acanthophoenix rubra,1,0,0,0,0,0,0,0,0,0,...,20.0,3.10,3.0,,1.00,,,0.70,,
Acoelorrhaphe wrightii,1,0,0,1,0,0,0,0,0,0,...,25.0,1.30,0.7,0.65,0.70,,,0.70,0.5,0.9
Acrocomia aculeata,0,0,0,0,0,0,0,0,0,1,...,30.0,3.50,2.5,,4.25,3.5,5.0,4.60,3.8,5.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wettinia quinaria,0,0,0,1,0,0,0,0,0,1,...,6.0,5.50,3.2,1.35,2.75,2.0,3.5,2.00,1.5,2.5
Wettinia radiata,0,0,0,0,0,0,0,0,0,0,...,6.0,4.33,3.4,,4.00,3.0,4.0,2.00,,
Wettinia verruculosa,0,0,0,1,0,0,0,0,0,1,...,6.0,6.00,3.7,2.40,2.50,,,1.50,,
Wodyetia bifurcata,0,0,0,0,0,0,0,0,0,0,...,10.0,3.20,2.3,1.02,5.75,5.0,6.5,4.35,2.7,6.0


## Prediction Data

### Load Data

In [9]:
root = "../../data/processed/"
sentences_all = {}

f = open(F"{root}Sentences_Pierre.pkl", 'rb')
sentences_Pierre = pickle.load(f)
sentences_all |= sentences_Pierre 

f = open(F"{root}Sentences_Andrei.pkl", 'rb')
sentences_Andrei = pickle.load(f)
sentences_all |= sentences_Andrei 

f = open(F"{root}Sentences_Kissling.pkl", 'rb')
sentences_Kissling = pickle.load(f)
sentences_all |= sentences_Kissling 

# Drop duplicates
for species, sentences in sentences_all.items():
    sentences_all[species] = list(set(sentences))


### Match Data

In [10]:
species_traits = {}

for idx, (species, sentences) in enumerate(tqdm(sentences_all.items())):

    senteces_matches = collections.defaultdict(list)
    
    # if idx >= 1:
    #     continue

    for sentence in sentences:

        sentence_list = re.split(r' |,', sentence)

        matches =  set(sentence_list) & set(traits_list)
        # print(matches)
        matches_color = set(colors) & set(sentence_list)
        if matches and not matches_color:
            #print(match, sentence)
            for match in matches:

                corresponding_traits = corresponding_keys(match.capitalize(), traits_dict)

                for corresponding_trait in corresponding_traits:
                    senteces_matches[corresponding_trait].append(sentence)

                    #print(corresponding_trait, sentence)

        elif matches_color and matches:
            for match in matches:

                corresponding_traits = corresponding_keys(match.capitalize(), traits_dict)
                for corresponding_trait in corresponding_traits:
                    if 'Colour' in corresponding_trait.split(' '):
                        senteces_matches[corresponding_trait].append(sentence)

                        #print("COLOR", corresponding_trait, sentence)

    species_traits[species] = senteces_matches

100%|██████████| 647/647 [00:00<00:00, 1091.74it/s]


In [17]:
species_traits

{'Acacia amythethophylla': defaultdict(list,
             {'Fruit': ['Fruit flat, dehiscent.'],
              'Fruit Colour': ['Fruit flat, dehiscent.'],
              'Stem': ['Acacia confusa - Stem Smaller peaks suspected id only.'],
              'Sexuality': ['Inflorescence large, terminal, much-branched, composed of orange-yellow spherical heads.'],
              'Life Form': ['Tree, shrub over 2 m.',
               'Small or medium tree 2â\x80\x9312 m. high, bark rough, fissured, grey.',
               'A tree 20â\x80\x9340 ft. high, sometimes scarcely armed.',
               'Small or medium tree 2-15 m. high, bark rough, fissured, grey.'],
              'Bark': ['The branch bark is apparently 1.5% approx.',
               'Small or medium tree 2â\x80\x9312 m. high, bark rough, fissured, grey.',
               'Small or medium tree 2-15 m. high, bark rough, fissured, grey.'],
              'Bark Colour': ['The branch bark is apparently 1.5% approx.',
               'Small or med

In [11]:
k = 5
google_forms_lst = []

for idx, species in enumerate(tqdm(species_traits.keys())):

    # if idx >= 1:
    #     continue

    df_name = ''
    df_select = [df_Andrei_dummies, df_Pierre, df_Daniel_merged]
    if species in df_Andrei_dummies.index:
        df_name = 'Andrei'
        df_select = df_select[0]
    elif species in df_Pierre.index:
        df_name = 'Pierre'
        df_select = df_select[1]
    elif species in df_Daniel_merged.index:
        df_name = 'Kissling'
        df_select = df_select[2]
    else:
        print(species, 'Missing?', df_name)

    for trait in species_traits[species]:
        
        # Init 
        top_k_list = []
        # Match traits
        df_trait = match_trait_against_DF(trait, df_select, df_name)

        # Get present subtraits
        try:
            df_subset = df_select[df_select.index == species][df_trait]
            present_traits = df_subset.loc[:, df_subset.any()].columns

        # Skip traits without GT
        except:
            continue
        
        # Join items
        if type(df_trait) == str:
            df_sent  = ' '.join(df_trait + ' ' + present_traits)
        else:
            # Mutiple sub traits
            df_sent = ' '.join(df_trait + [item for sublist in present_traits for item in sublist])

        # Match against Sentences
        for sentence in species_traits[species][trait]:

            j_sim = jaccard_similarity(df_sent, sentence)
            # Append to list
            top_k_list.append((j_sim, sentence))

        # Sort list and keep top K
        top_k_list.sort(reverse=True)

        #print(trait, top_k_list)
        top_sentences = []
        for (_, sentence) in top_k_list:
           
            # Skip duplicated and remove jsim
            if sentence not in top_sentences:
                top_sentences.append(sentence)

        # Google Forms
        #google_forms_lst.append([species, trait, df_trait, present_traits.values])
        best_sentences = []

        # Extend lists below 5
        if len(top_sentences) < k:
            empties = [None] * (k - len(top_sentences))
            for empty in empties:
                top_sentences.append([empty])

        for i, top_sentence in enumerate(top_sentences):
            if i >= k:
                continue
            # reconstructed_sent = resentesize(top_sentence)
            if top_sentence:
                best_sentences.append(top_sentence)
            else:
                best_sentences.append(np.NaN)

        #google_forms_lst.append(best_sentences)
        google_forms_lst.append((species, trait, df_trait, list(present_traits.values), *best_sentences))
    


100%|██████████| 647/647 [00:06<00:00, 103.37it/s]


In [12]:
df_trait 

''

### Create Subset

In [13]:
# Google forms
df_google = pd.DataFrame(google_forms_lst, columns=['Species', 'Main Trait', 'GT Main Trait', 'GT Sub Traits', '1', '2', '3', '4', '5'])
df_google

Unnamed: 0,Species,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5
0,Acacia amythethophylla,Fruit,fruit,[Pod],"Fruit flat, dehiscent.",[None],[None],[None],[None]
1,Acacia amythethophylla,Stem,stem shape,[Circular stem section],Acacia confusa - Stem Smaller peaks suspected ...,[None],[None],[None],[None]
2,Acacia amythethophylla,Sexuality,sexuality,[Terminal sexuality],"Inflorescence large, terminal, much-branched, ...",[None],[None],[None],[None]
3,Acacia amythethophylla,Life Form,plant type,"[Tree, Shrub]","Small or medium tree 2-15 m. high, bark rough,...","Small or medium tree 2â12 m. high, bark roug...","Tree, shrub over 2 m.","A tree 20â40 ft. high, sometimes scarcely ar...",[None]
4,Acacia amythethophylla,Bark,bark,"[Presence of spines or prickles on the trunk, ...",The branch bark is apparently 1.5% approx.,"Small or medium tree 2-15 m. high, bark rough,...","Small or medium tree 2â12 m. high, bark roug...",[None],[None]
...,...,...,...,...,...,...,...,...,...
3541,Jacquinia arborea,Fruit,Fruit type,[berry],Calyx lobes very broadly ovate or suborbicular...,[None],[None],[None],[None]
3542,Jacquinia arborea,Spine,Thorns/spines,[absent],"Leaves pseudoverticillate, sometimes indistinc...",[None],[None],[None],[None]
3543,Guaiacum officinale,Fruit,Fruit type,[capsule],Fruit is a bilocular and cordate capsule which...,"Fruit a capsule, slightly stalked, five-angled...","Fruits: Dehiscent capsule, 0.75"" long.",[None],[None]
3544,Guaiacum sanctum,Fruit,Fruit type,[capsule],"Fruit covering: fleshy, 5-winged capsule.",Fruit length: Â½ inch.,"Showy, Evergreen, Fruit & Berries.",[None],[None]


In [14]:
df_google_subset = df_google\
                    .dropna()\
                    .sample(n=20, axis=0, random_state=333)\

df_google_subset

Unnamed: 0,Species,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5
1708,Ficus polita,Leaf Apex,leaf apex,[Leaf apex acuminate],"Leaves spirally arranged, lamina ovate to Â± e...","Leaves in spirals, lamina coriaceous or subcor...",[None],[None],[None]
2494,Oncoba spinosa,Leaf Apex,leaf apex,[Leaf apex acuminate],"The leaves are simple, ovate-elliptic in form ...","Glabrous shrub or small tree, to 5 m. Branches...",The leaf-shape is ovate to elliptic with a tap...,[None],[None]
265,Agave sisalana,Sexuality,sexuality,[],The terminal spines are removed before or afte...,[None],[None],[None],[None]
2853,Rotula aquatica,Life Form,plant type,"[Shrub, Bush]",A stiff-branched shrub 3-6 ft. high.,Takad is a shrub or bush that grows to a heigh...,[None],[None],[None]
2043,Hymenocardia acida,Bark,bark,"[Smooth bark, Finely fissured bark, Bark peeli...",Branchlets becoming rusty-powdery when the bar...,The bitter stem bark is slightly astringent an...,[None],[None],[None]
2429,Margaritaria discoidea,Life Form,plant type,"[Tree, Shrub, Bush]",A medium to tall tree in forest and riverine s...,"A many-stemmed, densely branched spreading to ...",A much-branched shrub 1â5 m. tall or tree up...,"A large deciduous shrub or tree, to 100 ft. high.",[None]
3414,Ximenia americana,Life Form,plant type,"[Tree, Shrub, Bush]",Its branchlets are purple-red with a waxy bloo...,Ximenia americana is a semiscandent plant that...,Ximenia americana is a semi-scandent shrub or ...,[None],[None]
2973,Securidaca longipedunculata,Bark Colour,bark color,"[Slice in light bark, Slice through brown bark...","It has pale grey, smooth bark with leaves that...",[None],[None],[None],[None]
2305,Leucaena leucocephala,Inflorescences,inflorescences,[Glomerule of flowers or inflorescences],The inflorescence is usually borne on actively...,"Flowers arranged on compact globose heads, the...",[None],[None],[None]
1574,Ficus abutilifolia,Leaf Base,leaf base,[Leaf base cordate],Leaves: Altemate in spirals with petiole 2-18 ...,The leaves are broadly ovate and heart-shaped ...,[None],[None],[None]


### Add Images

In [15]:
random_sample_species = df_google_subset['Species'].values

images_links = {}
for species in tqdm(random_sample_species):
    species_ = species.replace(' ', '_')
    try:
        img_url = get_wiki_main_image(species_)
        if img_url[-3:] != 'jpg':
            img_url = 'https://upload.wikimedia.org/wikipedia/commons/1/14/No_Image_Available.jpg'
        # if img_url[-3:].isupper():
        #     img_url = 'https://upload.wikimedia.org/wikipedia/commons/1/14/No_Image_Available.jpg'
        images_links[species] = img_url
    except:
        images_links[species] = 'https://upload.wikimedia.org/wikipedia/commons/1/14/No_Image_Available.jpg'

# df_google_subset = df_google[df_google['Species'].isin(random_sample_species)]
df_google_subset = df_google_subset.set_index('Species')
df_google_subset["URL"] = pd.Series(images_links)
# df_google_subset.to_csv(f'{root}top_sents_all_AllSentencesAgainstTrait_Random20Subset.csv', sep='\t')#, quoting=csv.QUOTE_ALL)

100%|██████████| 20/20 [00:03<00:00,  5.38it/s]


In [16]:
df_google_subset

Unnamed: 0_level_0,Main Trait,GT Main Trait,GT Sub Traits,1,2,3,4,5,URL
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ficus polita,Leaf Apex,leaf apex,[Leaf apex acuminate],"Leaves spirally arranged, lamina ovate to Â± e...","Leaves in spirals, lamina coriaceous or subcor...",[None],[None],[None],https://upload.wikimedia.org/wikipedia/commons...
Oncoba spinosa,Leaf Apex,leaf apex,[Leaf apex acuminate],"The leaves are simple, ovate-elliptic in form ...","Glabrous shrub or small tree, to 5 m. Branches...",The leaf-shape is ovate to elliptic with a tap...,[None],[None],https://upload.wikimedia.org/wikipedia/commons...
Agave sisalana,Sexuality,sexuality,[],The terminal spines are removed before or afte...,[None],[None],[None],[None],https://upload.wikimedia.org/wikipedia/commons...
Rotula aquatica,Life Form,plant type,"[Shrub, Bush]",A stiff-branched shrub 3-6 ft. high.,Takad is a shrub or bush that grows to a heigh...,[None],[None],[None],https://upload.wikimedia.org/wikipedia/commons...
Hymenocardia acida,Bark,bark,"[Smooth bark, Finely fissured bark, Bark peeli...",Branchlets becoming rusty-powdery when the bar...,The bitter stem bark is slightly astringent an...,[None],[None],[None],https://upload.wikimedia.org/wikipedia/commons...
Margaritaria discoidea,Life Form,plant type,"[Tree, Shrub, Bush]",A medium to tall tree in forest and riverine s...,"A many-stemmed, densely branched spreading to ...",A much-branched shrub 1â5 m. tall or tree up...,"A large deciduous shrub or tree, to 100 ft. high.",[None],https://upload.wikimedia.org/wikipedia/commons...
Ximenia americana,Life Form,plant type,"[Tree, Shrub, Bush]",Its branchlets are purple-red with a waxy bloo...,Ximenia americana is a semiscandent plant that...,Ximenia americana is a semi-scandent shrub or ...,[None],[None],https://upload.wikimedia.org/wikipedia/commons...
Securidaca longipedunculata,Bark Colour,bark color,"[Slice in light bark, Slice through brown bark...","It has pale grey, smooth bark with leaves that...",[None],[None],[None],[None],https://upload.wikimedia.org/wikipedia/commons...
Leucaena leucocephala,Inflorescences,inflorescences,[Glomerule of flowers or inflorescences],The inflorescence is usually borne on actively...,"Flowers arranged on compact globose heads, the...",[None],[None],[None],https://upload.wikimedia.org/wikipedia/commons...
Ficus abutilifolia,Leaf Base,leaf base,[Leaf base cordate],Leaves: Altemate in spirals with petiole 2-18 ...,The leaves are broadly ovate and heart-shaped ...,[None],[None],[None],https://upload.wikimedia.org/wikipedia/commons...
