In [1]:
import spacy
import pickle
from spacy import displacy
nlp = spacy.load('en_core_web_trf')
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm as tqdm_notebook
import collections
import re
import pandas as pd
import numpy as np
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline

In [2]:
colors = list(mcolors.CSS4_COLORS.keys())

### Get Plant Glossary List

In [3]:
# URL
URL = 'https://en.wikipedia.org/wiki/Glossary_of_plant_morphology'
# Get the page
page = requests.get(URL, timeout=5)
soup = BeautifulSoup(page.content, "lxml", from_encoding="iso-8859-1")   

glossary = collections.defaultdict(list)
# Find all H4 
for chapter in soup.find_all('h4')[0:]:
    # Clean
    chapter_text = chapter.text.rstrip('[edit]')
    # Find all siblings
    for sibling in chapter.find_next_siblings():
        # Find the parent
        for parent in sibling.find_previous_sibling('h4'):
            # Only append if correspond to current chapter
            if parent.text == chapter_text:
                if 'â' in sibling.text:
                    for tag in sibling.find_all('li'):
                        candidates = tag.text.split('â')[0]
                        candidates = candidates.split('/')
                        for candidate in candidates:
                            glossary[chapter_text.lower()].append(candidate.strip().lower())

In [4]:
glossary.keys()

dict_keys(['morphology', 'roots', 'stems', 'buds', 'leaves', 'basic flower parts', 'inflorescences', 'insertion of floral parts', 'union of flower parts', 'flower sexuality and presence of floral parts', 'flower symmetry', 'terms for fruits', 'fruit types', 'pteridophytes', 'bryophytes'])

In [5]:
glossary['leaves'] += [
    'glume',
    'surface',
    'margin'
]
glossary['basic flower parts'] +=[
    'floret'
]
glossary['inflorescences'] += [
    'spikelets',
    'lemma',
    'racemes',
    'axis'
]
glossary['leaves'] += [
    'rhachilla'
]

### Load the data

In [6]:
data = pickle.load(open('../data/description/04_TRAIN_0000000-0014557_PLANTS.pkl', 'rb'))

### Define Functions

### Extract data

In [249]:
descriptions = collections.defaultdict(list)

compounds = [
    'upper', 'lower',
    'dorsal', 'central',
]

for species in tqdm_notebook(list(data.keys())[0:1]):
    for idx, text in enumerate(data[species][0:5]):
        doc = nlp(text.lower().capitalize())
        
        for token in doc:            
            # Skip compounds
            if token.dep_ == 'compound':
                continue
            # Only (proper) NOUNS
            if token.pos_ not in ['NOUN', 'PROPN']:
                continue
            #else:
            #    print(token.lemma_.lower())
            #    print(token.lemma_.lower() in glossary['inflorescences'])
            for headlist, sublist in glossary.items():

                if token.lemma_.lower() in sublist or token.text.lower() in sublist:
                    
                    descriptions[species].append((species, 'base part', headlist))
                    
                    ### RECONSTRUCT COMPOUNDS ###
                    if token.i == 0:
                        trait = token
                    elif doc[token.i - 3].dep_ == 'compound':
                        trait = doc[token.i - 3: token.i + 1]
                    elif doc[token.i - 3].dep_ in compounds:
                        trait = doc[token.i - 3: token.i + 1]
                    elif doc[token.i - 2].dep_ == 'compound':
                        trait = doc[token.i - 2: token.i + 1]
                    elif doc[token.i - 1].dep_ == 'compound':
                        trait = doc[token.i - 1: token.i + 1]
                    elif doc[token.i - 1].text.lower() in compounds:
                        trait = doc[token.i - 1: token.i + 1]
                    else:
                        trait = token
                    descriptions[species].append((headlist, 'part', trait.lemma_))
                    
                    ### NORMAL SUBJECT TOKENS ###
                    if token.dep_ in ['nsubj', 'nsubjpass']:
                        parent = next(token.ancestors)
                        # Adjectival complement
                        if parent.lemma_ == 'be':
                            for child in parent.children:
                                if child.dep_ == 'acomp':
                                    if child.lemma_.lower() in ['long', 'wide']:
                                        descriptions[species].append((trait.lemma_, 'be', child.lemma_.lower()))
                                        tree = [t.text for t in list(child.subtree)[:-1]]
                                        tree = ' '.join(tree)
                                        descriptions[species].append((child.lemma_, 'be', tree))
                                    else:
                                        tree = [t.text for t in list(child.children) if t.dep_ == 'npadvmod']
                                        tree = ' '.join(tree) + ' ' +  child.text
                                        descriptions[species].append((trait.lemma_, 'be', tree))
                            #print('Do Something', idx)
                        # VERB ADJECTIVES
                        else:
                            descriptions[species].append((trait.lemma_, 'be', parent.text))
                            
                    ### ADJECTIVES TOKENS ###
                    for child in token.children:
                        ## NORMAL ADJECTIVES ##
                        if child.pos_ == 'ADJ':
                            if child.text.lower() in compounds:
                                continue
                            descriptions[species].append((trait.lemma_, 'be', child.lemma_))
                            for grandchild in child.children:
                                if grandchild.dep_ == 'nummod':
                                    tree = [t.lemma_ for t in list(grandchild.subtree)]
                                    tree = ' '.join(tree)
                                    descriptions[species].append((child.lemma_, 'NUM', tree))
                                elif grandchild.dep_ == 'npadvmod':
                                    tree = [t.lemma_ for t in list(grandchild.subtree)]
                                    tree = ' '.join(tree)
                                    descriptions[species].append((child.lemma_, 'NUM', tree))
                            ## conjunction ##
                            for grandchild in child.subtree:
                                #print(grandchild.dep_)
                                if grandchild.dep_ == 'conj' and grandchild.pos_ == 'ADJ':
                                    descriptions[species].append((trait.lemma_, 'be', grandchild.lemma_))
                        ## APPOS ADJECTIVES ##
                        if child.dep_ == 'appos' and child.pos_ == 'NOUN':
                            #print(333)
                            if child.text in colors:
                                descriptions[species].append((trait.lemma_, 'be', child.lemma_))
                            else:
                                descriptions[species].append((trait.lemma_, 'has', child.lemma_))
                            for grandchild in child.children:
                                if grandchild.dep_ == 'amod':
                                    descriptions[species].append((child.lemma_, 'be', grandchild.lemma_))                                
                                if grandchild.dep_ == 'nummod':
                                    tree = [t.lemma_ for t in list(grandchild.subtree)]
                                    tree = ' '.join(tree)
                                    descriptions[species].append((child.lemma_, 'NUM', tree))
                                elif grandchild.dep_ == 'npadvmod':
                                    tree = [t.lemma_ for t in list(grandchild.subtree)]
                                    tree = ' '.join(tree)
                                for grandgrandchild in grandchild.children:
                                    if grandgrandchild.dep_ == 'nummod':
                                        tree = [t.lemma_ for t in list(grandgrandchild.subtree)]
                                        tree = ' '.join(tree)
                                        descriptions[species].append((grandchild.lemma_, 'NUM', tree))
                                    elif grandgrandchild.dep_ == 'npadvmod':
                                        tree = [t.lemma_ for t in list(grandgrandchild.subtree)]
                                        tree = ' '.join(tree)
                                        descriptions[species].append((grandchild.lemma_, 'NUM', tree))
                    ## PARENT ADJECTIVES ##
                    if token.dep_ != 'ROOT':
                        parent = next(token.ancestors)
                        if parent.pos_ == 'ADJ':
                            descriptions[species].append((trait.lemma_, 'be', parent.lemma_))

                        
                


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
#with open('../../data/processed/PoS_plants_100.pkl', 'wb') as f:
#    pickle.dump(descriptions, f)      

In [280]:
descriptions

defaultdict(list,
            {'hiteochloa semitonsa': [('hiteochloa semitonsa',
               'base part',
               'stems'),
              ('stems', 'part', 'rhachilla internode'),
              ('rhachilla internode', 'be', 'elongated'),
              ('hiteochloa semitonsa', 'base part', 'basic flower parts'),
              ('basic flower parts', 'part', 'floret'),
              ('floret', 'be', 'proximal'),
              ('floret', 'be', 'fertile'),
              ('hiteochloa semitonsa', 'base part', 'leaves'),
              ('leaves', 'part', 'ligule'),
              ('ligule', 'has', 'fringe'),
              ('fringe', 'be', 'long'),
              ('long', 'NUM', '0.5 mm'),
              ('hiteochloa semitonsa', 'base part', 'inflorescences'),
              ('inflorescences', 'part', 'spikelet'),
              ('spikelet', 'be', 'fertile'),
              ('hiteochloa semitonsa', 'base part', 'leaves'),
              ('leaves', 'part', 'leaf-blade'),
              ('leaf-b

In [281]:
doc

Leaf-blades flat, or conduplicate, 6-11 cm long, 3-4 mm wide.

In [282]:
displacy.render(doc)

In [250]:
source   = []
relation = []
target   = []

for bird in descriptions.keys():
    for triple in descriptions[bird]:
        if not all(triple):
            continue

        source.append(triple[0])
        relation.append(triple[1])
        target.append(triple[2])
    
kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relation})

In [251]:
'''
G=nx.from_pandas_edgelist(kg_df, "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(25, 25))
labels = dict(zip(list(zip(kg_df.source, kg_df.target)),
                  kg_df['edge'].tolist()))
pos = nx.spring_layout(G, k = 0.3*1/np.sqrt(len(G.nodes())))

nx.draw(G,
        #node_color=color_map,
        with_labels=True, 
        node_size=2500, 
        pos=graphviz_layout(G, prog="dot"), )
nx.draw_networkx_edge_labels(G,
                             pos=graphviz_layout(G, prog="dot"), 
                             edge_labels=labels,
                             font_color='red', )
#plt.savefig('plot.pdf', format='pdf', dpi=1200, bbox_inches='tight')
plt.show()
'''

'\nG=nx.from_pandas_edgelist(kg_df, "source", "target", \n                          edge_attr=True, create_using=nx.MultiDiGraph())\n\nplt.figure(figsize=(25, 25))\nlabels = dict(zip(list(zip(kg_df.source, kg_df.target)),\n                  kg_df[\'edge\'].tolist()))\npos = nx.spring_layout(G, k = 0.3*1/np.sqrt(len(G.nodes())))\n\nnx.draw(G,\n        #node_color=color_map,\n        with_labels=True, \n        node_size=2500, \n        pos=graphviz_layout(G, prog="dot"), )\nnx.draw_networkx_edge_labels(G,\n                             pos=graphviz_layout(G, prog="dot"), \n                             edge_labels=labels,\n                             font_color=\'red\', )\n#plt.savefig(\'plot.pdf\', format=\'pdf\', dpi=1200, bbox_inches=\'tight\')\nplt.show()\n'

In [285]:
species = list(data.keys())
baseparts = list(glossary.keys())
parts = list(glossary.values())

G=nx.from_pandas_edgelist(kg_df, "source", "target", 
                          edge_attr=True, create_using=nx.MultiGraph())

plt.figure(figsize=(10, 10))
_labels = dict(zip(list(zip(kg_df.source, kg_df.target)),
                  kg_df['edge'].tolist()))

node_size = {}
for (subject, object_), edge in _labels.items():
    if subject in species:
        node_size[subject] = 10/3
    elif subject in baseparts:
        node_size[subject] = 8/3
    elif subject in parts:
        node_size[subject] = 4/3
    else:
        node_size[subject] = 4/3
    if object_ in species:
        node_size[object_] = 10/3
    elif object_ in baseparts:
        node_size[object_] = 8/3
    elif object_ in parts:
        node_size[object_] = 4/3
    else:
        node_size[object_] = 4/3
        
labels = {node : node for idx, node in enumerate(G)}
edge_labels = {}
for key in labels.keys():
    if key in baseparts:
        edge_labels[key] = 'basepart'
    else:
        edge_labels[key] = 'other'

Graph(G, node_layout='dot', edge_layout='curved', node_size=node_size, node_labels=labels, edge_labels=_labels, node_label_offset=(0.00, -0.04),
      edge_width=0.4, arrows=True,
      node_label_fontdict=dict(size=8),
      edge_label_fontdict=dict(size=6))


#plt.savefig('plot.pdf', format='pdf', dpi=1200, bbox_inches='tight')
plt.show()

ValueError: too many values to unpack (expected 2)

<Figure size 720x720 with 0 Axes>

In [277]:
labels

{'hiteochloa semitonsa': 'hiteochloa semitonsa0',
 'stems': 'stems1',
 'rhachilla internode': 'rhachilla internode2',
 'elongated': 'elongated3',
 'basic flower parts': 'basic flower parts4',
 'floret': 'floret5',
 'proximal': 'proximal6',
 'fertile': 'fertile7',
 'leaves': 'leaves8',
 'ligule': 'ligule9',
 'fringe': 'fringe10',
 'long': 'long11',
 '0.5 mm': '0.5 mm12',
 'inflorescences': 'inflorescences13',
 'spikelet': 'spikelet14',
 'leaf-blade': 'leaf-blade15',
 'flat': 'flat16',
 'conduplicate': 'conduplicate17',
 'wide': 'wide18',
 '6 - 11 cm': '6 - 11 cm19',
 '3 - 4 mm': '3 - 4 mm20'}

In [286]:
G['spikelet']

AdjacencyView({'inflorescences': {0: {'edge': 'part'}}, 'fertile': {0: {'edge': 'be'}}})

SyntaxError: invalid syntax (804705552.py, line 1)