In [1]:
import spacy
import pickle
from spacy import displacy
nlp = spacy.load('en_core_web_trf')
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm as tqdm_notebook
import collections
import re
import pandas as pd
import math
import random
import numpy as np
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from netgraph import Graph
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline

In [2]:
# Color checker
colors = list(mcolors.CSS4_COLORS.keys())

## Create a glossary list

In [3]:
# URL
URL = 'https://en.wikipedia.org/wiki/Glossary_of_plant_morphology'
# Get the page
page = requests.get(URL, timeout=5)
soup = BeautifulSoup(page.content, "lxml", from_encoding="iso-8859-1")   

glossary = collections.defaultdict(list)
# Find all H4 
for chapter in soup.find_all('h4')[0:]:
    # Clean
    chapter_text = chapter.text.rstrip('[edit]')
    # Find all siblings
    for sibling in chapter.find_next_siblings():
        # Find the parent
        for parent in sibling.find_previous_sibling('h4'):
            # Only append if correspond to current chapter
            if parent.text == chapter_text:
                if 'â' in sibling.text:
                    for tag in sibling.find_all('li'):
                        candidates = tag.text.split('â')[0]
                        candidates = candidates.split('/')
                        for candidate in candidates:
                            glossary[chapter_text.lower()].append(candidate.strip().lower())

In [4]:
glossary.keys()

dict_keys(['morphology', 'roots', 'stems', 'buds', 'leaves', 'basic flower parts', 'inflorescences', 'insertion of floral parts', 'union of flower parts', 'flower sexuality and presence of floral parts', 'flower symmetry', 'terms for fruits', 'fruit types', 'pteridophytes', 'bryophytes'])

In [5]:
#glossary['fruit types'] += [
#    'fruit',
#]

glossary['leaves'] += [
    'glume',
    'surface',
    'margin'
]

glossary['basic flower parts'] +=[
    'floret',
    'awn',
    
]
glossary['inflorescences'] += [
    'spikelets',
    'lemma',
    'racemes',
    'axis',
]
glossary['leaves'] += [
    'rhachilla'
]

glossary['other'] += [
    'apex',
    'culm',
    'tube',
    'palea',
    'crown',
    'canopy',

]

### Load the data

In [6]:
DATA = pickle.load(open('../../data/description/04_TRAIN_0000000-0014557_PLANTS.pkl', 'rb'))

In [7]:
descriptions_id = collections.defaultdict(list)
descriptions = collections.defaultdict(list)

compounds = [
    'fertile', 'sterile',
    'male', 'female', 'bisexual',
    'basal', 'developed', 
    'primary', 'secondary', 'main',
    'upper', 'lower', 'greater', 'dorsal', 'alternate', 'lesser', 'apex', 'outer',
    'central', 'outermost', 'outer', 'inner', 'uppermost', 'median', 'dorsal', 'central', 'lateral',
]

def compound_reconstructor(token, doc):
    if token.i == 0:
        trait = token
    elif doc[token.i - 3].dep_ == 'compound':
        trait = doc[token.i - 3: token.i + 1]
    elif doc[token.i - 3].text.lower() in compounds or doc[token.i - 3].lemma_.lower() in compounds:
        trait = doc[token.i - 3: token.i + 1]
    elif doc[token.i - 2].dep_ == 'compound':
        trait = doc[token.i - 2: token.i + 1]
    elif doc[token.i - 2].text.lower() in compounds or doc[token.i - 3].lemma_.lower() in compounds:
        trait = doc[token.i - 2: token.i + 1]
    elif doc[token.i - 1].dep_ == 'compound':
        trait = doc[token.i - 1: token.i + 1]
    elif doc[token.i - 1].text.lower() in compounds or doc[token.i - 3].lemma_.lower() in compounds:
        trait = doc[token.i - 1: token.i + 1]
    else:
        trait = token   
    return trait.lemma_

def check_existance(t, doc):
    single = next((key for key, value in glossary.items() if t.lemma_.lower() in value), None)
    multi = next((key for key, value in glossary.items() if t.text.lower() in value), None)
    if single:
        return single
    elif multi:
        return multi
    else:
        return None

def extract_advmod(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'advmod':
            return child
        
def extract_nummod(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'nummod':
            return child

def extract_conjunction(t, doc):
    """HELPER"""
    if t.dep_ == 'conj' and t.pos_ == 'ADJ':
        return t 

def extract_amod(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'amod':
            return child
        
def extract_measurements(t, doc):
    """HELPER"""
    obj = None
    measurements = ['wide', 'long']
    if t.text in measurements or t.lemma_ in measurements:
        obj = doc[t.left_edge.i : t.right_edge.i + 1]
    return obj

def extract_prepositions(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'prep':
            return doc[child.left_edge.i : child.right_edge.i + 1]

def define_position(x, y, doc):
    """HELPER"""
    if len(x.text.split()) > 1:
        return f'{y.text} {x.text}'
    else:
        try:
            if x.i > y.i:
                return doc[y.i : x.i + 1]
            else:
                return doc[x.i : y.i + 1]
        except:
            return f'{y.text} {x.text}'

def extract_noun_verbs(t, doc):
    relations = []
    objects = []
    if t.dep_ not in ['ROOT', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
        return '', ''
    parent = next((parent for parent in t.ancestors), None)
    if parent and parent.pos_ == 'VERB':
        prep = extract_verb_prep(parent, doc)
        dobj = extract_verb_dobj(parent, doc)
        oprd = extract_verb_orpd(parent, doc)
        agnt = extract_verb_agnt(parent, doc)
        nmod = extract_verb_nmod(parent, doc)
        advm = extract_verb_advm(parent, doc)
        print(advm)


def extract_verb_advm(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'advmod':
            return child

def extract_verb_nmod(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'nummod':
            return doc[child.left_edge.i : child.right_edge.i + 1] 
        
def extract_verb_prep(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'prep':
            return child    

def extract_verb_pobj(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'pobj' or child.dep_ == 'pcomp' or child.dep_ == 'prep':
            return child
        
def extract_verb_dobj(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'dobj':
            return child
        
def extract_verb_orpd(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'oprd':
            return child    
        
def extract_verb_agnt(t, doc):
    """HELPER"""
    for child in t.children:
        if child.dep_ == 'agent':
            return child 
        
def extract_nounandverb_nummods(t, doc):
    obj = None
    for child in t.children:
        if child.dep_ == 'nummod':
            obj = doc[child.left_edge.i : child.right_edge.i + 1]
            return obj   

def extract_dnummod(t, doc):
    obj = extract_nounandverb_nummods(t, doc)
    if obj:
        return obj.text
    else:
        return None

def extract_noun_adjectives(t, doc):
    adjs = []
    adjectives = []
    for child in t.children:
        if child.dep_ == 'appos':
            continue
        if child.pos_ =='ADJ' or child.tag_ == 'VBN' and child.dep_ in ['conj', 'amod']:
            
            
            advmod = extract_advmod(child, doc)
            measurement = extract_measurements(child, doc)
            prep = extract_prepositions(child, doc)
            nummod = extract_nummod(child, doc)
            amod = extract_amod(child, doc)
           
            
            if child.lemma_.lower() in compounds:
                continue
            if child.text.lower() in compounds:
                continue
            elif amod:
                obj = define_position(amod, child, doc)
                adjs.append(obj)
            elif advmod:
                #obj = doc[advmod.i : child.i + 1]
                obj = define_position(advmod, child, doc)
                adjs.append(obj)
            elif measurement:
                obj = measurement
                adjs.append(obj)
            elif prep:
                obj = define_position(prep, child, doc)
                adjs.append(obj)
            elif nummod:
                obj = define_position(nummod, child, doc)
                adjs.append(obj)                
            else:
                obj = child
                adjs.append(obj)
            for grandchild in child.subtree:
                conj = extract_conjunction(grandchild, doc)
                if conj:
                    advmod = extract_advmod(conj, doc)
                    prep = extract_prepositions(conj, doc)
                    nummod = extract_nummod(conj, doc)
                    if advmod:
                        obj = define_position(advmod, conj, doc)
                        adjs.append(obj)
                    elif prep:
                        obj = define_position(prep, conj, doc)
                        adjs.append(obj)
                    elif nummod:
                        obj = define_position(nummod, conj, doc)
                        adjs.append(obj)
                    else:
                        obj = conj
                        adjs.append(obj)            
    for adj in adjs:
        try:
            if adj.pos_ == 'VERB':
                adj_text = adj.text.lower()
            elif adj.root.pos_ == 'VERB':
                adj_text = adj.text.lower()
            else:
                adj_text = adj.lemma_.lower()
        except:
                if type(adj) == str:
                    adj_text = adj.lower()
                else:
                    adj_text = adj.text.lower()
        for adj_split in adj_text.split(','):
            adjectives.append(adj_split.strip())
    return adjectives

def extract_noun_appos(t, doc):
    appos = []
    for child in t.children:
        if child.dep_ == 'appos':
            obj = doc[child.left_edge.i : child.right_edge.i + 1].text.lower()
            for obj_split in obj.split(','):
                appos.append(obj_split.strip())
    return appos

def check_species(t, species, doc):
    if t.text in species.split():
        return True
    else:
        return False
    
def extract_auxillary(t, doc):
     parent = next((parent for parent in t.ancestors if parent.pos_ == 'AUX'), None)
        
        

In [17]:
descriptions = collections.defaultdict(list)

# For plotting purposes
parts = []
traits = []
for species in tqdm_notebook(list(DATA.keys())[1:2]):
    for idx, text in enumerate(DATA[species][1:2]):


#for species in tqdm_notebook(COMMON_species[0:]):
#for species in tqdm_notebook(list(descriptions_text.keys())):
#    for idx, text in enumerate(TEST[species][0:]):
        # Clean the text
        text = re.sub(r'(?<!\d)\.(?!\d)', ' ', text)
        text = re.sub(r'\s×\s', ' times ', text)
        text = f'{text.strip()}.'
        # Reset variables
        part=trait=rel=obj=adjectives = None 
        # NLP
        doc = nlp(text)
        # Init
        descriptions[species, idx] = []
        triples = []
        # Loop over tokens
        for t in doc:
            if t.dep_ == 'compound':
                continue
            ### SUBJECTS ###    
            if t.pos_ == 'NOUN' or t.pos_ == 'PROPN':
                # Check existance of parts
                part = check_existance(t, doc)
                if part:
                    # Reconstruct Compounds & Append
                    trait = compound_reconstructor(t, doc)
                    triples.append((species, 'has main part', part))
                    triples.append((part, 'has sub part', trait))

                    
                    # Plotting
                    #parts.append(part)
                    #traits.append(trait)
                    # NOUN ADJECTIVES
                    adjectives = extract_noun_adjectives(t, doc)
                    for adjective in adjectives:
                        triples.append((trait, 'is', adjective))
                    # NOUN VERBS
                    verbs_rel, verbs_obj = extract_noun_verbs(t, doc)
                    for rel, obj in zip(verbs_rel, verbs_obj):
                        triples.append((trait, rel, obj))
                    # NOUN APPOSITIONAL MODIFIER
                    adjectives = extract_noun_appos(t, doc)
                    for adjective in adjectives:
                        triples.append((trait, 'is', adjective))
                    # NOUN NUMMODS
                    nummod = extract_dnummod(t, doc)
                    triples.append((trait, 'is', nummod))
                
            #if check_species(t, species, doc):

        # APPEND
        descriptions[species, idx] = [triple for triple in triples if all(triple)]     
        
                    
        #print(idx, doc)
        print(descriptions[species, idx])
        #print('\n')

  0%|          | 0/1 [00:00<?, ?it/s]

TypeError: cannot unpack non-iterable NoneType object

In [18]:
def extract_noun_verbs_ROOT(t, doc):
    relations = []
    objects = []
    if t.dep_ not in ['ROOT', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
        return '', ''
    parent = next((parent for parent in t.ancestors), None)
    print(parent)
    if parent and parent.pos_ == 'VERB':
        prep = extract_verb_prep(parent, doc)
        if prep: relations.append(f'{parent.text} {prep}'), objects.append(extract_verb_pobj(prep, doc).lemma_)
        dobj = extract_verb_dobj(parent, doc)
        if dobj: relations.append(parent.text), objects.append(extract_verb_pobj(prep, doc).lemma_)
        oprd = extract_verb_orpd(parent, doc)
        
        agnt = extract_verb_agnt(parent, doc)
        nmod = extract_verb_nmod(parent, doc)
        advm = extract_verb_advm(parent, doc)
        if advm: relations.append(parent), objects.append(advm)
        if not any((prep, dobj, oprd, agnt, nmod, advm)): relations.append('are'), objects.append(parent)
            
    return relations, objects

def extract_noun_verbs_NON_ROOT(t, doc):
    relations = []
    objects = []
    if t.dep_ not in ['ROOT', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
        return '', ''
    # Double check
    parent = next((parent for parent in t.ancestors), None)
    if not parent:
        for child in t.children:
            if child.pos_ == 'VERB' and child.dep_ != 'amod':
                prep = extract_verb_prep(child, doc)
                if prep: 
                    noun = extract_verb_pobj(prep, doc)
                    relations.append(f'{child.text} {prep}')
                    objects.append(doc[noun.left_edge.i : noun.right_edge.i + 1].text)
                dobj = extract_verb_dobj(child, doc)
                if dobj:
                    relations.append(child.text)
                    objects.append(doc[dobj.left_edge.i : dobj.right_edge.i + 1].text) 
                oprd = extract_verb_orpd(child, doc)
                if oprd:
                    oprd_prep = extract_verb_preps(oprd, doc)
                    relations.append(f'{child.text} {oprd.text}')
                    objects.append(doc[oprd_prep.left_edge.i : oprd_prep.right_edge.i + 1].text)  
                agnt = extract_verb_agnt(child, doc)
                if agnt:
                    print(33333)
                nmod = extract_verb_nmod(child, doc)
                if nmod:
                    relations.append('is')
                    objects.append(f'{nmod.text} {child}') 
                        
    return relations, objects

In [41]:
def extract_child_subtree(t, doc):
    for child in t.children:
        subtree_idx = [t.i for t in child.subtree]
        print(subtree_idx)
        

In [42]:
extract_child_subtree(doc[7], doc)

[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]


In [34]:
doc[10].head

comprising

In [47]:

for t in doc[10].children:
    
    print(t.i)

8
9
11
13
25


In [19]:
extract_noun_verbs_ROOT(doc[0], doc)

('', '')

In [20]:
extract_noun_verbs_NON_ROOT(doc[2], doc)

(['comprising'],
 ['2 subequal glumes without lemmas, linear, 2 mm long, shorter than fertile, separately deciduous'])

In [21]:
displacy.render(doc)