In [1]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
import matplotlib.pyplot as plt
# NOTE(review): numpy and pandas are imported a second time here (already imported at the top of this cell).
import numpy as np
import pandas as pd
import glob
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import requests
import random
import pickle
import re
import spacy
# Load the `en_core_web_trf` spaCy pipeline once; `nlp` is used as a global by the cells below.
nlp = spacy.load('en_core_web_trf')
from spacy import displacy
import collections
from collections import Counter
# NOTE(review): same tqdm object as imported above, bound to a second name.
from tqdm.notebook import tqdm as tqdm_notebook
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
# `logging` below is the transformers logging module, not the standard-library one.
from transformers import DistilBertTokenizer, DistilBertModel, logging
from matplotlib.figure import Figure
from matplotlib import cm
import matplotlib.colors as colors
from torch import cuda
# Use the GPU when torch reports one is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

# Restrict transformers log output to errors.
logging.set_verbosity_error()

In [2]:
import sys
sys.path.insert(0, '../../src/models/')
sys.path.insert(0, '../../src/features/')

import predict_model
from predict_model import loadBERT
from predict_model import SpanPredictor as classify
from build_features import text_cleaner
%matplotlib inline

In [3]:
# Load the saved weights through the project's `loadBERT` helper and fetch the
# pretrained DistilBERT tokenizer.
# NOTE(review): neither `model` nor `tokenizer` is referenced again in the cells
# visible here — confirm this cell is still needed.
model = loadBERT("../../models/", 'saved_weights_inf_FIXED_boot_beta80.pt')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

CPU Success


In [21]:
# Wikipedia glossary that seeds the plant-part vocabulary.
URL = 'https://en.wikipedia.org/wiki/Glossary_of_plant_morphology'
# Get the page; fail loudly on an HTTP error instead of silently parsing an
# error page into an empty glossary.
page = requests.get(URL, timeout=5)
page.raise_for_status()
# NOTE(review): the page is force-decoded as ISO-8859-1 and the code below
# splits on the resulting 'â' character. This looks like a workaround for a
# multi-byte separator in the original page — confirm before changing either.
soup = BeautifulSoup(page.content, "lxml", from_encoding="iso-8859-1")

# Maps lower-cased section heading -> list of lower-cased terms in that section.
glossary = collections.defaultdict(list)
# Find all H4
for chapter in soup.find_all('h4'):
    # Remove only a literal trailing "[edit]". str.rstrip('[edit]') strips any
    # trailing run of the characters "[", "e", "d", "i", "t", "]" and would
    # truncate headings that end in one of those letters (e.g. "Seed" -> "S"),
    # which then never match their own heading text below.
    chapter_text = re.sub(r'\[edit\]$', '', chapter.text)
    # Find all siblings
    for sibling in chapter.find_next_siblings():
        # Iterate over the child nodes of the closest preceding <h4>
        for parent in sibling.find_previous_sibling('h4'):
            # Only append if the sibling belongs to the current chapter
            if parent.text == chapter_text:
                if 'â' in sibling.text:
                    for tag in sibling.find_all('li'):
                        # Text before the separator is the term; "/" separates alternatives
                        candidates = tag.text.split('â')[0]
                        candidates = candidates.split('/')
                        for candidate in candidates:
                            glossary[chapter_text.lower()].append(candidate.strip().lower())
                            
# Hand-picked terms appended to the scraped glossary, one call per section.
# Sections are touched in the same order as before, so a section missing from
# the scrape is created at the same position in the mapping.
glossary['leaves'].extend([
    'glume', 'surface', 'margin',
    'leaves', 'auricles', 'spatheole',
    'ovate', 'lanceolate',
    'rhachilla',
])
glossary['basic flower parts'].extend(['floret', 'awn'])
glossary['inflorescences'].extend([
    'spikelets', 'lemma', 'racemes',
    'axis',
])
glossary['other parts'].extend([
    'apex', 'culm', 'tube',
    'palea', 'crown', 'canopy',
    'base', 'callus', 'hair',
    'anther',
])
glossary['plant property'].extend([
    'tree', 'shrub',
    'life-span', 'life', 'span',
])

#with open('../../data/glossaries/plants.pkl', 'wb') as f:
#    pickle.dump(glossary, f)

In [22]:
# Modifier words checked by compound_reconstructor (kept as part of a trait
# name) and by extract_noun_adjectives (skipped as adjectives).
# Order and repeated entries are kept exactly as in the original list.
compounds = (
    ['fertile', 'sterile']
    + ['male', 'female', 'bisexual']
    + ['basal', 'developed']
    + ['primary', 'secondary', 'main']
    + ['upper', 'lower', 'greater', 'dorsal', 'alternate', 'lesser', 'apex', 'outer']
    + ['central', 'outermost', 'outer', 'inner', 'uppermost', 'median', 'dorsal', 'central', 'lateral']
)

#with open('../../data/glossaries/plants_compounds.pkl', 'wb') as f:
#    pickle.dump(compounds, f)

In [35]:
#data = pickle.load(open('../../data/description/04_TRAIN_0000000-0014557_PLANTS.pkl', 'rb'))
# Load the description subset. The `with` block closes the file handle, which
# the previous bare open() call left open.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('../../data/processed/PlantNET_plants_SUBSET.pkl', 'rb') as f:
    data = pickle.load(f)

In [36]:
def compound_reconstructor(token, doc):
    """Return the lemma of ``token``, widened to a multi-word trait name when possible.

    Looks back up to three tokens (furthest first). If one of them is a
    ``compound`` dependency, or its text or lemma is listed in ``compounds``,
    the span from that token up to and including ``token`` is used.

    Args:
        token: token-like object exposing ``i`` and ``lemma_``.
        doc: indexable and sliceable container of tokens (slices expose ``lemma_``).

    Returns:
        ``lemma_`` of the selected span, or of ``token`` alone when ``token`` is
        first in the doc, follows a determiner, or no look-back token qualifies.
    """
    if token.i == 0 or doc[token.i - 1].pos_ == 'DET':
        return token.lemma_

    for offset in (3, 2, 1):
        start = token.i - offset
        # Never index before the start of the doc: a negative index would wrap
        # around to the END of the doc and produce a bogus (usually empty) span.
        if start < 0:
            continue
        candidate = doc[start]
        # Each offset inspects its OWN token. The 2- and 1-token look-backs
        # previously tested the lemma of the token three positions back.
        if (candidate.dep_ == 'compound'
                or candidate.text.lower() in compounds
                or candidate.lemma_.lower() in compounds):
            return doc[start: token.i + 1].lemma_

    return token.lemma_

def check_existance(t, doc):
    """Return the glossary section that lists token ``t``, or None.

    The lower-cased lemma is looked up first; the lower-cased surface text is
    the fallback. The first section (in glossary order) containing the term wins.
    ``doc`` is accepted for signature parity with the other helpers and is unused.
    """
    def first_section(term):
        for section, terms in glossary.items():
            if term in terms:
                return section
        return None

    by_lemma = first_section(t.lemma_.lower())
    by_text = first_section(t.text.lower())
    return by_lemma or by_text or None

def extract_advmod(t, doc):
    """Return the first child of ``t`` whose dependency is ``advmod`` (None if absent)."""
    return next((kid for kid in t.children if kid.dep_ == 'advmod'), None)
        
def extract_nummod(t, doc):
    """Return the first child of ``t`` whose dependency is ``nummod`` (None if absent)."""
    return next((kid for kid in t.children if kid.dep_ == 'nummod'), None)

def extract_conjunction(t, doc):
    """Return ``t`` itself when it is a conjunct (``conj``) adjective, otherwise None."""
    is_conjoined_adjective = t.dep_ == 'conj' and t.pos_ == 'ADJ'
    return t if is_conjoined_adjective else None

def extract_amod(t, doc):
    """Return the first child of ``t`` whose dependency is ``amod`` (None if absent)."""
    return next((kid for kid in t.children if kid.dep_ == 'amod'), None)
        
def extract_measurements(t, doc):
    """Return the full subtree span of ``t`` when ``t`` is a measurement word.

    A measurement word is one whose text or lemma is 'wide', 'long' or 'high'.
    The span runs from ``t.left_edge`` to ``t.right_edge`` inclusive; None is
    returned for any other token.
    """
    dimension_words = ['wide', 'long', 'high']
    if t.text not in dimension_words and t.lemma_ not in dimension_words:
        return None
    return doc[t.left_edge.i : t.right_edge.i + 1]

def extract_prepositions(t, doc):
    """Return the subtree span of the first ``prep`` child of ``t`` (None if absent)."""
    prep = next((kid for kid in t.children if kid.dep_ == 'prep'), None)
    if prep is None:
        return None
    return doc[prep.left_edge.i : prep.right_edge.i + 1]

def define_position(x, y, doc):
    """Combine modifier ``x`` with head ``y`` into one phrase.

    Args:
        x: modifier; must expose ``text``. Single-word modifiers are expected
            to expose a doc index ``i`` as well.
        y: head token; must expose ``text`` (and ``i`` for the slice case).
        doc: sliceable token container.

    Returns:
        * ``'<y.text> <x.text>'`` when ``x`` spans more than one word;
        * otherwise the ``doc`` slice covering both tokens in document order;
        * ``'<y.text> <x.text>'`` as a fallback when the index lookup fails
          (for example when ``x`` has no ``i`` attribute).
    """
    if len(x.text.split()) > 1:
        return f'{y.text} {x.text}'

    try:
        first, last = (y.i, x.i) if x.i > y.i else (x.i, y.i)
        return doc[first : last + 1]
    except (AttributeError, TypeError):
        # Only the expected "no usable index" failures fall back to plain text;
        # a bare except would also swallow KeyboardInterrupt/SystemExit.
        return f'{y.text} {x.text}'


def extract_verb_advm(t, doc):
    """Return the first ``advmod`` child of ``t`` (None if absent)."""
    return next((kid for kid in t.children if kid.dep_ == 'advmod'), None)

def extract_verb_nmod(t, doc):
    """Return the subtree span of the first ``nummod`` child of ``t`` (None if absent).

    NOTE(review): a second function with this exact name is defined further
    down in the same cell (the ``acl`` variant) and replaces this one at run
    time, so this version is never the one that gets called.
    """
    numeric = next((kid for kid in t.children if kid.dep_ == 'nummod'), None)
    if numeric is None:
        return None
    return doc[numeric.left_edge.i : numeric.right_edge.i + 1]
        
def extract_verb_prep(t, doc):
    """Return the first ``prep`` child of ``t`` (None if absent)."""
    return next((kid for kid in t.children if kid.dep_ == 'prep'), None)

def extract_verb_pobj(t, doc):
    """Return the first child of ``t`` that is a ``pobj``, ``pcomp`` or ``prep`` (None if absent)."""
    wanted = ('pobj', 'pcomp', 'prep')
    return next((kid for kid in t.children if kid.dep_ in wanted), None)
        
def extract_verb_nmod(t, doc):
    """Return the subtree span of the first ``acl`` child of ``t`` (None if absent)."""
    clause = next((kid for kid in t.children if kid.dep_ == 'acl'), None)
    if clause is None:
        return None
    return doc[clause.left_edge.i : clause.right_edge.i + 1]
        
def extract_verb_dobj(t, doc):
    """Return the first ``dobj`` child of ``t`` (None if absent)."""
    return next((kid for kid in t.children if kid.dep_ == 'dobj'), None)
        
def extract_verb_orpd(t, doc):
    """Return the first ``oprd`` child of ``t`` (None if absent)."""
    return next((kid for kid in t.children if kid.dep_ == 'oprd'), None)
        
def extract_verb_agnt(t, doc):
    """Return the first ``agent`` child of ``t`` (None if absent)."""
    return next((kid for kid in t.children if kid.dep_ == 'agent'), None)
        
def extract_verb_acomp(t, doc):
    """Return the lower-cased subtree text of the first ``acomp`` child of ``t`` (None if absent)."""
    complement = next((kid for kid in t.children if kid.dep_ == 'acomp'), None)
    if complement is None:
        return None
    span = doc[complement.left_edge.i : complement.right_edge.i + 1]
    return span.text.lower()
        
def extract_verb_attr(t, doc):
    """Return the lower-cased subtree text of the first ``attr`` child of ``t`` (None if absent)."""
    attribute = next((kid for kid in t.children if kid.dep_ == 'attr'), None)
    if attribute is None:
        return None
    span = doc[attribute.left_edge.i : attribute.right_edge.i + 1]
    return span.text.lower()
        
def extract_nounandverb_nummods(t, doc):
    """Return the subtree span of the first ``nummod`` child of ``t`` (None if absent)."""
    numeric = next((kid for kid in t.children if kid.dep_ == 'nummod'), None)
    if numeric is None:
        return None
    return doc[numeric.left_edge.i : numeric.right_edge.i + 1]

def extract_dnummod(t, doc):
    """Return the text of the numeric-modifier span found by
    ``extract_nounandverb_nummods``, or None when that helper finds nothing."""
    numeric_span = extract_nounandverb_nummods(t, doc)
    return numeric_span.text if numeric_span else None

def extract_noun_adjectives(t, doc):
    """Collect adjective-like descriptions attached to the noun token ``t``.

    For every qualifying child of ``t`` the function gathers, in this order:
    a measurement span, then the child combined (via ``define_position``) with
    its amod / advmod / prep / nummod modifier. When none of those exist the
    bare child is used. It then walks the child's subtree for conjunct
    adjectives and treats them the same way (without the measurement and amod
    steps). A single child can therefore contribute several entries.

    Returns:
        The collected items passed through ``clean_adjectives``.
    """
    adjs = []
    for child in t.children:
        # Appositions and compound parts are not handled here.
        if child.dep_ == 'appos' or child.dep_ == 'compound':
            continue
        # NOTE(review): `and` binds tighter than `or`, so this reads as
        # "ADJ with any dependency, OR VBN whose dependency is conj/amod" —
        # confirm that the dependency filter is not meant to apply to ADJ too.
        if child.pos_ =='ADJ' or child.tag_ == 'VBN' and child.dep_ in ['conj', 'amod']:
            # Words listed in `compounds` are skipped here.
            if child.lemma_.lower() in compounds or child.text.lower() in compounds:
                continue
            measurement = extract_measurements(child, doc)
            if measurement:
                obj = measurement
                adjs.append(obj)
            amod = extract_amod(child, doc)
            if amod:
                obj = define_position(amod, child, doc)
                adjs.append(obj)
            advmod = extract_advmod(child, doc)
            if advmod:
                obj = define_position(advmod, child, doc)
                adjs.append(obj)
            prep = extract_prepositions(child, doc)
            if prep:
                obj = define_position(prep, child, doc)
                adjs.append(obj)
            nummod = extract_nummod(child, doc)
            if nummod:
                obj = define_position(nummod, child, doc)
                adjs.append(obj)                
            # No modifier of any kind: keep the bare adjective/participle.
            if not any((measurement, amod, advmod, prep, nummod)): 
                obj = child
                adjs.append(obj)
            # Conjunct adjectives anywhere in the child's subtree.
            for grandchild in child.subtree:
                conj = extract_conjunction(grandchild, doc)
                if conj:
                    advmod = extract_advmod(conj, doc)
                    if advmod:
                        obj = define_position(advmod, conj, doc)
                        adjs.append(obj)
                    prep = extract_prepositions(conj, doc)
                    if prep:
                        obj = define_position(prep, conj, doc)
                        adjs.append(obj)
                    nummod = extract_nummod(conj, doc)
                    if nummod:
                        obj = define_position(nummod, conj, doc)
                        adjs.append(obj)
                    # Unmodified conjunct: keep it as-is.
                    if not any((advmod, prep, nummod)):
                        obj = conj
                        adjs.append(obj)
                        
    return clean_adjectives(adjs)

def extract_noun_appos(t, doc):
    """Return the comma-separated pieces of every ``appos`` child subtree of ``t``.

    Each subtree's text is lower-cased, split on commas and stripped; the
    pieces of all appositions are returned as one flat list.
    """
    return [
        piece.strip()
        for kid in t.children
        if kid.dep_ == 'appos'
        for piece in doc[kid.left_edge.i : kid.right_edge.i + 1].text.lower().split(',')
    ]

def extract_noun_prep(t, doc):
    """Return the comma-separated pieces of every ``prep`` child subtree of ``t``.

    Each subtree's text is lower-cased, split on commas and stripped; the
    pieces of all prepositional phrases are returned as one flat list.
    """
    return [
        piece.strip()
        for kid in t.children
        if kid.dep_ == 'prep'
        for piece in doc[kid.left_edge.i : kid.right_edge.i + 1].text.lower().split(',')
    ]

def check_species(t, species, doc):
    """Return True when the text of ``t`` is one of the whitespace-separated words of ``species``."""
    return t.text in species.split()
    
def extract_noun_verbs_ROOT(t, doc):
    """Extract (relation, object) pairs from the verb that governs noun ``t``.

    Only tokens whose dependency is ROOT or a (clausal/passive) subject are
    considered; the governing token is the first entry of ``t.ancestors`` and
    must be a VERB or AUX.

    Args:
        t: token-like object exposing ``dep_`` and ``ancestors``.
        doc: parsed document, forwarded to the ``extract_verb_*`` helpers.

    Returns:
        ``('', '')`` when ``t`` has a non-qualifying dependency, otherwise the
        two parallel lists produced by ``clean_verbs(relations, objects)``.
    """
    relations = []
    objects = []
    if t.dep_ not in ['ROOT', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
        return '', ''
    parent = next(iter(t.ancestors), None)
    if parent and parent.pos_ in ('VERB', 'AUX'):
        try:
            prep = extract_verb_prep(parent, doc)
            if prep:
                target = extract_verb_pobj(prep, doc).lemma_
                relations.append(f'{parent.text} {prep}')
                objects.append(target)
            dobj = extract_verb_dobj(parent, doc)
            if dobj:
                # NOTE(review): this takes the object *below* the direct object,
                # unlike the NON_ROOT variant which uses dobj.lemma_ — confirm.
                target = extract_verb_pobj(dobj, doc).lemma_
                relations.append(parent.text)
                objects.append(target)
            oprd = extract_verb_orpd(parent, doc)
            if oprd:
                relations.append(parent.text)
                objects.append(oprd.text)
            agnt = extract_verb_agnt(parent, doc)
            if agnt:
                target = extract_verb_pobj(agnt, doc).lemma_
                relations.append(f'{parent.text} {agnt}')
                objects.append(target)
            # An acl clause is detected but not emitted as a pair (the original
            # only printed it for debugging); it still suppresses the fallback below.
            nmod = extract_verb_nmod(parent, doc)
            advm = extract_verb_advm(parent, doc)
            if advm:
                relations.append(parent.text)
                objects.append(advm.text)
            acomp = extract_verb_acomp(parent, doc)
            if acomp:
                relations.append(parent.text)
                objects.append(acomp)  # Already text
            attr = extract_verb_attr(parent, doc)
            if attr:
                relations.append(parent.text)
                objects.append(attr)  # Already text
            # Nothing attached to the verb: describe the noun by the verb itself.
            if not any((prep, dobj, oprd, agnt, nmod, advm, acomp, attr)):
                relations.append('are')
                objects.append(parent.text)
        except AttributeError:
            # A helper returned None where an object token was required (e.g. a
            # preposition without an object). As before, extraction for this
            # verb stops here and the pairs gathered so far are kept.
            pass

    return clean_verbs(relations, objects)

def extract_noun_verbs_NON_ROOT(t, doc):
    """Extract (relation, object) pairs from verbs hanging *below* noun ``t``.

    Applies only when ``t`` has a ROOT/subject dependency and no ancestor at
    all. Every VERB child of ``t`` that is not an ``amod`` is inspected for a
    preposition, direct object, object predicate, agent and acl clause.

    Args:
        t: token-like object exposing ``dep_``, ``ancestors`` and ``children``.
        doc: parsed document, used for span slicing and forwarded to helpers.

    Returns:
        ``('', '')`` when ``t`` has a non-qualifying dependency, otherwise the
        two parallel lists produced by ``clean_verbs(relations, objects)``.
    """
    relations = []
    objects = []
    if t.dep_ not in ['ROOT', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
        return '', ''
    # Double check: only proceed when the token has no governing token.
    parent = next(iter(t.ancestors), None)
    if not parent:
        try:
            for child in t.children:
                if child.pos_ == 'VERB' and child.dep_ != 'amod':
                    prep = extract_verb_prep(child, doc)
                    if prep:
                        noun = extract_verb_pobj(prep, doc)
                        target = doc[noun.left_edge.i : noun.right_edge.i + 1].text
                        relations.append(f'{child.text} {prep}')
                        objects.append(target)
                    dobj = extract_verb_dobj(child, doc)
                    if dobj:
                        relations.append(child.text)
                        objects.append(dobj.lemma_)
                    oprd = extract_verb_orpd(child, doc)
                    if oprd:
                        oprd_prep = extract_verb_prep(oprd, doc)
                        if oprd_prep:
                            relations.append(f'{child.text} {oprd.text}')
                            objects.append(doc[oprd_prep.left_edge.i : oprd_prep.right_edge.i + 1].text)
                        else:
                            relations.append(child.text)
                            objects.append(oprd.text)
                    agnt = extract_verb_agnt(child, doc)
                    if agnt:
                        noun = extract_verb_pobj(agnt, doc)
                        target = doc[noun.left_edge.i : noun.right_edge.i + 1].text
                        relations.append(f'{child.text} {agnt.text}')
                        objects.append(target)
                    nmod = extract_verb_nmod(child, doc)
                    if nmod:
                        relations.append('is')
                        objects.append(f'{nmod.text} {child}')
        except AttributeError:
            # A helper returned None where an object token was required (e.g. a
            # preposition without an object). As before, extraction stops here
            # and the pairs gathered so far are kept.
            pass

    return clean_verbs(relations, objects)


def clean_verbs(relations, objects):
    """Normalise parallel relation/object lists and split enumerated objects.

    Each object string is split on commas and on the standalone word "and";
    every piece is paired with a copy of its relation. Relations and pieces
    are lower-cased and stripped.

    Args:
        relations: iterable of relation strings.
        objects: iterable of object strings, parallel to ``relations``
            (extra items in the longer one are ignored).

    Returns:
        Tuple ``(rel, obj)`` of two equally long lists of strings.
    """
    rel = []
    obj = []
    for relation, object_ in zip(relations, objects):
        # \b keeps "and" from matching inside words such as "banded" or
        # "gland", which the plain ',|and' pattern used to cut in two.
        for obj_split in re.split(r',|\band\b', object_):
            rel.append(relation.lower().strip())
            obj.append(obj_split.lower().strip())

    return rel, obj

def clean_adjectives(adjs):
    """Turn collected adjective items into a flat list of lower-cased strings.

    Items may be plain strings, single tokens or multi-token spans:

    * strings are lower-cased as they are;
    * verbal items (a token whose ``pos_`` is VERB, or a span whose ``root``
      is a VERB) keep their surface text, so participles are not reduced;
    * everything else is reduced to its lemma.

    The previous implementation reached the lemma branch only by accident:
    tokens have no ``root`` and spans have no ``pos_``, so the attribute error
    sent every non-verb item to the surface-text fallback.

    Args:
        adjs: iterable of strings / token-like / span-like objects.

    Returns:
        List of strings; items containing commas are split into their pieces.
    """
    adjectives = []
    for adj in adjs:
        if isinstance(adj, str):
            adj_text = adj.lower()
        else:
            # A span exposes its syntactic head as `.root`; a token is its own head.
            head = getattr(adj, 'root', adj)
            if getattr(head, 'pos_', None) == 'VERB':
                adj_text = adj.text.lower()
            else:
                # Fall back to the surface text for objects without a lemma.
                adj_text = getattr(adj, 'lemma_', adj.text).lower()
        adjectives.extend(piece.strip() for piece in adj_text.split(','))

    return adjectives
        
def adjective_reverser(doc):
    """Re-parse a sentence whose ROOT is an adjective with that adjective moved forward.

    When the ROOT token is an ADJ and is not the first token, the text is
    rebuilt as ``"<Root capitalised> <previous token lower-cased> <rest>"`` and
    parsed again with the global ``nlp`` pipeline. In every other case the
    input ``doc`` is returned unchanged.

    Args:
        doc: parsed document (iterable, indexable and sliceable).

    Returns:
        The re-parsed document, or ``doc`` itself when no rewrite applies.
    """
    # Default of None: an empty doc has no ROOT and must not raise StopIteration.
    root = next((t for t in doc if t.dep_ == 'ROOT'), None)
    if root is None or root.pos_ != 'ADJ':
        return doc
    # With the adjective in first position there is no preceding token to swap
    # with; index -1 would silently pick the LAST token of the doc instead.
    if root.i == 0:
        return doc

    span_1 = doc[root.i - 1].text.lower()
    span_2 = root.text.capitalize()
    span_3 = doc[root.i + 1 : ].text

    text = f'{span_2} {span_1} {span_3}'
    return nlp(text)

In [38]:
# Maps (species, sentence index) -> list of (subject, relation, object) triples.
descriptions = collections.defaultdict(list)

# For plotting purposes
# NOTE(review): `parts` and `traits` are not filled anywhere in this cell.
parts = []
traits = []
# NOTE(review): hard-coded slices — only the third species key and its first
# ten texts are processed.
for species in tqdm_notebook(list(data.keys())[2:3]):
    for idx, text in enumerate(data[species][0:10]):

        # Clean the text
        # Replace every '.' that has no digit directly before it and no digit
        # directly after it with a space (decimal points are kept).
        text = re.sub(r'(?<!\d)\.(?!\d)', ' ', text)
        # Spell out a whitespace-delimited multiplication sign.
        text = re.sub(r'\s×\s', ' times ', text)
        # Replace non-breaking spaces with regular spaces.
        text = re.sub(r'\xa0', ' ', text)
        # Terminate the cleaned text with a single full stop.
        text = f'{text.strip()}.'
        # Reset variables
        part=trait=rel=obj=adjectives = None 
        # NLP
        # NOTE: `doc` stays bound after the loop and is read by later cells.
        doc = nlp(text)
        # Init
        descriptions[species, idx] = []
        triples = []
        # Loop over tokens
        for t in doc:
            # Compound parts are picked up via their head noun below.
            if t.dep_ == 'compound':
                continue
            ### SUBJECTS ###    
            if t.pos_ == 'NOUN' or t.pos_ == 'PROPN':
                # Check existance of parts
                part = check_existance(t, doc)
                if part:
                    # Reconstruct Compounds & Append
                    trait = compound_reconstructor(t, doc)
                    triples.append(('species', 'has main part', part))
                    triples.append((part, f'has sub part', trait))
                    # NOUN ADJECTIVES
                    adjectives = extract_noun_adjectives(t, doc)
                    for adjective in adjectives:
                        triples.append((trait, 'is', adjective))
                    # NOUN ROOT VERBS
                    verbs_rel, verbs_obj = extract_noun_verbs_ROOT(t, doc)
                    for rel, obj in zip(verbs_rel, verbs_obj):
                        triples.append((trait, rel, obj))
                    # NOUN NON ROOT VERBS
                    verbs_rel, verbs_obj = extract_noun_verbs_NON_ROOT(t, doc)
                    for rel, obj in zip(verbs_rel, verbs_obj):
                        triples.append((trait, rel, obj))
                    # NOUN APPOSITIONAL MODIFIER
                    adjectives = extract_noun_appos(t, doc)
                    for adjective in adjectives:
                        triples.append((trait, 'is', adjective))
                    # NOUN NUMMODS
                    # nummod may be None; such triples are dropped by the all() filter below.
                    nummod = extract_dnummod(t, doc)
                    triples.append((trait, 'is', nummod))
                    # NOUN PREPOSITIONAL MODIFIER
                    adjectives = extract_noun_prep(t, doc)
                    for adjective in adjectives:
                        triples.append((trait, 'is', adjective))                

        # APPEND
        # Keep only triples whose three members are all truthy (no None / empty string).
        descriptions[species, idx] = [triple for triple in triples if all(triple)]     
        
                    
        # Debug output: the parsed sentence followed by its extracted triples.
        print(idx, doc)
        print(descriptions[species, idx])
        print('\n')

  0%|          | 0/1 [00:00<?, ?it/s]

0 Simple perforation rim, vessel-vessel pits.
[]


1 Axial parenchyma   Axial parenchyma paratracheal and confluent to banded.
[]


2 Habit:   Shrub to 4 m tall, occasionally a small tree to 7 m   Distribution:  Introduced.
[('species', 'has main part', 'plant property'), ('plant property', 'has sub part', 'shrub'), ('shrub', 'is', 'a small tree to 7 m'), ('shrub', 'is', 'to 4 m tall'), ('species', 'has main part', 'plant property'), ('plant property', 'has sub part', 'tree'), ('tree', 'is', 'small'), ('tree', 'is', 'to 7 m')]


3 Rays 1-2 cells wide.
[('species', 'has main part', 'basic flower parts'), ('basic flower parts', 'has sub part', 'cell'), ('cell', 'is', '1-2')]


4 Longitudinal surface.
[('species', 'has main part', 'leaves'), ('leaves', 'has sub part', 'surface'), ('surface', 'is', 'longitudinal')]


5 Rays  Rays 1-2 cells wide with uniseriate rays present.
[('species', 'has main part', 'basic flower parts'), ('basic flower parts', 'has sub part', 'cell'), ('cell', 'is', '

In [34]:
# Render `doc` with displaCy. This relies on the global `doc` left behind by an
# earlier cell, i.e. whichever text was parsed last.
displacy.render(doc)

In [26]:
def extract_noun_verbs_ROOT(t, doc):
    """Extract (relation, object) pairs from the verb that governs noun ``t``.

    NOTE(review): this cell redefines the function of the same name from the
    helper-function cell above. It is kept behaviour-compatible with that
    definition (including the ``attr`` handling and the final ``return``);
    consider deleting this duplicate cell.

    Only tokens whose dependency is ROOT or a (clausal/passive) subject are
    considered; the governing token is the first entry of ``t.ancestors`` and
    must be a VERB or AUX.

    Args:
        t: token-like object exposing ``dep_`` and ``ancestors``.
        doc: parsed document, forwarded to the ``extract_verb_*`` helpers.

    Returns:
        ``('', '')`` when ``t`` has a non-qualifying dependency, otherwise the
        two parallel lists produced by ``clean_verbs(relations, objects)``.
    """
    relations = []
    objects = []
    if t.dep_ not in ['ROOT', 'nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
        return '', ''
    parent = next(iter(t.ancestors), None)
    if parent and parent.pos_ in ('VERB', 'AUX'):
        try:
            prep = extract_verb_prep(parent, doc)
            if prep:
                target = extract_verb_pobj(prep, doc).lemma_
                relations.append(f'{parent.text} {prep}')
                objects.append(target)
            dobj = extract_verb_dobj(parent, doc)
            if dobj:
                target = extract_verb_pobj(dobj, doc).lemma_
                relations.append(parent.text)
                objects.append(target)
            oprd = extract_verb_orpd(parent, doc)
            if oprd:
                relations.append(parent.text)
                objects.append(oprd.text)
            agnt = extract_verb_agnt(parent, doc)
            if agnt:
                target = extract_verb_pobj(agnt, doc).lemma_
                relations.append(f'{parent.text} {agnt}')
                objects.append(target)
            # An acl clause is detected but not emitted as a pair; it still
            # suppresses the fallback below.
            nmod = extract_verb_nmod(parent, doc)
            advm = extract_verb_advm(parent, doc)
            if advm:
                relations.append(parent.text)
                objects.append(advm.text)
            acomp = extract_verb_acomp(parent, doc)
            if acomp:
                relations.append(parent.text)
                objects.append(acomp)  # Already text
            attr = extract_verb_attr(parent, doc)
            if attr:
                relations.append(parent.text)
                objects.append(attr)  # Already text
            # Nothing attached to the verb: describe the noun by the verb itself.
            if not any((prep, dobj, oprd, agnt, nmod, advm, acomp, attr)):
                relations.append('are')
                objects.append(parent.text)
        except AttributeError:
            # A helper returned None where an object token was required;
            # extraction for this verb stops and the pairs so far are kept.
            pass

    # Without this return every qualifying call yielded None, which breaks
    # callers that unpack the result into two lists.
    return clean_verbs(relations, objects)

In [27]:
# Spot-check on the second token of the current global `doc`. The recorded
# result ('', '') is the early-exit value returned when the token's dependency
# is not ROOT or a subject label.
extract_noun_verbs_ROOT(doc[1],doc)

('', '')