In [65]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from matplotlib.colors import is_color_like as color_check
import requests
import random
import pickle
import re
import spacy
nlp = spacy.load('en_core_web_trf')
from spacy import displacy
import collections
from collections import Counter
from tqdm.notebook import tqdm as tqdm_notebook
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from transformers import DistilBertTokenizer, DistilBertModel, logging
from matplotlib.figure import Figure
from matplotlib import cm
import matplotlib.colors as colors
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [2]:
# URL
URL = 'https://en.wikipedia.org/wiki/Glossary_of_plant_morphology'
# Get the page
page = requests.get(URL, timeout=5)
soup = BeautifulSoup(page.content, "lxml", from_encoding="iso-8859-1")   

glossary = collections.defaultdict(list)
# Find all H4 
for chapter in soup.find_all('h4')[0:]:
    # Clean
    chapter_text = chapter.text.rstrip('[edit]')
    # Find all siblings
    for sibling in chapter.find_next_siblings():
        # Find the parent
        for parent in sibling.find_previous_sibling('h4'):
            # Only append if correspond to current chapter
            if parent.text == chapter_text:
                if 'â' in sibling.text:
                    for tag in sibling.find_all('li'):
                        candidates = tag.text.split('â')[0]
                        candidates = candidates.split('/')
                        for candidate in candidates:
                            glossary[chapter_text.lower()].append(candidate.strip().lower())  
                            
glossary['leaves'] += [
    'glume', 'surface', 'margin',
    'leaves', 'auricles', 'spatheole',
    'ovate', 'lanceolate',
]

glossary['basic flower parts'] += [
    'floret', 'awn',
    'pod', 'lobe', 
    'capitulum', 'capitula', # unkown
    'legume', 'calyx', 
]
glossary['inflorescences'] += [
    'spikelets', 'lemma', 'racemes',
    'axis', 'cluster', 
]
glossary['leaves'] += [
    'rhachilla',
    'needles',
]

glossary['other parts'] += [
    'apex', 'culm', 'tube',
    'palea', 'crown', 'canopy',
    'base', 'callus', 'hair',
    'anther', 'tuberculate', 'cone',
    'shoot',

]

glossary['plant property'] += [
    'tree', 'shrub',
    'life-span', 'life', 'span',
    'bloom-time', 'species', 'wood', 'timber',
    'color', 'colour',
    
]

In [20]:
compounds = [
    'fertile', 'sterile',
    'male', 'female', 'bisexual',
    'basal', 'developed', 
    'primary', 'secondary', 'main',
    'upper', 'lower', 'greater', 'dorsal', 'alternate', 'lesser', 'apex', 'outer',
    'central', 'outermost', 'outer', 'inner', 'uppermost', 'median', 'dorsal', 'central', 'lateral',
    'young', 'mature', 'individual', 
]

rubbish = [
    '.', ',', '-',
]

measurements = [
    'mm', 'cm', 'm', 'km',
    'milimeter', 'centimeter', 'meter', 'kilometer',
    'milimetre', 'centimetre', 'metre', 'kilometre',
    'inch', 'foot', 'yard', 'mile',
    'wide', 'long', 'broad', 'tall',
    'length',
]


In [126]:
def compound_reconstructor(token, doc):
    if token.i == 0:
        trait = token
    elif doc[token.i - 1].pos_ == 'DET':
        trait = token
    elif doc[token.i - 3].dep_ == 'compound':
        trait = doc[token.i - 3: token.i + 1]
    elif doc[token.i - 3].text.lower() in compounds or doc[token.i - 3].lemma_.lower() in compounds:
        trait = doc[token.i - 3: token.i + 1]
    elif doc[token.i - 2].dep_ == 'compound':
        trait = doc[token.i - 2: token.i + 1]
    elif doc[token.i - 2].text.lower() in compounds or doc[token.i - 3].lemma_.lower() in compounds:
        trait = doc[token.i - 2: token.i + 1]
    elif doc[token.i - 1].dep_ == 'compound':
        trait = doc[token.i - 1: token.i + 1]
    elif doc[token.i - 1].text.lower() in compounds or doc[token.i - 3].lemma_.lower() in compounds:
        trait = doc[token.i - 1: token.i + 1]
    else:
        trait = token
    if ','  in trait.lemma_:
        trait = token
    return trait.lemma_    

def check_existance(t, doc):
    
    if t.i + 1 < len(doc) and doc[t.i + 1].lemma_ == '-':
        return None
    # Check prep
    single = next((key for key, value in glossary.items() if t.lemma_.lower() in value), None)
    multi = next((key for key, value in glossary.items() if t.text.lower() in value), None)
    if single:
        return single
    elif multi:
        return multi
    else:
        return None

def text_preparation(species, text):
    
    cleaners = [(r'(?<!\d)\.(?!\d)', ' '),
                (r'\s×\s', ' times '),
                #(r'\xa0', ' '),
                (r'\s+c\s+', ' '),
                (r'â\x80\x93', ' to '),
                (r'\xa0', ' '),
                (r'\x97', ''),]
    
    species_parts = species.split()
    candidates = [' '.join(species_parts[:idx+1]) for idx, _ in enumerate(species_parts)]
    candidates += [
        f'{species_parts[0][0]}. {species_parts[1]}'
    ]
    candidates.reverse()
    for candidate in candidates:
        try:
            text = re.sub(candidate, 'the species', text)
        except:
            continue # Skip species with brackets for now
    for (cleaner, replacement) in cleaners:
        text = re.sub(cleaner, replacement, text)    
    text = f'{text.strip()}.'
    
    return text.capitalize()


def extract_nsubjects(t, doc, attribute):
    relations = []
    objects = []
    if t.dep_  == 'nsubj':
        verb = next((parent for parent in t.ancestors if parent.pos_ == 'VERB'), None)
        if verb:
            if verb.lemma_ in ['be',]:
                relations.append('property')
            else:
                relations.append(verb.text)
            objects += [child.lemma_ for child in verb.children if child.dep_ == attribute]
    return relations, objects

def extract_mods(t, doc, attribute):
    relations = []
    objects = []
    for child in t.children:
        if child.dep_ == attribute and child.text.lower() not in compounds:
            if child.lemma_ in measurements:
                relations.append('measurement')
                objects.append(doc[child.left_edge.i : child.right_edge.i + 1].lemma_)
            elif color_check(child.lemma_):
                relations.append('color')
                objects.append(doc[child.left_edge.i : child.right_edge.i + 1].lemma_) 
            else:
                relations.append('property')
                objects.append(doc[child.left_edge.i : child.right_edge.i + 1].lemma_)     
    return clean_verbs(relations, objects)

def extract_appos(t, doc):
    relations = []
    objects = []
    for child in t.children:
        if child.dep_ == 'appos' and child.text not in compounds:
            if child.lemma_ in measurements:
                relations.append('measurement')
                objects.append(doc[child.left_edge.i : child.right_edge.i + 1].lemma_)
            elif child.pos_ == 'NOUN':
                if color_check(child.lemma_):
                    relations.append('color')
                else:
                    relations.append('has sub part')
                objects.append(child.lemma_)
                #appos_rel, appos_obj = extract_information(child, doc)
                #for ar, ao in zip(appos_rel, appos_obj):
                #    relations.append(ar)
                #    objects.append(ao)
            elif child.pos_ == 'ADJ':
                relations.append('property')
                objects.append(doc[child.left_edge.i : child.right_edge.i + 1].lemma_)
            else:
                print(3)
    return clean_verbs(relations, objects)

def clean_verbs(relations, objects):
    rel = []
    obj = []
    for relation, object_ in zip(relations, objects):       
        for obj_split in re.split(',| and | or | with ', object_):
            relation = relation.lower().strip()
            if color_check(relation) or color_check(relation.split()[-1]):
                relation = 'color'
            obj_split = obj_split.lower().strip()
            if obj_split:
                rel.append(relation)
                print(relations, obj_split)
                obj.append(obj_split)                    
    return rel, obj

def extract_information(t, doc):
    
    verbal_attributes = [
        "acomp", "dobj",
    ]
    noun_attributes = [
        "amod", "nummod",
    ]
    
    relations_list = []
    objects_list = []
    
    for attribute in verbal_attributes:
            relations, objects = extract_nsubjects(t, doc, attribute)
            relations_list += relations
            objects_list   += objects
    
    for attribute in noun_attributes:
            relations, objects = extract_mods(t, doc, attribute)
            relations_list += relations
            objects_list   += objects
            
    
    relations, objects = extract_appos(t, doc)
    relations_list += relations
    objects_list   += objects    
    
    return relations_list, objects_list

In [127]:
#data = pickle.load(open('../../data/PlantNet/descriptions_raw.pkl', 'rb'))
#data = pickle.load(open('../../data/description/04_TRAIN_0000000-0014557_PLANTS.pkl', 'rb'))

In [128]:


for species in tqdm_notebook(list(data.keys())[0:1]):
    for idx, text in enumerate(data[species][2:3]):
        triples = []
        text = text_preparation(species, text)
        doc = nlp(text)
        for t in doc:
            if t.pos_ == 'NOUN' or t.pos_ == 'PROPN' or t.pos_ == 'PRON':
                if t.dep_ == 'compound':
                    continue
                part = check_existance(t, doc)
                if part: 
                    # Trait
                    trait = compound_reconstructor(t, doc)

                    relations, objects = extract_information(t, doc)
                    for rel, obj in zip(relations, objects):
                        triples.append(('species', 'has main part', part))
                        triples.append((part, f'has part', trait))
                        triples.append((trait, rel, obj))

        print(text)
        print(idx, triples)
        print('\n')
                    

  0%|          | 0/1 [00:00<?, ?it/s]

['color'] grayish brown
['color'] dark reddish brown
['has sub part'] shoot
['property', 'property'] glabrous
['property', 'property'] short
Branches grayish brown or dark reddish brown, glabrous, short shoots of young plants often thorn-tipped.
0 [('species', 'has main part', 'stems'), ('stems', 'has part', 'branch'), ('branch', 'tipped', 'grayish brown'), ('species', 'has main part', 'stems'), ('stems', 'has part', 'branch'), ('branch', 'tipped', 'dark reddish brown'), ('species', 'has main part', 'stems'), ('stems', 'has part', 'branch'), ('branch', 'color', 'shoot'), ('species', 'has main part', 'other parts'), ('other parts', 'has part', 'shoot'), ('shoot', 'property', 'glabrous'), ('species', 'has main part', 'other parts'), ('other parts', 'has part', 'shoot'), ('shoot', 'property', 'short')]




In [112]:
displacy.render(doc)

In [109]:
displacy.render(nlp('Stems often with spiny lower branches'))