In [14]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from matplotlib.colors import is_color_like as color_check
import requests
import random
import pickle
import re
import spacy
nlp = spacy.load('en_core_web_trf')
from spacy import displacy
import collections
from collections import Counter
from tqdm.notebook import tqdm as tqdm_notebook
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from transformers import DistilBertTokenizer, DistilBertModel, logging
from matplotlib.figure import Figure
from matplotlib import cm
import matplotlib.colors as colors
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [15]:
# URL
URL = 'https://en.wikipedia.org/wiki/Glossary_of_plant_morphology'
# Get the page
page = requests.get(URL, timeout=5)
soup = BeautifulSoup(page.content, "lxml", from_encoding="iso-8859-1")   

glossary = collections.defaultdict(list)
# Find all H4 
for chapter in soup.find_all('h4')[0:]:
    # Clean
    chapter_text = chapter.text.rstrip('[edit]')
    # Find all siblings
    for sibling in chapter.find_next_siblings():
        # Find the parent
        for parent in sibling.find_previous_sibling('h4'):
            # Only append if correspond to current chapter
            if parent.text == chapter_text:
                if 'â' in sibling.text:
                    for tag in sibling.find_all('li'):
                        candidates = tag.text.split('â')[0]
                        candidates = candidates.split('/')
                        for candidate in candidates:
                            glossary[chapter_text.lower()].append(candidate.strip().lower())  
                            
glossary['leaves'] += [
    'glume', 'surface', 'margin',
    'leaves', 'auricles', 'spatheole',
    'ovate', 'lanceolate',
]

glossary['basic flower parts'] += [
    'floret', 'awn',
    'pod', 'lobe', 
    'capitulum', 'capitula', # unkown
    'legume', 'calyx', 'flowerhead'
]
glossary['inflorescences'] += [
    'spikelets', 'lemma', 'racemes',
    'axis', 'cluster', 
]
glossary['leaves'] += [
    'rhachilla',
    'needles',
]

glossary['other parts'] += [
    'apex', 'culm', 'tube',
    'palea', 'crown', 'canopy',
    'base', 'callus', 'hair',
    'anther', 'tuberculate', 'cone',
    'shoot', 'gland',

]

glossary['plant property'] += [
    'tree', 'shrub', 'plant',
    'life-span', 'life', 'span',
    'bloom-time', 'species', 'wood', 'timber',
    'color', 'colour', 
    
]

glossary['stems'] += [
    'branchlet', 
    
]

In [16]:
glossary.keys()

dict_keys(['morphology', 'roots', 'stems', 'buds', 'leaves', 'basic flower parts', 'inflorescences', 'insertion of floral parts', 'union of flower parts', 'flower sexuality and presence of floral parts', 'flower symmetry', 'terms for fruits', 'fruit types', 'pteridophytes', 'bryophytes', 'other parts', 'plant property'])

In [17]:
compounds = [
    'fertile', 'sterile',
    'male', 'female', 'bisexual',
    'basal', 'developed', 
    'primary', 'secondary', 'main',
    'upper', 'lower', 'greater', 'dorsal', 'alternate', 'lesser', 'apex', 'outer',
    'central', 'outermost', 'outer', 'inner', 'uppermost', 'median', 'dorsal', 'central', 'lateral',
    'young', 'mature', 'individual', 
    'opposite', 
]

rubbish = [
    '.', ',', '-', '..', '...',
]

measurements = [
    'mm', 'cm', 'm', 'km',
    'milimeter', 'centimeter', 'meter', 'kilometer',
    'milimetre', 'centimetre', 'metre', 'kilometre',
    'inch', 'foot', 'yard', 'mile',
    'wide', 'long', 'broad', 'tall',
    'length', 'form',
]


In [18]:
def compound_reconstructor(token, doc):
    if token.i == 0:
        trait = token
    elif doc[token.i - 1].pos_ == 'DET':
        trait = token
    elif doc[token.i - 3].dep_ == 'compound':
        trait = doc[token.i - 3: token.i + 1]
    elif doc[token.i - 3].text.lower() in compounds or doc[token.i - 3].lemma_.lower() in compounds:
        trait = doc[token.i - 3: token.i + 1]
    elif doc[token.i - 2].dep_ == 'compound':
        trait = doc[token.i - 2: token.i + 1]
    elif doc[token.i - 2].text.lower() in compounds or doc[token.i - 3].lemma_.lower() in compounds:
        trait = doc[token.i - 2: token.i + 1]
    elif doc[token.i - 1].dep_ == 'compound':
        trait = doc[token.i - 1: token.i + 1]
    elif doc[token.i - 1].text.lower() in compounds or doc[token.i - 3].lemma_.lower() in compounds:
        trait = doc[token.i - 1: token.i + 1]
    else:
        trait = token
    if ','  in trait.lemma_:
        trait = token
    return trait.lemma_    

def check_existance(t, doc):
    
    if t.i + 1 < len(doc) and doc[t.i + 1].lemma_ == '-':
        return None
    # Check prep
    single = next((key for key, value in glossary.items() if t.lemma_.lower() in value), None)
    multi = next((key for key, value in glossary.items() if t.text.lower() in value), None)
    if single:
        return single
    elif multi:
        return multi
    else:
        return None

def text_preparation(species, text):
    
    cleaners = [(r'(?<!\d)\.(?!\d)', ' '),
                (r'\s×\s', ' times '),
                #(r'\xa0', ' '),
                (r'\s+c\s+', ' '),
                (r'â\x80\x93', ' to '),
                (r'\xa0', ' '),
                (r'\x97', ''),
                (r'\s{2,}', ' ')]
    
    species_parts = species.split()
    candidates = [' '.join(species_parts[:idx+1]) for idx, _ in enumerate(species_parts)]
    candidates += [
        f'{species_parts[0][0]}. {species_parts[1]}'
    ]
    candidates.reverse()
    for candidate in candidates:
        try:
            text = re.sub(candidate, 'the species', text)
        except:
            continue # Skip species with brackets for now
    for (cleaner, replacement) in cleaners:
        text = re.sub(cleaner, replacement, text)    
    text = f'{text.strip()}.'
    
    return text.capitalize()


def extract_modifiers(t, doc):
    if t.text.lower() not in compounds:
        if t.dep_ in ['amod', 'nummod', 'appos', 'acl','prep', 'conj']:
            return doc[t.left_edge.i : t.right_edge.i + 1]
    
        
def create_relation(t):
    relation = ''
    if t in measurements or list(set(t.split()) & set(measurements)):
        relation = 'measures'  #'measurement'
    elif t.isdigit():
        relation = 'has number'
    elif color_check(t) or color_check(t.split()[-1]) or color_check(t.split('-')[-1]):
        relation = 'has color'
    else:
        relation = 'is' # Property
        
    return relation
        
def clean_object(t):
    
    #print(chunk_1, chunk_2)
    if t.root.pos_ == 'NOUN' and t.root.lemma_ not in measurements:
        objects = t.root.lemma_
    else:
        if len(t) > 1:
            objects =  t.text
        elif t.root.pos_ == 'VERB':
            objects = t.text
        else:
            objects = t.lemma_

    objects =  re.split(',| and | or | with ', objects)

    return [obj.strip() for obj in objects if obj if obj not in rubbish]

def extract_verb(t, doc):
    if t.dep_  == 'nsubj':
        return next((parent for parent in t.ancestors if parent.pos_ == 'VERB' or parent.pos_ == 'AUX'), None)

def extract_verbal_modifier(t, doc):
    if t.text.lower() not in compounds:
        if child.dep_ in ["acomp", "dobj", "prep",]:
            return doc[child.left_edge.i : child.right_edge.i + 1]    
    
def create_main_triples(part, trait, obj):
    triples = []
    triples.append(('species', 'has main part', part.lower(), text))
    triples.append((part.lower(), f'has part', trait.lower(), text))
    for o in obj:
        rel = create_relation(o)
        triples.append((trait.lower(), rel.lower(), o.lower(), text))
    return triples

def create_sub_triples(sub, obj):
    triples = []
    for o in obj:
        rel = create_relation(o)
        triples.append((sub.lower(), rel.lower(), o.lower(), text))
    return triples

def noun_check(t):
    if t.root.pos_ == 'NOUN' and t.root.lemma_ not in measurements and not color_check(t.root.lemma_):
        return True
    else:
        return False

In [19]:
data = pickle.load(open('../../data/PlantNet/descriptions_raw.pkl', 'rb'))
#data = pickle.load(open('../../data/description/04_TRAIN_0000000-0014557_PLANTS.pkl', 'rb'))

In [20]:
data = {
    'Test Tree':[
        'The short inflorescence of the species.',
    ]
}

In [22]:
descriptions = collections.defaultdict(list)

for species in tqdm_notebook(list(data.keys())[0:1]):
    for idx, text in enumerate(data[species][0:10]):
        #print(text)
        triples = []
        text = text_preparation(species, text)
        doc = nlp(text)
        
        # TEMP ESCAPE
        try:
            if doc[-2].text in rubbish and doc[-3].text in rubbish:
                continue
            spaces = [t for t in doc if t.pos_ == 'SPACE']
            if len(spaces) > 1 and 'species' in doc.text.lower():
                continue
            if doc[-1].text in ['..', '...']:
                continue
        except:
            pass

        
        for t in doc:

            if t.pos_ == 'NOUN' or t.pos_ == 'PROPN' or t.pos_ == 'PRON':
                if t.dep_ == 'compound':
                    continue
                part = check_existance(t, doc)
                if part: 
                    # Trait
                    trait = compound_reconstructor(t, doc)
                    ## ADJs and NOUNs
                    for child in t.children:
                        obj_tok  = extract_modifiers(child, doc)
                        if obj_tok:
                            obj = clean_object(obj_tok)
                            triples += create_main_triples(part, trait, obj)
                            # modifiers of NOUNS
                            if noun_check(obj_tok):
                                for child in obj_tok.root.children:
                                    obj_tok = extract_modifiers(child, doc)
                                    if obj_tok:
                                        obj_new = clean_object(obj_tok)
                                        triples += create_sub_triples(obj[0], obj_new)
                                        
                    ## VERBs
                    verb = extract_verb(t, doc)
                    if verb:
                        for child in verb.children:
                            obj_tok  = extract_verbal_modifier(child, doc)
                            if obj_tok:
                                obj = clean_object(obj_tok)
                                triples += create_main_triples(part, trait, obj)
                            
                        
        print(text)
        print(idx, triples)
        print('\n')
        descriptions[species] += triples

  0%|          | 0/1 [00:00<?, ?it/s]

The short inflorescence of the species.
0 [('species', 'has main part', 'bryophytes', 'The short inflorescence of the species.'), ('bryophytes', 'has part', 'inflorescence', 'The short inflorescence of the species.'), ('inflorescence', 'is', 'short', 'The short inflorescence of the species.'), ('species', 'has main part', 'bryophytes', 'The short inflorescence of the species.'), ('bryophytes', 'has part', 'inflorescence', 'The short inflorescence of the species.'), ('inflorescence', 'is', 'of the species', 'The short inflorescence of the species.')]




In [12]:
with open('../../data/PlantNet/descriptions_triples_raw_with_sentence.pkl', 'wb') as f:
    pickle.dump(descriptions, f)      


descriptions_text = collections.defaultdict(list)
descriptions_RDFs = collections.defaultdict(list)

for species in tqdm_notebook(descriptions.keys()):
    for (sub, rel, obj, original_text) in descriptions[species]:
        text = f'{sub} {rel} {obj}.'.capitalize()
        # Make sure order is the same
        if text not in descriptions_text[species]:
            descriptions_text[species].append(text)
            descriptions_RDFs[species].append((sub, rel, obj, original_text))
            
with open('../../data/PlantNet/descriptions_triples_text_with_sentence.pkl', 'wb') as f:
    pickle.dump(descriptions_text, f)      
    
with open('../../data/PlantNet/descriptions_triples_rdf_with_sentence.pkl', 'wb') as f:
    pickle.dump(descriptions_RDFs, f)   

  0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
with open('../../data/PlantNet/descriptions_triples_raw_v3.pkl', 'wb') as f:
    pickle.dump(descriptions, f)      

In [13]:
descriptions_text = collections.defaultdict(list)
descriptions_RDFs = collections.defaultdict(list)

for species in tqdm_notebook(descriptions.keys()):
    for (sub, rel, obj) in descriptions[species]:
        text = f'{sub} {rel} {obj}.'.capitalize()
        # Make sure order is the same
        if text not in descriptions_text[species]:
            descriptions_text[species].append(text)
            descriptions_RDFs[species].append((sub, rel, obj))

  0%|          | 0/1 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 3)

In [None]:
with open('../../data/PlantNet/descriptions_triples_text_v3.pkl', 'wb') as f:
    pickle.dump(descriptions_text, f)      
    
with open('../../data/PlantNet/descriptions_triples_rdf_v3.pkl', 'wb') as f:
    pickle.dump(descriptions_RDFs, f)   

In [None]:
displacy.render(doc)

In [None]:
len(glossary.keys())

In [None]:
glossary['leaves']