In [1]:
import spacy
import pickle
from spacy import displacy
nlp = spacy.load('en_core_web_trf')
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm as tqdm_notebook
import collections
import re
import pandas as pd
import numpy as np
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from netgraph import Graph
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline

In [2]:
# Names of all CSS4 colours known to matplotlib (iterating the mapping yields its keys).
colors = list(mcolors.CSS4_COLORS)

### Get Plant Glossary List

In [3]:
# Source page: Wikipedia's glossary of plant morphology.
URL = 'https://en.wikipedia.org/wiki/Glossary_of_plant_morphology'
# Download the page; fail loudly on an HTTP error instead of parsing an error page.
page = requests.get(URL, timeout=5)
page.raise_for_status()
# NOTE(review): the page is decoded as ISO-8859-1 on purpose; the 'â' checks below
# presumably match the mis-decoded dash that separates a term from its definition — confirm.
soup = BeautifulSoup(page.content, "lxml", from_encoding="iso-8859-1")

# Maps a lower-cased chapter title to the list of lower-cased terms found in it.
glossary = collections.defaultdict(list)
# Every H4 heading is treated as one glossary chapter.
for chapter in soup.find_all('h4'):
    # Remove the trailing "[edit]" link text. str.rstrip('[edit]') would strip any
    # trailing run of the characters "[", "e", "d", "i", "t", "]" and thereby
    # truncate titles such as "Fruit", so the literal suffix is removed instead.
    chapter_text = re.sub(r'\[edit\]$', '', chapter.text)
    # Walk over everything that follows the heading on the same level.
    for sibling in chapter.find_next_siblings():
        # Iterating the closest preceding H4 yields that heading's child nodes.
        for parent in sibling.find_previous_sibling('h4'):
            # Only keep siblings whose closest heading is the current chapter.
            if parent.text == chapter_text:
                if 'â' in sibling.text:
                    for tag in sibling.find_all('li'):
                        # The term is the text before the separator; '/' separates synonyms.
                        candidates = tag.text.split('â')[0]
                        candidates = candidates.split('/')
                        for candidate in candidates:
                            glossary[chapter_text.lower()].append(candidate.strip().lower())

### Extend the glossary with additional terms

In [4]:
# Hand-picked terms that the scraped glossary is extended with.
# NOTE: `+=` appends, so re-running this cell adds the terms again.
glossary['leaves'] += [
    'glume',
    'surface',
    'margin',
    'rhachilla',
]

glossary['basic flower parts'] += [
    'floret',
]

glossary['inflorescences'] += [
    'spikelets',
    'lemma',
    'racemes',
    'axis',
]

# A comma was missing between the two entries, which made Python concatenate
# them into the single bogus term 'apexculms'.
glossary['other'] += [
    'apex',
    'culms',
]

### Load the data

In [None]:
# Load the pickled descriptions; the `with` block closes the file handle again.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open('../data/description/04_TRAIN_0000000-0014557_PLANTS.pkl', 'rb') as pkl_file:
    data_unsorted = pickle.load(pkl_file)

# Same mapping, ordered by the length of each value, longest first.
data_sorted = dict(
    sorted(data_unsorted.items(), key=lambda item: len(item[1]), reverse=True)
)

In [6]:
#data[species]

### Define Functions

### Extract data

In [8]:
# Triples per (species, description index); written by create_idx_triples.
descriptions_id = collections.defaultdict(list)
# Triples per species for the description being processed; written by create_triple.
descriptions = collections.defaultdict(list)

# Plotting: node names that get the "part" styling in the graph cells further down.
parts = list()

# Modifier words that are merged into the name of the following part
# instead of being reported as separate adjectives.
compounds = [
    'upper',
    'lower',
    'apex',
    'dorsal',
    'central',
    'lateral',
    'fertile',
    'sterile',
    'male',
    'female',
]

def compound_reconstructor(token, doc, modifiers=None):
    """Return the lemma of ``token``, widened to the compound name it belongs to.

    The up to three tokens in front of ``token`` are inspected, widest window
    first. The first one that is tagged ``compound`` or whose text or lemma is
    a known modifier marks the start of the name, and the lemma of the span
    from there up to and including ``token`` is returned.

    Args:
        token: token to reconstruct the name for (needs ``.i`` and ``.lemma_``).
        doc: the document ``token`` belongs to; indexed by position and sliced.
        modifiers: collection of lower-case modifier words. Defaults to the
            module-level ``compounds`` list.

    Returns:
        The ``lemma_`` of the matching span, or of ``token`` itself when no
        preceding token qualifies.
    """
    if modifiers is None:
        modifiers = compounds
    for offset in (3, 2, 1):
        start = token.i - offset
        # Skip windows that would start before the document; a negative index
        # would silently wrap around to the END of the document.
        if start < 0:
            continue
        head = doc[start]
        # Text and lemma are checked at the SAME offset as the dependency label
        # (the lemma check used to look at offset 3 for every window).
        if (head.dep_ == 'compound'
                or head.text.lower() in modifiers
                or head.lemma_.lower() in modifiers):
            return doc[start: token.i + 1].lemma_
    return token.lemma_

def check_existance(t, doc):
    """Return the glossary chapter that lists ``t``, or None.

    The lower-cased lemma is looked up first; the lower-cased surface text is
    the fallback. ``doc`` is accepted for signature symmetry and not used.
    """
    def _first_chapter_listing(term):
        # First glossary key whose term list contains ``term``.
        for chapter, terms in glossary.items():
            if term in terms:
                return chapter
        return None

    by_lemma = _first_chapter_listing(t.lemma_.lower())
    by_text = _first_chapter_listing(t.text.lower())
    return by_lemma or by_text or None

def extract_advmod(t, doc):
    """Return the first child of ``t`` with dependency ``advmod``, or None."""
    return next((kid for kid in t.children if kid.dep_ == 'advmod'), None)

def extract_conjunction(t, doc):
    """Return ``t`` when it is a conjoined adjective (dep ``conj``, pos ``ADJ``), else None."""
    is_conjoined_adjective = t.dep_ == 'conj' and t.pos_ == 'ADJ'
    return t if is_conjoined_adjective else None
    
def extract_measurements(t, doc):
    """Return the subtree span of ``t`` when it is a size word, else None.

    A size word is a token whose text or lemma is 'wide' or 'long'; the span
    runs from the token's left edge to its right edge (inclusive).
    """
    size_words = ('wide', 'long')
    if t.text not in size_words and t.lemma_ not in size_words:
        return None
    return doc[t.left_edge.i : t.right_edge.i + 1]

def extract_prepositions(t, doc):
    """Return the subtree span of the first ``prep`` child of ``t``, or None."""
    first_prep = next((kid for kid in t.children if kid.dep_ == 'prep'), None)
    if first_prep is None:
        return None
    return doc[first_prep.left_edge.i : first_prep.right_edge.i + 1]


def define_position(x, y, doc):
    """Combine ``x`` and ``y`` into one piece of text in document order.

    Args:
        x: a token (has ``.i``) or a span-like object (indexable, its items have ``.i``).
        y: a token (has ``.i``).
        doc: the document both belong to.

    Returns:
        When ``x`` is a token: the slice of ``doc`` from the earlier of the two
        tokens to the later one (inclusive). Otherwise a new document produced
        by ``nlp`` from the two texts joined with a space, the one that comes
        first in the document first.
    """
    # Explicit type dispatch instead of a bare `except:`, which would also
    # swallow unrelated errors (including KeyboardInterrupt).
    if hasattr(x, 'i'):
        first, last = sorted((x.i, y.i))
        return doc[first : last + 1]
    # x has no single index; compare the position of its last token instead.
    if x[-1].i > y.i:
        return nlp(f'{y} {x}')
    return nlp(f'{x} {y}')

def extract_adjectives(t, doc):
    """Collect the adjective-like modifiers attached to ``t``.

    A child qualifies when it is an adjective, or a past participle (``VBN``)
    attached as ``conj``/``amod``. Appositions and words listed in ``compounds``
    are skipped. Each qualifying child contributes one entry, in this order of
    preference: adverb + adjective, measurement phrase, adjective + preposition
    phrase, plain adjective. Conjoined adjectives in its subtree are added too.

    Returns:
        A list of tokens/spans/documents, in the order they were found.
    """
    found = []
    for child in t.children:
        if child.dep_ == 'appos':
            continue
        # `and` binds tighter than `or`: ANY adjective qualifies, a participle
        # only with a conj/amod dependency.
        is_adjective = child.pos_ == 'ADJ'
        is_participle = child.tag_ == 'VBN' and child.dep_ in ['conj', 'amod']
        if not (is_adjective or is_participle):
            continue
        # Compound modifiers are part of the trait name, not a property of it.
        if child.lemma_.lower() in compounds or child.text.lower() in compounds:
            continue

        advmod = extract_advmod(child, doc)
        measurement = extract_measurements(child, doc)
        prep = extract_prepositions(child, doc)
        if advmod:
            found.append(doc[advmod.i : child.i + 1])
        elif measurement:
            found.append(measurement)
        elif prep:
            found.append(define_position(prep, child, doc))
        else:
            found.append(child)

        # Conjoined adjectives anywhere below (and including) this child.
        for descendant in child.subtree:
            conj = extract_conjunction(descendant, doc)
            if not conj:
                continue
            conj_advmod = extract_advmod(conj, doc)
            conj_prep = extract_prepositions(conj, doc)
            if conj_advmod:
                found.append(define_position(conj_advmod, conj, doc))
            elif conj_prep:
                found.append(define_position(conj_prep, conj, doc))
            else:
                found.append(conj)
    return found

def extract_appos_mods(t, doc):
    """Return the subtree span of every appositional (``appos``) child of ``t``."""
    return [
        doc[kid.left_edge.i : kid.right_edge.i + 1]
        for kid in t.children
        if kid.dep_ == 'appos'
    ]


def extract_verb_preps(t, doc):
    """Return the first child of ``t`` with dependency ``prep``, or None."""
    return next((kid for kid in t.children if kid.dep_ == 'prep'), None)

def extract_verb_subj(t, doc):
    """Return the first child of ``t`` with dependency ``pobj``, or None."""
    return next((kid for kid in t.children if kid.dep_ == 'pobj'), None)
        
def extract_verb_dobj(t, doc):
    """Return the first child of ``t`` with dependency ``dobj``, or None."""
    return next((kid for kid in t.children if kid.dep_ == 'dobj'), None)
def extract_verb_orpd(t, doc):
    """Return the first child of ``t`` with dependency ``oprd``, or None."""
    return next((kid for kid in t.children if kid.dep_ == 'oprd'), None)
        
def extract_verb_agnt(t, doc):
    """Return the first child of ``t`` with dependency ``agent``, or None."""
    return next((kid for kid in t.children if kid.dep_ == 'agent'), None)
        
def extract_verbs(t, doc):
    """Derive a (relation, target) pair from the verbs around ``t``.

    If the document has a verbal ROOT, that verb decides the result:
    ``"<verb> <prep>"`` plus the preposition's ``pobj`` child when the verb has
    a ``prep`` child, otherwise the placeholder relation ``'TBD'`` plus the
    root itself.

    Without a verbal ROOT the participle children of ``t`` are inspected:
      * ``VBG``: ``prep`` child -> ``"<verb> <prep>"`` + its ``pobj``;
        else ``dobj`` or ``oprd`` child -> ``"<verb>"`` + that child.
      * ``VBN``: ``prep`` child -> ``"<verb> <prep>"`` + its ``pobj``;
        else ``agent`` child -> ``"<verb> <agent>"`` + its ``pobj``.
    The loop does not stop at the first hit, so the last matching child wins.

    Returns:
        Tuple ``(rel, sub)``; both are None when nothing matched, and ``sub``
        may be None when the preposition/agent has no ``pobj`` child.
    """
    rel = sub = None
    root = next((tok for tok in doc if tok.dep_ == 'ROOT' and tok.pos_ == 'VERB'), None)
    if root:
        prep = extract_verb_preps(root, doc)
        if prep:
            rel = root.text + ' ' + prep.text
            sub = extract_verb_subj(prep, doc)
        else:
            rel = 'TBD'
            sub = root
        return rel, sub

    for child in t.children:
        if child.tag_ == 'VBG':
            prep = extract_verb_preps(child, doc)
            dobj = extract_verb_dobj(child, doc)
            oprd = extract_verb_orpd(child, doc)
            if prep:
                rel = child.text + ' ' + prep.text
                sub = extract_verb_subj(prep, doc)
            elif dobj:
                rel = child.text
                sub = dobj
            elif oprd:
                rel = child.text
                sub = oprd
        elif child.tag_ == 'VBN':
            prep = extract_verb_preps(child, doc)
            agnt = extract_verb_agnt(child, doc)
            if prep:
                rel = child.text + ' ' + prep.text
                sub = extract_verb_subj(prep, doc)
            elif agnt:
                rel = child.text + ' ' + agnt.text
                sub = extract_verb_subj(agnt, doc)

    return rel, sub

       
def create_triple(species, obj, rel, subj):
    """Append one statement about ``species`` to the module-level ``descriptions``.

    * ``rel`` given: store ``(obj, rel, subj)``.
    * no ``rel`` and both ``obj`` and ``subj`` are plain strings: store the
      part hierarchy as two triples ('has main parts' / 'has sub parts').
    * otherwise: store ``(obj, 'TBD', subj)`` for later resolution.
    """
    bucket = descriptions[species]
    if rel:
        bucket.append((obj, rel, subj))
        return
    both_plain_strings = type(obj) == str and type(subj) == str
    if both_plain_strings:
        bucket.append(('species', 'has main parts', obj))
        bucket.append((obj, 'has sub parts', subj))
    else:
        bucket.append((obj, 'TBD', subj))
        
def create_idx_triples(idx, data, species):
    """Store ``data[species]`` in the module-level ``descriptions_id`` under ``(species, idx)``."""
    key = (species, idx)
    descriptions_id[key] = data[species]
        
        
# Parse descriptions with spaCy and turn them into triples via the helpers above.
# NOTE: both slices are [0:1], so as written only the first description of the
# first species is processed.
for species in tqdm_notebook(list(data_unsorted.keys())[0:1]):
    for idx, text in enumerate(data_unsorted[species][0:1]):
        # Clean the text: every '.' that has no digit directly before it and no
        # digit directly after it becomes a space; then one final '.' is appended.
        text = re.sub(r'(?<!\d)\.(?!\d)', ' ', text)
        text = f'{text.strip()}.'
        # Rebind the module-level dict that create_triple appends to, so every
        # description starts with an empty set of triples.
        descriptions = collections.defaultdict(list)

        # Reset variables
        part=trait=rel=subj=conjs=subj_adj = None 

        doc = nlp(text)
        # Loop over tokens
        for t in doc:
            # Compound tokens are picked up by compound_reconstructor via their head.
            if t.dep_ == 'compound':
                continue
            ### SUBJECTS ###    
            elif t.dep_ == 'nsubj' and t.pos_ == 'NOUN':
                # Check existence of the part in the glossary
                part = check_existance(t, doc)
                if part:
                    # Reconstruct compounds & append (glossary chapter -> trait name)
                    trait = compound_reconstructor(t, doc)
                    create_triple(species, part, None, trait)
                    # Extract trait adjectives
                    trait_adjs = extract_adjectives(t, doc)
                    for adj in trait_adjs:
                        create_triple(species, trait, None, adj)
                    # Extract verbs; rel and obj may both be None when nothing matched
                    rel, obj = extract_verbs(t, doc)
                    create_triple(species, trait, rel, obj)
                    # Adjectives of the verb's target
                    if obj:
                        object_adjs = extract_adjectives(obj, doc)
                        for adj in object_adjs: 
                            create_triple(species, obj, None, adj)


            ### ROOTS ###
            elif t.dep_ == 'ROOT' and t.pos_ == 'NOUN':
                part = check_existance(t, doc)
                if part:
                    # Reconstruct compounds & append (glossary chapter -> trait name)
                    trait = compound_reconstructor(t, doc)
                    create_triple(species, part, None, trait)
                    # Extract trait adjectives
                    trait_adjs = extract_adjectives(t, doc)
                    for adj in trait_adjs:
                        create_triple(species, trait, None, adj)
                    # Extract appositional modifiers
                    appos_l = extract_appos_mods(t, doc)
                    for subj in appos_l:
                        create_triple(species, trait, None, subj)
                    # Extract verbs; rel and obj may both be None when nothing matched
                    rel, obj = extract_verbs(t, doc)
                    create_triple(species, trait, rel, obj)
                    # Adjectives and verbs of the verb's target
                    if obj:
                        # Adjectives
                        object_adjs = extract_adjectives(obj, doc)
                        for adj in object_adjs: 
                            create_triple(species, obj, None, adj)
                        # Verbs attached to the target itself (one level deeper)
                        rel_v, obj_v = extract_verbs(obj, doc)
                        create_triple(species, obj, rel_v, obj_v)
                        
        # Store this description's triples under (species, idx).
        create_idx_triples(idx, descriptions, species)
        
        #print(doc)
        #for tr in descriptions[species]:
        #    print(tr)
        #print('\n')

  0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
#descriptions_id

In [22]:
# species -> list of (head, relation, tail) triples
RDF_dict = collections.defaultdict(list)
# species -> list of the same triples rendered as short sentences
RDF_dict_text = collections.defaultdict(list)

def relations(t):
    """Pick the relation word that links a part to the token ``t``.

    Returns:
        'has' when ``t`` is a noun, except for length units (a part *is* 3 mm,
        it does not *have* mm); 'is' for everything else.
    """
    # 'milimeter' is kept for backward compatibility; it was a misspelling of
    # 'millimeter', which therefore was never recognised as a unit.
    distance = ['mm', 'cm', 'm',
                'millimeter', 'milimeter', 'centimeter', 'meter']
    if t.pos_ == 'NOUN' and t.lemma_ not in distance:
        return 'has'
    return 'is'

def subjects(t):
    """Return the text to use for ``t``: the surface form for verbs and nouns, the lemma otherwise."""
    keep_surface_form = t.pos_ in ['VERB', 'NOUN']
    return t.text if keep_surface_form else t.lemma_


# Normalise the raw triples of every (species, description) pair into RDF_dict.
# The handlers below catch AttributeError only: a bare `except:` would also hide
# unrelated failures and swallow KeyboardInterrupt inside this long-running loop.
for (species, idx) in tqdm_notebook(descriptions_id.keys()):
    data = descriptions_id[(species, idx)]
    for (obj, rel, sub) in data:
        # Already clean: all three members are plain strings
        if type(obj) == str and type(rel) == str and type(sub) == str:
            RDF_dict[species].append((obj, rel, sub))
        # Placeholder relation: derive the relation from the tail itself
        elif type(rel) == str and rel == 'TBD' and sub:
            # Number of whitespace-separated words in the tail
            length = len(sub.text.split())
            # If singular
            if length == 1:
                # Token first; fall back to the first token of a one-word span
                # NOTE(review): presumably a span lacks the token-level attributes
                # used by relations()/subjects() — confirm.
                try:
                    RDF_dict[species].append((obj, relations(sub), subjects(sub)))
                except AttributeError:
                    RDF_dict[species].append((obj, relations(sub[0]), subjects(sub[0])))
            else:
                if ',' in sub.text:
                    # Comma-separated enumeration: one triple per item
                    spans = sub.text.split(',')
                    for span in spans:
                        # Catch empty strings
                        if span:
                            doc = nlp(span.strip())
                            RDF_dict[species].append((obj, relations(doc[0]), doc.text))
                else:
                    RDF_dict[species].append((obj, relations(sub[0]), sub.text))

        # Relation given as a non-string object
        elif type(rel) != str and rel:
            length = len(sub.text.split())
            # If singular
            if length == 1:
                # Token first; fall back to the first token of a one-word span
                try:
                    RDF_dict[species].append((obj, rel.text, subjects(sub)))
                except AttributeError:
                    RDF_dict[species].append((obj, rel.text, subjects(sub[0])))
            else:
                # Unhandled multi-word tail: report it so that it can be inspected
                print( length, obj, sub, rel)
        elif not sub:
            continue
        else:
            # obj is either an object exposing .text or already a plain string
            try:
                RDF_dict[species].append((obj.text, rel, subjects(sub)))
            except AttributeError:
                RDF_dict[species].append((obj, rel, subjects(sub)))

        
def _as_text(item):
    """Return ``item`` unchanged when it is a plain string, otherwise its ``.text``."""
    return item if type(item) == str else item.text


# Remove duplicate triples per species. dict.fromkeys keeps the first occurrence
# and the original order; list(set(...)) produced a different order on every
# run, which made the pickled sentences non-reproducible.
for species in RDF_dict.keys():
    RDF_dict[species] = list(dict.fromkeys(RDF_dict[species]))

# Convert every member of every triple to plain text and render each triple as
# a capitalised one-sentence string.
for species in RDF_dict.keys():
    temp = []
    for (subj, rel, obj) in RDF_dict[species]:
        subj, rel, obj = _as_text(subj), _as_text(rel), _as_text(obj)
        temp.append((subj, rel, obj))
        RDF_dict_text[species].append(f'{subj} {rel} {obj}.'.capitalize())
    RDF_dict[species] = temp


  0%|          | 0/75668 [00:00<?, ?it/s]

In [23]:
# Persist the sentence form of the triples for later use.
with open('../data/processed/RDF_text_plants_2000.pkl', 'wb') as out_file:
    pickle.dump(RDF_dict_text, out_file)

In [None]:
# NOTE(review): this resets the sentences built earlier (they were pickled in the
# previous cell); kept because the original cell does the same — confirm it is wanted.
RDF_dict_text = collections.defaultdict(list)

# Column buffers for the edge list of the knowledge graph.
source   = []
relation = []
target   = []

# Only the first five species are plotted to keep the graph readable.
for species in list(RDF_dict.keys())[0:5]:
    for (head, edge, tail) in RDF_dict[species]:
        # Defensive: make sure every member is plain text.
        if type(head) != str:
            head = head.text
        if type(edge) != str:
            edge = edge.text
        if type(tail) != str:
            # Used to read `_4.text`, an undefined name that raised NameError
            # as soon as a non-string tail was encountered.
            tail = tail.text

        source.append(head)
        relation.append(edge)
        target.append(tail)

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relation})

In [None]:
# Reload the descriptions to get the species names; `with` closes the file again.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open('../data/description/04_TRAIN_0000000-0014557_PLANTS.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)
species = list(data.keys())
baseparts = list(glossary.keys())

# Edge list as (source, target) pairs plus the same graph as a networkx object.
nodes = [(source, target) for source, target in zip(kg_df['source'].values, kg_df['target'].values)]
G=nx.from_pandas_edgelist(kg_df, "source", "target", 
                          edge_attr=True, create_using=nx.Graph())

nodes

In [None]:
# Quick look at the raw edge list with netgraph's default styling.
plt.figure(figsize=(20, 20))
Graph(nodes)

In [None]:
# Reload the descriptions to get the species names; `with` closes the file again.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open('../data/description/04_TRAIN_0000000-0014557_PLANTS.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)
species = list(data.keys())
baseparts = list(glossary.keys())

nodes = [(source, target) for source, target in zip(kg_df['source'].values, kg_df['target'].values)]
G=nx.from_pandas_edgelist(kg_df, "source", "target", 
                          edge_attr=True, create_using=nx.Graph())
# Every node is labelled with its own name.
node_labels = {node : node for node in G}
node_size = {}
node_color = {}
node_labels_large = {}

# (source, target) -> relation text shown on the edge.
edge_labels = dict(zip(list(zip(kg_df.source, kg_df.target)),
                  kg_df['edge'].tolist()))

# Sets make the per-node membership tests constant time instead of a list scan.
_species_names = set(species)
_basepart_names = set(baseparts)

# Style by node category: species > glossary chapter > part > everything else.
for node in node_labels:
    if node in _species_names:
        node_size[node] = 2.2
        node_color[node] = 'blue'
        node_labels_large[node] = node
    elif node in _basepart_names:
        node_size[node] = 2
        node_color[node] = 'green'
        node_labels_large[node] = node
    elif node in parts:
        node_size[node] = 1.5
        node_color[node] = 'black'
        node_labels_large[node] = ''
    else:
        node_size[node] = .8
        node_color[node] = 'white'
        node_labels_large[node] = ''



In [None]:
# Draw the knowledge graph with per-node sizes/colours and labelled edges.
plt.figure(figsize=(20, 20))
graph_style = dict(
    node_layout='spring',
    edge_layout='straight',
    arrows=True,
    node_labels=node_labels,
    node_label_offset=(0.00, -0.035),
    node_label_fontdict=dict(size=8, rotation=30, ha='right'),
    node_size=node_size,
    node_color=node_color,
    edge_labels=edge_labels,
    edge_label_fontdict=dict(size=7),
    edge_width=0.4,
)
Graph(nodes, **graph_style)

#plt.savefig('plot.pdf', format='pdf', dpi=1200, bbox_inches='tight')


In [None]:
# Display the node -> size mapping as a quick sanity check.
node_size

In [None]:
import random
import string
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
# create random graph with hubs and leafs
# Random demo graph: ten hubs of degree 5-14 plus one degree-1 leaf per hub stub.
hubs = np.random.randint(5, 15, size=10)
leafs = np.ones((np.sum(hubs)), dtype=int)
degrees = np.concatenate([hubs, leafs])
g = nx.configuration_model(degrees, create_using=nx.Graph)
# connected_components yields the components in no particular order, so the
# largest one has to be selected explicitly; next(...) merely took the first.
giant_component = max(nx.connected_components(g), key=len)
h = g.subgraph(giant_component)

# generate random labels
def random_string(length):
    """Return ``length`` random lower-case ASCII letters as one string."""
    # https://stackoverflow.com/a/2030081/2912349
    picked = []
    for _ in range(length):
        picked.append(random.choice(string.ascii_lowercase))
    return ''.join(picked)

# One random five-letter label per node of the plotted subgraph.
labels = {}
for node in h:
    labels[node] = random_string(5)

# Node sizes follow the degree in the full graph: scaled by 100 for networkx,
# square-rooted for netgraph.
node_size = {node: degree for node, degree in g.degree}
nx_node_size = np.array([100 * node_size[node] for node in h])
ng_node_size = {}
for node, size in node_size.items():
    ng_node_size[node] = np.sqrt(size)

# Fixed positions so that different renderings stay comparable.
node_layout = nx.spring_layout(h)

# plot
# Render the demo graph with netgraph on an explicit axis.
fig, ax = plt.subplots(figsize=(20, 12))

Graph(
    h,
    node_layout=node_layout,
    node_size=ng_node_size,
    node_label_offset=0.1,
    arrows=True,
    ax=ax,
)
plt.show()