In [1]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import requests
import random
import pickle
import re
import spacy
from spacy import displacy
import collections
from collections import Counter
from tqdm.notebook import tqdm as tqdm_notebook


# Shared spaCy pipeline; every cell below parses sentences through this object.
nlp = spacy.load('en_core_web_trf')

In [2]:
# Load the glossary: a dict whose values are lists of accepted terms
# (it is indexed by key and membership-tested in `check_existance`).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../../data/glossaries/FNA_glossary.pkl', 'rb') as glossary_file:
    glossary = pickle.load(glossary_file)
# Also accept the lemma 'leave' for the 'leaf' entry
# (presumably because "leaves" is sometimes lemmatised that way -- TODO confirm).
glossary['leaf'] += ['leave']

In [3]:
#glossary_FNA['Flower']

In [4]:
# Lemmas treated as part of a compound name rather than as a property:
# the extract_* helpers below attach them to the noun (e.g. "upper leaf")
# and exclude them from the amod/advmod/appos results.
# NOTE(review): 'outer', 'dorsal' and 'central' are listed twice; harmless for
# the membership tests used in this notebook.
compound_list = [
    'fertile', 'sterile',
    'male', 'female', 'bisexual', 'hermaphroditic', 
    'basal', 'developed', 
    'primary', 'secondary', 'main',
    'upper', 'lower', 'greater', 'dorsal', 'alternate', 'lesser', 'apex', 'outer',
    'central', 'outermost', 'outer', 'inner', 'uppermost', 'median', 'dorsal', 'central', 'lateral',
    'young', 'mature', 'individual', 
    'opposite', 
]

# Punctuation-only / empty strings.
# NOTE(review): not referenced anywhere in the visible part of the notebook.
rubbish_list = [
    '.', ',', '-', '..', '...', '', 
]


In [5]:
#data = pickle.load(open('../../data/PlantNet/descriptions_raw.pkl', 'rb'))
# Load the descriptions (a dict keyed by species, each value a sequence of
# sentences -- see how `data[species]` is sliced and iterated further down).
# The context manager makes sure the file handle is closed again.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../../data/description/04_TRAIN_0000000-0014557_PLANTS.pkl', 'rb') as descriptions_file:
    data = pickle.load(descriptions_file)

In [6]:
# Units and size words; modifiers whose lemma is in this list are not
# reported as properties by the extract_* helpers.
measurements_list = [
    'mm', 'cm', 'm', 'km',
    'millimeter', 'centimeter', 'meter', 'kilometer',
    'millimetre', 'centimetre', 'metre', 'kilometre',
    # Original (misspelled) entries, kept so nothing that matched before stops matching.
    'milimeter', 'milimetre',
    'inch', 'foot', 'yard', 'mile',
    'wide', 'long', 'broad', 'tall',
    'length', 'form',
]

In [7]:
# Small hand-written sample used to try out the extraction rules.
# NOTE(review): this rebinds `data`, shadowing the pickle loaded earlier until
# it is loaded again further down the notebook.
data = {
    'Test Tree':[
        'light, slightly dark, upper axial leaves with 7 veins.',
        'Bark up to 20 to 30 cm.',
        'Stems often 5-veined.',
        'Branches light-dark reddish brown and greenish purple or black.'
    ]
}

In [None]:
def text_preparation(species, text):
    """Normalise one raw description sentence before parsing.

    Every mention of the species name (full name, any leading prefix of its
    words, and the abbreviated ``G. epithet`` form) is replaced by
    ``'the species'``. A fixed list of regex cleaners is then applied, the
    result is stripped, terminated with a single ``'.'`` and capitalised.

    Args:
        species: Species name, words separated by whitespace.
        text: Raw sentence.

    Returns:
        The cleaned sentence. ``str.capitalize`` upper-cases the first
        character and lower-cases everything else.
    """
    cleaners = [(r'(?<!\d)\.(?!\d)', ' '),   # dots that are not decimal points
                (r'\s×\s', ' times '),
                #(r'\xa0', ' '),
                (r'\s+c\s+', ' '),           # stand-alone "c"
                (r'â\x80\x93', ' to '),      # mis-decoded en dash (UTF-8 read as Latin-1)
                (r'\xa0', ' '),              # non-breaking space
                (r'\x97', ''),
                (r'\s{2,}', ' '),
                # NOTE(review): this also removes the decimal points that the
                # first cleaner deliberately preserved -- confirm this is intended.
                (r'\.', ' ')]

    species_parts = species.split()
    candidates = [' '.join(species_parts[:end]) for end in range(1, len(species_parts) + 1)]
    # The abbreviated form needs a genus and an epithet; a single-word name
    # used to raise IndexError here.
    if len(species_parts) > 1:
        candidates.append(f'{species_parts[0][0]}. {species_parts[1]}')
    # Try the abbreviation and the longest names first so that a shorter
    # prefix cannot consume part of a longer match.
    candidates.reverse()
    for candidate in candidates:
        # Names are literal text, not patterns: escaping keeps brackets and
        # dots from being interpreted as regex syntax (or from raising).
        text = re.sub(re.escape(candidate), 'the species', text)
    for (cleaner, replacement) in cleaners:
        text = re.sub(cleaner, replacement, text)
    text = f'{text.strip()}.'
    return text.capitalize()

def is_float(val):
    """Return True if ``float(val)`` succeeds, otherwise False.

    Values that ``float`` cannot handle at all (``None``, lists, integers too
    large for a float) yield False instead of raising.
    """
    try:
        float(val)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def check_existance(t, doc):
    """Return the glossary key whose term list contains the token's lemma.

    A token is rejected (``None``) when its lemma is in ``compound_list``,
    when it is a ``compound`` dependent, when it is not a ``NOUN``, or when
    it has no children. If several glossary entries contain the lemma, the
    last one in iteration order is returned. ``doc`` is not used.
    """
    rejected = (
        t.lemma_ in compound_list
        or t.dep_ == 'compound'
        or t.pos_ != 'NOUN'
        or not list(t.children)
    )
    if rejected:
        return None

    lemma = t.lemma_.lower().strip()
    found = None
    for mainpart, terms in glossary.items():
        if lemma in terms:
            found = mainpart
    return found


def extract_compounds(t, doc):
    """Yield the direct children of ``t`` that belong to its compound name.

    A child qualifies when its dependency label is ``compound`` or its lemma
    is listed in ``compound_list``. ``doc`` is not used.
    """
    yield from (
        child
        for child in t.children
        if child.dep_ == 'compound' or child.lemma_ in compound_list
    )

def extract_amods(t, doc):
    """Yield adjectival modifiers below ``t``, depth first.

    A child is yielded when it is an ``amod`` whose lemma is in neither
    ``measurements_list`` nor ``compound_list``, or when it is a ``conj``
    (without any lemma filtering). Each yielded child is then searched
    recursively by the same rule. ``doc`` is only passed along.
    """
    for child in t.children:
        plain_amod = (
            child.dep_ == 'amod'
            and child.lemma_ not in measurements_list
            and child.lemma_ not in compound_list
        )
        # `and` binds tighter than `or`: conjuncts bypass the lemma filters.
        if not (plain_amod or child.dep_ == 'conj'):
            continue
        yield child
        yield from extract_amods(child, doc)

def extract_advmods(t, doc):
    """Yield the direct ``advmod`` children of ``t``.

    Children whose lemma is in ``measurements_list`` or ``compound_list`` are
    skipped. No recursion is performed; ``doc`` is not used.
    """
    for child in t.children:
        if child.dep_ != 'advmod':
            continue
        if child.lemma_ in measurements_list or child.lemma_ in compound_list:
            continue
        yield child

def extract_numbers(t, doc):
    """Yield numeric modifiers below ``t``, depth first.

    A child is yielded when it is a ``nummod``, or when it is a ``quantmod``
    tagged ``NUM``. Each yielded child is searched recursively by the same
    rule. ``doc`` is only passed along.
    """
    for child in t.children:
        numeric_quantmod = child.dep_ == 'quantmod' and child.pos_ == 'NUM'
        if child.dep_ == 'nummod' or numeric_quantmod:
            yield child
            yield from extract_numbers(child, doc)

def extract_apmods(t, doc):
    """Yield appositional modifiers of ``t`` together with their numbers.

    A direct child is yielded when it is an ``appos`` whose lemma is in
    neither ``measurements_list`` nor ``compound_list``, or when it is a
    ``conj`` (without lemma filtering). After each such child, everything
    ``extract_numbers`` finds below it is yielded as well.
    """
    for child in t.children:
        plain_appos = (
            child.dep_ == 'appos'
            and child.lemma_ not in measurements_list
            and child.lemma_ not in compound_list
        )
        # `and` binds tighter than `or`: conjuncts bypass the lemma filters.
        if not (plain_appos or child.dep_ == 'conj'):
            continue
        yield child
        yield from extract_numbers(child, doc)
            
def clean_triples(triples):
    """Normalise every element of every ``(subject, relation, object)`` triple.

    Each string is stripped of surrounding whitespace, lower-cased, and then
    stripped of leading/trailing hyphens (in that order). Returns a new list
    of tuples.
    """
    def _tidy(value):
        return value.strip().lower().strip('-')

    return [(_tidy(sub), _tidy(rel), _tidy(obj)) for (sub, rel, obj) in triples]
            
            
def extract_triples(doc):
    """Build knowledge-graph triples for every glossary noun in ``doc``.

    For each token accepted by ``check_existance`` the following are added:
    the ``species -> part`` link, the main part and a chain of compound
    sub-parts, adjectival properties (with their adverbial modifiers), a
    ``<lemma>_quantity`` node holding numeric modifiers, and appositional
    properties. The result is passed through ``clean_triples``.
    """
    triples = []

    for token in doc:
        part = check_existance(token, doc)
        if not part:
            continue

        triples.append(('species', 'has_part', part))

        # Main part followed by progressively longer compound names; every
        # new name is prefixed to the previous one.
        chain = [(part, 'has_main_part', token.lemma_)]
        for compound in extract_compounds(token, doc):
            parent = chain[-1][2]
            chain.append((parent, 'has_sub_part', f'{compound} {parent}'))
        triples.extend(chain)

        # All properties hang off the most specific name in the chain.
        subject = chain[-1][-1]

        # Adjectival modifiers and the adverbs that qualify them.
        for amod in extract_amods(token, doc):
            triples.append((subject, 'has_property', amod.lemma_))
            for advmod in extract_advmods(amod, doc):
                triples.append((amod.lemma_, 'has_modifier', advmod.lemma_))

        # Numbers attached directly to the noun.
        numbers = list(extract_numbers(token, doc))
        if numbers:
            print(numbers)  # debug output, emitted once per noun that has numbers
            quantity_node = f'{token.lemma_}_quantity'
            triples.append((subject, 'has_property', quantity_node))
            for number in numbers:
                triples.append((quantity_node, 'has_property', number.lemma_))

        # Appositional modifiers.
        for apmod in extract_apmods(token, doc):
            triples.append((subject, 'has_property', apmod.lemma_))

    return clean_triples(triples)
            
    

In [None]:
# Species names, in the dict's insertion order.
species_list = list(data.keys())

In [None]:
# Extract triples from the first ten sentences of the first species.
kn_data = []
species = species_list[0]
for sent in data[species][0:10]:

    text = text_preparation(species, sent)

    # Parse the cleaned sentence; previously the raw `sent` was parsed and
    # the output of text_preparation was silently discarded.
    doc = nlp(text)
    kn_data += extract_triples(doc)

In [None]:
# Visualise the dependency parse of the last sentence processed above.
displacy.render(doc)

In [8]:
# Re-load the full descriptions (this rebinds `data`); the context manager
# closes the file handle again.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../../data/description/04_TRAIN_0000000-0014557_PLANTS.pkl', 'rb') as descriptions_file:
    data = pickle.load(descriptions_file)

In [9]:
# Rebuild the species list from the re-loaded data and pick the first entry.
species_list = list(data.keys())
species = species_list[0]


In [84]:
# Parse one raw sentence (index 3 of the selected species) and show its
# dependency tree. NOTE(review): the sentence is not passed through
# text_preparation here.
doc = nlp(data[species][3])
displacy.render(doc)

In [85]:
def dict_sentence(t):
    """Return the subtree below ``t`` as nested dicts.

    Each child of ``t`` becomes a key whose value is the same structure built
    for that child; a node without children maps to an empty dict.
    """
    return {child: dict_sentence(child) for child in t.children}

def undict_to_tuples(d, acc = ()):
    """Yield every root-to-leaf path of a nested dict as a tuple of keys.

    ``acc`` is the path collected so far; an empty dict marks a leaf and
    yields the accumulated path.
    """
    if d == {}:
        yield acc
        return
    for key, subtree in d.items():
        yield from undict_to_tuples(subtree, acc + (key,))
    
def extract_compounds(t, doc):
    """Return the spans of ``doc`` that name ``t`` with growing compounds.

    If ``t`` is itself a ``compound`` dependent, its first ancestor is used
    instead. Start positions are the token's own index plus the index of
    every child that is a ``compound`` or whose lemma is in
    ``compound_list``; they are visited from highest to lowest, and each
    span runs from the start position up to and including the token.
    """
    if t.dep_ == 'compound':
        t = next(t.ancestors)

    end = t.i + 1
    starts = [
        child.i
        for child in t.children
        if child.dep_ == 'compound' or child.lemma_ in compound_list
    ]
    starts.append(t.i)

    return [doc[start : end] for start in sorted(starts, reverse=True)]
            
            
def check_existance(t):
    """Return the glossary key whose term list contains the token's lemma.

    Tokens whose lemma is in ``compound_list`` or that are not tagged
    ``NOUN`` give ``None``. If several glossary entries contain the lemma,
    the last one in iteration order is returned.
    """
    if t.lemma_ in compound_list or t.pos_ != 'NOUN':
        return None

    lemma = t.lemma_.lower().strip()
    matches = [mainpart for mainpart, terms in glossary.items() if lemma in terms]
    return matches[-1] if matches else None

def extract_information(t, double):
    """Turn a path of tokens into a chain of triples.

    Starting from ``t``, each token in ``double`` is linked to the previous
    one. The relation is ``has_property`` for an ``amod``, ``has_number``
    for a ``NUM``, ``has_modifier`` when the previous token is an ``amod``,
    and ``other`` in all remaining cases.
    """
    triples = []
    current = t
    for info in double:
        if info.dep_ == 'amod':
            relation = 'has_property'
        elif info.pos_ == 'NUM':
            relation = 'has_number'
        elif current.dep_ == 'amod':
            relation = 'has_modifier'
        else:
            relation = 'other'
        triples.append((current.lemma_, relation, info.lemma_))
        # The token just handled becomes the subject of the next link.
        current = info
    return triples

def extract_triples(doc):
    """Collect part / sub-part triples for ``doc`` and print them.

    For every token accepted by ``check_existance`` a
    ``species -> has_main_part -> part`` triple is added, followed by a
    ``has_sub_part`` chain through the spans from ``extract_compounds``.
    The collected list is printed; nothing is returned.
    """
    triples = []
    for token in doc:
        part = check_existance(token)
        if not part:
            continue

        triples.append(('species', 'has_main_part', part))
        parent = part
        for compound in extract_compounds(token, doc):
            triples.append((parent, 'has_sub_part', compound))
            parent = compound

        # The token's subtree is walked path by path; the per-path extraction
        # is not wired in yet, so the paths are currently discarded.
        for _path in undict_to_tuples(dict_sentence(token)):
            pass
    print(triples)

In [86]:
# Run the extractor defined in the previous cell; it prints the triples it collected.
extract_triples(doc)

[('species', 'has_main_part', 'spikelet'), ('spikelet', 'has_sub_part', spikelets), (spikelets, 'has_sub_part', Fertile spikelets)]


# VIZ

In [None]:
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from netgraph import Graph
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [None]:
#descriptions

In [None]:
# Split the (subject, relation, object) triples into edge-list columns.
source   = [sub for (sub, _rel, _obj) in kn_data]
relation = [rel for (_sub, rel, _obj) in kn_data]
target   = [obj for (_sub, _rel, obj) in kn_data]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relation})

In [None]:
# Show the extracted triples.
kn_data

In [None]:
# Despite its name, `nodes` holds the edges: one (source, target) pair per row of kg_df.
nodes = list(zip(kg_df['source'].values, kg_df['target'].values))
G = nx.from_pandas_edgelist(
    kg_df, "source", "target",
    edge_attr=True, create_using=nx.Graph(),
)

# Every node is labelled with its own name; edges are labelled with their relation.
node_labels = {node: node for node in G}
edge_labels = {
    (src, dst): edge
    for src, dst, edge in zip(kg_df.source, kg_df.target, kg_df['edge'].tolist())
}

# The 'species' root is drawn larger and dark green, everything else small and white.
size = 1.5
node_size = {}
node_color = {}
for node in node_labels:
    is_root = node == 'species'
    node_size[node] = (3.5 if is_root else 1.) / size
    node_color[node] = 'darkgreen' if is_root else 'white'

# Fixed seed so the layout is reproducible between runs.
pos = nx.spring_layout(G, k = 0.08, iterations=5000, seed=3, scale=0.5, center=(0,0), dim=2)

In [None]:
# Draw the knowledge graph with netgraph on a large canvas, using the
# precomputed spring layout `pos` and the per-node size/colour dicts.
# Edge labels are currently switched off (see the commented-out arguments).
fig, ax = plt.subplots(figsize=(25, 25))
Graph(nodes, 
      #node_layout='spring', edge_layout='curved', 
      node_layout=pos, edge_layout='straight', 
      arrows=True, node_zorder=3, #edge_zorder=1,
      node_labels=node_labels, 
      node_label_offset=0.02, 
      #edge_labels=edge_labels,
      node_label_fontdict=dict(size=18, rotation=0, ha='center', clip_on=False), node_edge_width=0.2,
      node_size=node_size,  node_color=node_color, #edge_labels=edge_labels,
      edge_width=0.2, edge_label_fontdict=dict(size=10,),
      #node_layout_kwargs=dict(node_size=1, total_iterations=20),
      ax=ax)

In [None]:
def retokenize_doubles(candidates, doc):
    """Merge compound modifiers of the subject into multi-token spans.

    Args:
        candidates: Sequence of two-element items; the second element of the
            first item is taken as the subject. The elements are used like
            spaCy tokens (``.i``, ``.dep_``, ``.lemma_``, ``.text``).
        doc: The document the tokens belong to; sliced to build the spans.

    Returns:
        A list of two-element lists. When compounds are found, items whose
        first element is the subject point at the compound span instead,
        items whose second element's text is contained in the first element's
        text are dropped, and one ``[parent, compound_span]`` item per
        compound is inserted after the first item.

    Note:
        The list of candidates is printed on every call (debug output).
    """
    
    # NOTE(review): `retok_list` is never used.
    retok_list = []
    # Create list to change stuff in place
    candidate_list = [list(item) for item in candidates]
    # Get Subject
    # NOTE(review): raises IndexError when `candidates` is empty.
    subject = candidate_list[0][1]
    
    
    print(candidate_list)
    
    ## Compounds
    # Find possible compounds: items attached to the subject or to a compound token
    compound_candidates = [item for item in candidate_list[1:] if item[0] == subject or item[0].dep_ == 'compound']
    
    #print(compound_candidates)
    
    # Extract compounds: keep those that are compound dependents or listed in compound_list
    compounds = [amod for (_, amod) in compound_candidates 
                 if amod.lemma_ in compound_list or amod.dep_ == 'compound']
    
    #print(compounds)
    #for compound in compounds:
        #print(compound)
        #compounds += [additional[1] for additional in candidate_list[1:] if additional[0] ==  compound]
    
    if compounds:
        # Leftmost compound that precedes the subject.
        # NOTE(review): min() raises ValueError when no compound precedes the subject.
        compound_idx = min([compound.i for compound in compounds if compound.i < subject.i])
        compound = doc[compound_idx : subject.i + 1]
        # Replace in place (not faster but more convenient)
        for idx, (noun, info) in enumerate(candidate_list[1:]):
            if noun == subject:
                candidate_list[idx + 1] = [compound, info]
        # Remove compound amods: drop items whose info text is already part of the span text
        compounds_removed = [[information[0], information[1]] for information in candidate_list[1:]
                             if information[1].text not in information[0].text]
        candidate_list = candidate_list[0:1] + compounds_removed
        # Insert one item per compound after the first item; the first links the
        # bare subject to its span, later ones link the previous span to the next.
        for idx, token in enumerate(reversed(compounds)):
            compound = doc[token.i : subject.i + 1]
            #print(compound, idx)
            if idx == 0:
                candidate_list.insert(idx + 1, [subject, compound])
            else:
                candidate_list.insert(idx + 1, [previous_compound, compound])
            previous_compound = compound
        
    
    #print(compounds)
    return candidate_list
    
