In [1]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import requests
import random
import pickle
import re

import spacy
import json
from spacy import displacy
import collections
from collections import Counter
from collections import OrderedDict
from tqdm.notebook import tqdm as tqdm_notebook
from itertools import islice

nlp = spacy.load('en_core_web_trf')

In [2]:
items = pickle.load(open('../../data/glossaries/birds.pkl', 'rb'))

In [3]:
items += [
    'underpart', 
    'coloration'
]

In [4]:
glossary = {'Main': items}

In [5]:
compound_list = [
    'fertile', 'sterile',
    'male', 'female', 'bisexual', 'hermaphroditic', 
    'basal', 'developed', 
    'primary', 'secondary', 'main',
    'upper', 'lower', 'greater', 'dorsal', 'alternate', 'lesser', 'apex', 'outer',
    'central', 'outermost', 'outer', 'inner', 'uppermost', 'median', 'dorsal', 'central', 'lateral',
    'young', 'mature', 'individual', 
    'opposite', 'single', 'paired', 'malar'
]

rubbish_list = [
    '.', ',', '-', '..', '...', '', 
]


In [6]:
#data = pickle.load(open('../../data/PlantNet/descriptions_raw.pkl', 'rb'))
data = pickle.load(open('../../data/description/04_TRAIN_0000000-0002000_BIRDS.pkl', 'rb'))


data['Cape Starling'] = [
    "Fairly large, short-tailed glossy starling with rather uniform appearance.",
    "Head is glossy blue, blacker on ear-coverts.",
    "upperparts blue-green with strong iridescence",
    "wing blue-green, dark blue spots at tips of some median and greater coverts.",
    "distinct bronzy-purple epaulet, primaries P6-P9 strongly indented on inner webs",
    "tail glossy blue-green."
    "throat and upper breast have blue iridescence."
    "lower breast, belly and undertail-coverts with greener gloss."
    "iris bright orange-yellow.",
    "bill and legs black.", 
    "Sexes alike.",
    "Juvenile is dull-plumaged, with matt black underparts, iris initially grey.",
    "at three months iris dull yellow, acquiring adult colour after six months",
]

In [7]:
# Open the text file
location = "../../data/external/birds_matched.txt"
with open(location) as f:
    lines = f.readlines()

BOW_list = []
for line in lines[1:]:
    line = line.strip()
    _, _, BOW = line.split(';')
    BOW_list.append(BOW)  

In [8]:
measurements_list = [
    'mm', 'cm', 'm', 'km',
    'milimeter', 'centimeter', 'meter', 'kilometer',
    'milimetre', 'centimetre', 'metre', 'kilometre',
    'inch', 'foot', 'yard', 'mile',
    'wide', 'long', 'broad', 'tall',
    'length', 'form',
]

In [14]:
def text_preparation(species, text):
    cleaners = [(r'(?<!\d)\.(?!\d)', ' '),
                (r'\s×\s', ' times '),
                (r'\s+c\s+', ' '),
                (r'â\x80\x93', ' to '),
                (r'\xa0', ' '),
                (r'\x97', ''),
                (r'\s{2,}', ' '),
                (r'(\D)(\.)', r'\1 '),
                (r'(\d)(\.)(\D)', r'\1 \3'),
                (r'(long,)', r'long and'),
                (r'(wide,)', r'wide and'),
               ]
    
    species_parts = species.split()
    candidates = [' '.join(species_parts[:idx+1]) for idx, _ in enumerate(species_parts)]
    #candidates += [
    #        f'{species_parts[0][0]}. {species_parts[1]}'
    #    ]
    candidates.reverse()
    for candidate in candidates:
        try:
            text = re.sub(candidate, 'the species', text)
        except:
            continue # Skip species with brackets for now
    for (cleaner, replacement) in cleaners:
        text = re.sub(cleaner, replacement, text)    
    text = f'{text.strip()}.'
    return text.capitalize()


def dict_sentence(t):
    sentence_dict = {}
    for child in t.children:
        # Exceptions
        if (
            child.dep_ not in [
                'det', 'cc', 'punct', 
                'poss', 
                'nmod', # Gives to many errors
            ]
            and child.pos_ not in [
                'DET', 'PUNCT',
                'PART'
            ]
        ):
            items = dict_sentence(child)
            sentence_dict[child] = items 
    return sentence_dict

def undict_to_tuples(d, acc = []):
    if d == {}:
        yield acc
    else:
        for k, v in d.items():
            yield from undict_to_tuples(v, acc + [k,])
            
def undict_to_pairs(d):
    for k,v in d.items():
        for subk in v:
            yield (k, subk)
        yield from undict_to_pairs(v)

def dict_sentence_parent(t):
    if t.dep_ in [
        'nsubj', 'nsubjpass', 'relcl',
    ]:
        parent = next(tok for tok in t.ancestors)
        parent_dict = dict_sentence(parent)
        del parent_dict[t]
        return parent_dict

def update_nested_dict(main_dict, new_dict):
    for name, rc_dict in new_dict.items():
        main_dict.setdefault(name, {}).update(rc_dict)
    return main_dict       

def extract_compounds(t, doc):
    head = None
    if t.dep_ == 'compound':
        t = next(t.ancestors)
    indices = [child.i for child in t.children
               if child.dep_ == 'compound'
               or child.lemma_ in compound_list
               and child.i < t.i]
    indices.append(t.i)
    indices.sort(reverse=True)
    compounds  = []
    for idx in indices:
        compounds.append(doc[idx : t.i + 1])
    return compounds    
            
            
def check_existance(t):
    item = None
    for mainpart in glossary.keys():
        if t.lemma_ in compound_list:
            item = None
        #elif t.pos_ != 'NOUN':
        elif t.pos_ not in ['NOUN', 'PROPN']:
            item = None
        elif t.lemma_.lower().strip() in glossary[mainpart]:
            item = mainpart            
    return item


def clean_compounds(item_list, doc):
    new_item_list = []
    new_item_list.append(item_list[0])
    for item in item_list[1:]:
        if item.dep_ == 'prep':
            new_item_list += item_list[1:]
            break
        if (
            item.lemma_ in compound_list 
            or item.dep_ == 'compound'
        ):
            continue
        elif item.pos_ == 'NOUN':
            compound = extract_compounds(item, doc)[-1]
            if len(compound) == 1:
                compound = compound.root
            new_item_list.append(compound)
        else:
            new_item_list.append(item)
    return new_item_list

def clean_measurements(info_list):
    new_item_list = []
    for item_list in info_list:
        
        nums = [t.dep_ for t in item_list if type(t) == spacy.tokens.token.Token if t.pos_ == 'NUM']
        if len(nums) > 1:
            temp = []
            for item, future in zip(item_list, item_list[1:]):        
                if type(item) == spacy.tokens.span.Span:
                    temp.append(item)
                elif item.pos_ == 'NUM' and future.pos_ == 'NUM':
                    new_item_list.append(temp + [item])
                    new_item_list.append(temp + [future])
                else:
                    temp.append(item)
        else:
            new_item_list.append(item_list)
    return new_item_list
        
    
def clean_conjunctions(info_list):
    new_info_lists = []
    for info in info_list:
        ccs = [t for t in info[1:] 
               if t.dep_ == 'conj'
               and t.pos_ != 'NOUN'] # Leave here?
        #print(ccs)
        if ccs:
            used = []
            for cc in ccs:
                used.append(cc)
                t = cc
                while t.dep_ == 'conj':
                    t = t.head
                try:
                    idx = info.index(t, 1)
                except: # When Idx == 0
                    idx = 0
                new_info_lists.append(info[ : idx] + [cc])
            used = set(used)
            left = [t for t in info if t not in used]
            new_info_lists.append(left)
        else:
            new_info_lists.append(info)
    return new_info_lists
        
    

def create_relation(item_list):
    
    subjects = item_list[:-1]
    objects  = item_list[1:]
    relation = None
    triples = []
    
    for sub, obj in zip(subjects, objects):
        relation = 'temp'
        triples.append((sub, relation, obj))

    return triples


def AIKE(info):
    aikes_list = []
    try:
        if len(info) <= 1:
            pass
        else:
            try:
                second_pos = info[1].pos_
                second_dep = info[1].dep_
            except:
                second_pos = info[1].root.pos_
                second_dep = info[1].root.dep_
            if second_pos == 'ADJ':
                aikes_list.append(f'{info[0].lemma_} {info[1].lemma_}')
            elif second_pos == 'NOUN':
                aikes_list.append(f'{info[0].lemma_} has {info[1].lemma_}')
            elif second_dep == 'prep':
                aikes_list.append(f'{info[0].lemma_} {info[1].lemma_} {info[2].lemma_}')
            elif second_pos == 'VERB':
                aikes_list.append(f'{info[0].lemma_} {info[1]} {info[2].lemma_}')
            else:
                pass
    except:
        pass
    return aikes_list
    

def extract_triples(doc):
    
    
    # Speed up the extraction
    previous = []
    AIKE_list = []
    triples = []
    for t in doc:
        part = check_existance(t)
        #print(part, t)
        if part:
            # Create temp list for storing compounds
            compounds_temp = []
            compounds_temp.append(('species', 'has_main_part', part.capitalize()))
            
            compound = part.capitalize()
            for new_compound in extract_compounds(t, doc):
                if type(compound) == str:
                    compounds_temp.append((compound, 'has_sub_part', new_compound.lemma_))
                else:
                    compounds_temp.append((compound.lemma_, 'has_sub_part', new_compound.lemma_))
                compound = new_compound
            # Reset T
            t = compound.root
            # Get child dict
            child_dict = {compound: dict_sentence(t)}
            # Get parent dict
            parent_dict = {compound: dict_sentence_parent(t)}
            # Update if exists
            if parent_dict[compound]:
                # Add dicts together
                sentence_dict = update_nested_dict(child_dict, parent_dict)
                #print(sentence_dict)
            else:
                sentence_dict = child_dict
            # List dict into tuples
            info_lists = list(undict_to_tuples(sentence_dict))
            #print('start', t, info_lists)
            info_lists = clean_conjunctions(info_lists)
            info_lists = clean_measurements(info_lists)

            #print(t, info_lists)
            for info in info_lists:
                # Skip no info
                if len(info) == 1:
                    continue
                info = clean_compounds(info, doc)
                triples.extend(compounds_temp)
                AIKE_list.extend(AIKE(info))
                triples.extend(create_relation(info))
            
    return list(set(AIKE_list))
    return list(dict.fromkeys(triples))

In [15]:
species_list = BOW_list
species = species_list[0:]

In [16]:
aike_dict = collections.defaultdict(list)

for bird in tqdm_notebook(species):
    kn_data = []
    for text in data[bird][0:]:
        text = text_preparation(bird, text)
        doc = nlp(text)
        #if len(doc) > 20:
        #    continue
        kn_data.extend(extract_triples(doc))
    aike_dict[bird] = kn_data

  0%|          | 0/200 [00:00<?, ?it/s]

In [17]:
aike_dict

defaultdict(list,
            {'House Sparrow': ['throat grayish',
              'throat whitish',
              'tarsus has mass',
              'tail has tarsus',
              'wing has tail',
              'plumage marking black',
              'wing bar white',
              'plumage marking white',
              'underpart has belly',
              'flank has foreneck',
              'undertail covert has chest',
              'belly has undertail covert',
              'breast has upper front part',
              'crown has crown',
              'plumage has breeding',
              'crown gray',
              'crown has nape',
              'wing has crown',
              'rump has hindneck',
              'back has rump',
              'winter plumage new',
              'beak short',
              'beak thick',
              'dimorphism sexual',
              'beak has leg',
              'beak gray',
              'beak of sex',
              'flange red',
              'fla

# VIZ

In [None]:
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
from netgraph import Graph
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [None]:
#descriptions

In [None]:
source   = []
relation = []
target   = []


for (sub, rel, obj) in kn_data:
    if type(sub) != str:
        sub = sub.lemma_
    if type(obj) != str:
        obj = obj.lemma_
    if sub ==  obj:
        continue
    source.append(sub)
    relation.append(rel)
    target.append(obj)

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relation})

In [None]:
#kn_data

In [None]:
nodes = [(source, target) for source, target in zip(kg_df['source'].values, kg_df['target'].values)]
G=nx.from_pandas_edgelist(kg_df, "source", "target", 
                          edge_attr=True, create_using=nx.Graph())


node_labels = {node : node for idx, node in enumerate(G)}
edge_labels = dict(zip(list(zip(kg_df.source, kg_df.target)),
                  kg_df['edge'].tolist()))

node_size = {}
node_color = {}

size = 1.5

for node in node_labels:
    if node == 'species':
        node_size[node] = 3.5/size
        node_color[node] = 'darkgreen'
    elif node[0].isupper():
        node_size[node] = 2/size
        node_color[node] = 'white'
    else:
        node_size[node] = 1./size
        node_color[node] = 'white'
        
pos = nx.spring_layout(G, k = 0.08, iterations=5000, seed=3, scale=0.3, center=(0,0), dim=2)

In [None]:
fig, ax = plt.subplots(figsize=(25, 25))
Graph(nodes, 
      #node_layout='spring', edge_layout='curved', 
      #node_layout=pos, edge_layout='straight', 
      node_layout='spring', edge_layout='straight',
      arrows=True, node_zorder=3, #edge_zorder=1,
      node_labels=node_labels, 
      node_label_offset=0.02, 
      #edge_labels=edge_labels,
      node_label_fontdict=dict(size=18, rotation=0, ha='center', clip_on=False), node_edge_width=0.2,
      node_size=node_size,  node_color=node_color, #edge_labels=edge_labels,
      edge_width=0.2, edge_label_fontdict=dict(size=10,),
      #node_layout_kwargs=dict(node_size=1, total_iterations=20),
      ax=ax)

In [None]:
doc = nlp('legs')

In [None]:
displacy.render(doc)

In [None]:
len(doc)

In [None]:
doc[0].pos_