In [2]:
import numpy as np
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import re
import collections
import glob
from itertools import islice

import sys
import time
sys.path.insert(0, '../../src/models/')
sys.path.insert(0, '../../src/features/')

from build_features import similarity_matrix, name_cleaner, yield_chunks
from transformers import DistilBertTokenizer, DistilBertModel

%matplotlib inline

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Load data

In [5]:
root = "../../data/processed/"
datadict = {}
# Collect every processed PLANTS description pickle. glob returns paths in
# arbitrary order, so sort them: the merge order decides which file wins when
# two pickles share a key, and it should be the same on every run.
data_files = sorted(glob.glob(root + 'descriptions*PLANTS.pkl'))
for data_file in data_files:
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # produced by this project's own processing step.
    # The context manager closes the file handle as soon as the dict is read.
    with open(data_file, 'rb') as handle:
        datadict.update(pickle.load(handle))

In [19]:
#datadict['Adansonia grandidieri']

In [7]:
import spacy
import re
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans
from pathlib import Path
# Load the spaCy "en_core_web_trf" pipeline once; the resulting `nlp` callable
# is what the cells below use to parse text.
nlp = spacy.load("en_core_web_trf")


In [20]:
# Each entry stored for this species is a 2-item pair; keep the first item
# (the sentence text) and discard the second (its meaning is not visible here).
sents = [sent for (sent, _) in datadict['Adansonia grandidieri']]

In [21]:
sents

['The branches are regularly distributed, and mainly horizontal.',
 'Large, oblong-ovoid to almost globose, with fragile, 2.5-4 mm thick wall, reddish brown hairy, many-seeded.',
 'Arranged spirally, palmately compound, with 9-11 leaflets, stipules up to 2 mm long, caducous, petiole 5-13 cm long, pubescent, petiolules 1-5 mm long, leaflets narrowly ellip-tical to lanceolate, medial ones 6-12 long 1.5-3 cm wide, margin entire, bluish green, densely hairy with short, clumped, yellowish hairs.',
 'The tree develops a reddish-grey coloured and rather slender, bottle shaped trunk with a wide cantilevered crown.',
 'Adansonia grandidieri is a deciduous, medium-sized, unarmed tree and may reach 30m in height.',
 'The crown is flat-topped.',
 'Flowers are solitary and produced in leaf axils at the tips of leafless branches, they are bisexual, regular, 5-merous, large, showy and fragrant, flower bud erect, ovoid, dark brown, pedicel up to 1.5 cm long and 1 cm in diameter, dark brown hairy, join

In [99]:
def yield_chunks(sentence, pipeline=None):
    """Split a sentence into short clauses and keep those built around a verb.

    The sentence is cut at every comma and at every standalone word "and".
    Each piece is parsed, and a piece is kept when at least one of its noun
    chunks hangs off a verb or auxiliary (i.e. the piece is a real clause,
    not a bare list of adjectives).

    NOTE: this definition shadows the ``yield_chunks`` imported from
    ``build_features`` at the top of the notebook.

    Parameters
    ----------
    sentence : str
        Raw sentence text.
    pipeline : callable, optional
        Parser to use; called with a string, it must return an object exposing
        ``.text`` and ``.noun_chunks`` (spaCy ``Doc`` interface). Defaults to
        the notebook-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        The kept pieces, each terminated by a single full stop, with runs of
        spaces collapsed, surrounding whitespace removed, and
        ``str.capitalize`` applied (first letter upper-case, rest lower-case).
    """
    parser = nlp if pipeline is None else pipeline
    verb_like = ('VERB', 'AUX')

    chunks = []
    # \b keeps words that merely contain "and" (sand, band, ...) intact.
    for segment in re.split(r',|\band\b', sentence):
        if not segment.strip():
            # Nothing to parse between two adjacent separators.
            continue
        doc = parser(segment)
        # Append the piece once, however many of its noun chunks attach to a verb.
        if any(noun_chunk.root.head.pos_ in verb_like
               for noun_chunk in doc.noun_chunks):
            chunks.append(f'{doc.text}.')

    # Tidy up: collapse repeated spaces and repeated full stops.
    chunks = [re.sub(' +', ' ', piece) for piece in chunks]
    chunks = [re.sub(r'\.+', '.', piece).strip().capitalize() for piece in chunks]

    return chunks


In [100]:
#cleaned = [yield_subjects(sent) for sent in tqdm(sents)]

In [101]:
# Try the chunker on the first sentence; the returned list is shown as the cell output.
yield_chunks(sents[0])

['The branches are regularly distributed.']

In [54]:
# Parse the last sentence and visualise it with displaCy
# (default style — presumably the dependency tree; confirm against the spaCy docs).
displacy.render(nlp(sents[-1]))

In [None]:
# Disabled earlier draft: the triple-quoted string below only holds the source
# text of a `yield_subjects` helper. Because it is a string literal, nothing in
# it is defined or executed when this cell runs.
# NOTE(review): consider deleting this cell or moving the draft to version control.
''' 
def yield_subjects(sentence):
    chunks = []

    doc = nlp(sentence)
    # Find Normal subject
    n_subject = [token for token in doc if token.dep_ == 'nsubj' 
                                        or token.pos_ == 'PROPN'
                                        or token.pos_ == 'NOUN' and token.dep_ == 'ROOT']


    #sentence_split = sentence.split(',')
    
    sentence_split = re.split(',|and ', sentence)

    
    # If no subject is found
    if n_subject == []:
        for sent in sentence_split:
            sent = nlp(sent)
            # Try to find new normal subject
            n_subject = [token for token in sent if token.dep_ == 'nsubj' or token.pos_ == 'PROPN']
            #print(n_subject)
            # If still no normal subject use species
            if n_subject == []:

                # Find Root
                ROOT = [token for token in sent if token.dep_ == 'ROOT']
                #print(ROOT[0], ROOT[0].pos_, ROOT[0].dep_)
                if ROOT[0].pos_ == 'NOUN':
                    # If NOUN 'have'
                    chunks.append(f'{sent.text}.') ## Species
                else:
                    chunks.append(f'{sent.text}.') ## Species
            else:
                chunks.append(f'{sent}.')
    elif len(n_subject) == 1:
        for sent in sentence_split:
            sent = nlp(sent)
            if n_subject[0].text in sent.text:
                chunks.append(f'{sent}.')
            else:
                # Find Root
                ROOT = [token for token in sent if token.dep_ == 'ROOT']
                if ROOT[0].pos_ == 'NOUN':
                    # Replace determiners or pronouns
                    if n_subject[0].pos_ == 'DET':
                        chunks.append(f'subject have {sent}.')
                    else:
                        chunks.append(f'{n_subject[0]} have {sent}.')
                elif ROOT[0].pos_ == 'ADJ':
                    # Replace determiners or pronouns
                    if n_subject[0].pos_ == 'DET':
                        chunks.append(f'subject be {sent}.')
                    else:
                        chunks.append(f'{n_subject[0]} be {sent}.')
                else:
                    # Replace determiners or pronouns
                    if n_subject[0].pos_ == 'DET':
                        chunks.append(f'subject have {sent}.')
                    else:
                        chunks.append(f'{n_subject[0]} be {sent}.')
                    
    else:
        for sent in sentence_split:
        # Return normal stuff
            chunks.append(f'{sent}.')
        
    # Split some rubbish
    chunks = [re.sub(' +', ' ', chunk) for chunk in chunks]
    chunks = [re.sub('\.+', '.', chunk).strip().capitalize() for chunk in chunks]

    return chunks

'''