In [3]:
import numpy as np
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import re
import collections
import glob
from itertools import islice

import sys
import time
sys.path.insert(0, '../src/models/')
sys.path.insert(0, '../src/features/')

from build_features import similarity_matrix, name_cleaner, yield_chunks
from transformers import DistilBertTokenizer, DistilBertModel

%matplotlib inline

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Load data

In [4]:
root = "../data/processed/"
datadict = {}
# Load the pickle list
data_files = glob.glob(root+ 'descriptions*PLANTS.pkl')
for data_file in data_files:
    # Open the dict and update
    datadict.update(pickle.load(open(data_file, 'rb')))

#### Drop double Sentences

In [None]:
data_dissimilar = collections.defaultdict(list)

# Drop double sentences
for key, values in tqdm(datadict.items()):
    # Drop URLS
    values = [sent for (sent, URL) in values]
    # Get similarity matrix
    matrix = similarity_matrix(values)
    # Extract indices with threshold
    indices = np.transpose((matrix>0.99).nonzero())
    # Get doubles
    if len(indices) > 1:
        doubles = [values[idx_y] for (idx_x, idx_y) in indices if idx_x != idx_y]
        # drop last half of list
        doubles = doubles[len(doubles)//2:]
        # sentences non double
        sents_nodouble = [sent for sent in values if sent not in doubles]
        # Replace data
        data_dissimilar[key] = sents_nodouble

#### Create single chunks

In [None]:
# Init dict
data = collections.defaultdict(list)

for key, values, in tqdm(datadict.items()):
#for key, values, in tqdm(data_dissimilar.items()):
    
    #### REMOVE THIS IN FUTURE ####
    # Drop URLS
    values = [sent for (sent, URL) in values]
    #### REMOVE THIS IN FUTURE ####
    
    # Remove species names
    sents_noname = [name_cleaner(key, sent, 'the species') for sent in values]
    # Cut into chunks
    for sent in sents_noname:
        #data[key].append(yield_chunks(sent))
        chunks = yield_chunks(sent)
        for chunk in chunks:
            data[key].append(chunk)


In [None]:
# Dump pickle into file
with open('../data/processed/TEST_TaxonomistDescriptions_PLANTS.pkl', 'wb') as f:
    pickle.dump(data, f)

In [3]:
data = pickle.load(open('../data/processed/TEST_TaxonomistDescriptions_PLANTS.pkl', 'rb'))

## Visualisation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

##### Descriptions per Species

In [None]:
# Change this to the dissimilar in future
counts = [len(sents) for key, sents in data.items()]

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
sns.set_style('whitegrid')
ax = sns.histplot(counts)
ax.set_title('Histogram')
plt.xlim(0, 300)

##### Sentence Similarity

In [None]:
sents = data['Abies araucana']
#max_key, max_value = max(data.items(), key = lambda x: len(set(x[1])))

In [None]:
sents

In [None]:
matrix = similarity_matrix(sents)

In [None]:
mask = np.zeros_like(matrix)
mask[np.triu_indices_from(mask)] = True

#plt.figure(figsize=(16, 16),)
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.heatmap(matrix, 
                 linewidths=.5, 
                 square=True, annot=True, 
                 cmap='Spectral_r', 
                 mask=mask, 
                 cbar=False)
ax.set_title('Similarity Matrix')
#fig.savefig('similarity_matrix.svg', format='svg', dpi=1200)

In [None]:
print(sents[7])
print(sents[2])

In [5]:
import spacy
import re
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans
nlp = spacy.load("en_core_web_trf")


In [6]:
sents = [sent for (sent, _) in datadict['Adansonia situla']]

In [7]:
sents

['Hard, dark brown to black, kidney-shaped.',
 'Large , heavy white and sweetly scented that hang down on long stalks.',
 'The bark is greyish brown and normally smooth but can often be variously folded and seamed from years of growth.',
 'Palmately compound and clustered at the ends of short, stocky branches.',
 'The main stem of larger baobab trees may reach enormous proportions of up to 28 m in girth usually squat cylindrical trunk gives rise to thick tapering branches resembling a root-system, which is why it has often been referred to as the upside-down tree.',
 'The flowers are 10-20 cm across and have waxy crinkled petals about 10 cm long that surround dense clusters of purple stamens that look like powder puffs.',
 'The flowers fall within 24 hours, turning brown and smelling quite unpleasant.',
 "African baobab. is one of the plant kingdom's strangest tree, it is not particularly tall growing but the trunk is massive: as much as 11 m in diameter and shaped like a bottle.",
 'T

In [49]:
def yield_subjects(sentence):
    chunks = []

    doc = nlp(sentence)
    # Find Normal subject
    n_subject = [token for token in doc if token.dep_ == 'nsubj' 
                                        or token.pos_ == 'PROPN'
                                        or token.pos_ == 'NOUN' and token.dep_ == 'ROOT']


    #sentence_split = sentence.split(',')
    
    sentence_split = re.split(',|and ', sentence)

    
    # If no subject is found
    if n_subject == []:
        for sent in sentence_split:
            sent = nlp(sent)
            # Try to find new normal subject
            n_subject = [token for token in sent if token.dep_ == 'nsubj' or token.pos_ == 'PROPN']
            #print(n_subject)
            # If still no normal subject use species
            if n_subject == []:

                # Find Root
                ROOT = [token for token in sent if token.dep_ == 'ROOT']
                #print(ROOT[0], ROOT[0].pos_, ROOT[0].dep_)
                if ROOT[0].pos_ == 'NOUN':
                    # If NOUN 'have'
                    chunks.append(f'species have {sent.text}.')
                else:
                    chunks.append(f'species be {sent.text}.')
            else:
                chunks.append(f'{sent}.')
    elif len(n_subject) == 1:
        for sent in sentence_split:
            sent = nlp(sent)
            if n_subject[0].text in sent.text:
                chunks.append(f'{sent}.')
            else:
                # Find Root
                ROOT = [token for token in sent if token.dep_ == 'ROOT']
                if ROOT[0].pos_ == 'NOUN':
                    # Replace determiners or pronouns
                    if n_subject[0].pos_ == 'DET':
                        chunks.append(f'subject have {sent}.')
                    else:
                        chunks.append(f'{n_subject[0]} have {sent}.')
                elif ROOT[0].pos_ == 'ADJ':
                    # Replace determiners or pronouns
                    if n_subject[0].pos_ == 'DET':
                        chunks.append(f'subject be {sent}.')
                    else:
                        chunks.append(f'{n_subject[0]} be {sent}.')
                else:
                    # Replace determiners or pronouns
                    if n_subject[0].pos_ == 'DET':
                        chunks.append(f'subject have {sent}.')
                    else:
                        chunks.append(f'{n_subject[0]} be {sent}.')
                    
    else:
        for sent in sentence_split:
        # Return normal stuff
            chunks.append(f'{sent}.')
        
    # Split some rubbish
    chunks = [re.sub(' +', ' ', chunk) for chunk in chunks]
    chunks = [re.sub('\.+', '.', chunk).strip().capitalize() for chunk in chunks]

    return chunks


In [50]:
cleaned = [yield_subjects(sent) for sent in tqdm(sents)]


  0%|                                                 | 0/11 [00:00<?, ?it/s][A
  9%|███▋                                     | 1/11 [00:00<00:02,  4.20it/s][A
 18%|███████▍                                 | 2/11 [00:00<00:02,  3.92it/s][A
 27%|███████████▏                             | 3/11 [00:00<00:02,  3.60it/s][A
 36%|██████████████▉                          | 4/11 [00:01<00:01,  3.70it/s][A
 45%|██████████████████▋                      | 5/11 [00:01<00:01,  4.26it/s][A
 55%|██████████████████████▎                  | 6/11 [00:01<00:00,  5.03it/s][A
 64%|██████████████████████████               | 7/11 [00:01<00:00,  4.65it/s][A
 73%|█████████████████████████████▊           | 8/11 [00:01<00:00,  5.13it/s][A
 82%|█████████████████████████████████▌       | 9/11 [00:01<00:00,  5.43it/s][A
 91%|████████████████████████████████████▎   | 10/11 [00:02<00:00,  5.32it/s][A
100%|████████████████████████████████████████| 11/11 [00:02<00:00,  4.73it/s][A


In [51]:
cleaned

[['Species be hard.',
  'Species be dark brown to black.',
  'Species be kidney-shaped.'],
 ['Subject be large .',
  'Subject have heavy white .',
  'Sweetly scented that hang down on long stalks.'],
 ['The bark is greyish brown .',
  'Bark be normally smooth but can often be variously folded .',
  'Bark be seamed from years of growth.'],
 ['Palmately compound .',
  'Clustered at the ends of short.',
  'Species have stocky branches.'],
 ['The main stem of larger baobab trees may reach enormous proportions of up to 28 m in girth usually squat cylindrical trunk gives rise to thick tapering branches resembling a root-system.',
  'Which is why it has often been referred to as the upside-down tree.'],
 ['The flowers are 10-20 cm across .',
  'Have waxy crinkled petals about 10 cm long that surround dense clusters of purple stamens that look like powder puffs.'],
 ['The flowers fall within 24 hours.',
  'Flowers be turning brown .',
  'Flowers be smelling quite unpleasant.'],
 ["African baob

In [70]:
test = nlp('Large , heavy white and sweetly scented that hang down on long stalks')

In [71]:
displacy.render(test)

In [72]:
for t in test:
    print(t.dep_, t.pos_)

amod ADJ
punct PUNCT
amod ADJ
conj ADJ
cc CCONJ
advmod ADV
conj ADJ
nsubj DET
ROOT VERB
advmod ADV
prep ADP
amod ADJ
pobj NOUN
ROOT PUNCT


In [105]:
sents_test = 'This is a tree, brown and with white flowers.'

In [119]:
yield_subjects(sents_test)

['This is a tree.', 'Subject have brown .', 'Subject have with white flowers.']

In [68]:
def yield_subjects(sentence):
    chunks = []

    doc = nlp(sentence)
    # Find Normal subject
    n_subject = [token for token in doc if token.dep_ == 'nsubj' 
                                        or token.pos_ == 'PROPN'
                                        or token.dep_ == 'nsubjpass' 
                                        or token.pos_ == 'NOUN' and token.dep_ == 'ROOT']


    #sentence_split = sentence.split(',')
    
    sentence_split = re.split(',|and ', sentence)

    
    # If no subject is found
    if n_subject == []:
        for sent in sentence_split:
            sent = nlp(sent)
            # Try to find new normal subject
            n_subject = [token for token in sent if token.dep_ == 'nsubj' or token.pos_ == 'PROPN']
            # If still no normal subject use species
            if n_subject == []:
                # Find Root
                ROOT = [token for token in sent if token.dep_ == 'ROOT']
                #print(ROOT[0], ROOT[0].pos_, ROOT[0].dep_)
                if ROOT[0].pos_ == 'NOUN':
                    # If NOUN 'have'
                    chunks.append(f'species have {sent.text}.')
                else:
                    chunks.append(f'species be {sent.text}.')
            else:
                chunks.append(f'species be {sent}.')
                
    elif len(n_subject) == 1 and n_subject[0].pos_ == 'DET':
        print('xxx', n_subject)
                
    else:
        print(n_subject)
           
    # Split some rubbish
    chunks = [re.sub(' +', ' ', chunk) for chunk in chunks]
    chunks = [re.sub('\.+', '.', chunk).strip().capitalize() for chunk in chunks]

    return chunks


In [69]:
cleaned = [yield_subjects(sent) for sent in sents]

xxx [that]
[bark]
[stem, trunk, which, it]
[flowers, that, that]
[flowers]
[baobab, it, trunk]
[They]
[trunk, It, trees, trunks, tree, stem, which]
[flowers]


In [59]:
sents

['Hard, dark brown to black, kidney-shaped.',
 'Large , heavy white and sweetly scented that hang down on long stalks.',
 'The bark is greyish brown and normally smooth but can often be variously folded and seamed from years of growth.',
 'Palmately compound and clustered at the ends of short, stocky branches.',
 'The main stem of larger baobab trees may reach enormous proportions of up to 28 m in girth usually squat cylindrical trunk gives rise to thick tapering branches resembling a root-system, which is why it has often been referred to as the upside-down tree.',
 'The flowers are 10-20 cm across and have waxy crinkled petals about 10 cm long that surround dense clusters of purple stamens that look like powder puffs.',
 'The flowers fall within 24 hours, turning brown and smelling quite unpleasant.',
 "African baobab. is one of the plant kingdom's strangest tree, it is not particularly tall growing but the trunk is massive: as much as 11 m in diameter and shaped like a bottle.",
 'T

In [60]:
cleaned

[['Species be hard.',
  'Species be dark brown to black.',
  'Species be kidney-shaped.'],
 [],
 [],
 ['Species be palmately compound .',
  'Species be clustered at the ends of short.',
  'Species have stocky branches.'],
 [],
 [],
 [],
 [],
 [],
 [],
 []]