In [1]:
import pickle
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import random
import re
import collections
from tqdm.notebook import tqdm as tqdm_notebook
import spacy
import json
import sys
from spacy import displacy
import matplotlib.colors as colors
from matplotlib.colors import is_color_like as color_check

sys.path.insert(0, '../../src/models/')
sys.path.insert(0, '../../src/features/')

from predict_model import loadBERT
from predict_model import SpanPredictor as classify

nlp = spacy.load('en_core_web_trf')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# birds descriptions
data = pickle.load(open('../../data/description/04_TRAIN_0000000-0002000_BIRDS.pkl', 'rb'))

In [3]:
# Open the text file
location = "../../data/external/birds_matched.txt"
with open(location) as f:
    lines = f.readlines()

BOW_list = []
for line in lines[1:]:
    line = line.strip()
    _, _, BOW = line.split(';')
    BOW_list.append(BOW)  

In [4]:
## QUICK FIX

data['Cape Starling'] = [
    "Fairly large, short-tailed glossy starling with rather uniform appearance.",
    "Head is glossy blue, blacker on ear-coverts.",
    "upperparts blue-green with strong iridescence",
    "wing blue-green, dark blue spots at tips of some median and greater coverts.",
    "distinct bronzy-purple epaulet, primaries P6-P9 strongly indented on inner webs",
    "tail glossy blue-green."
    "throat and upper breast have blue iridescence."
    "lower breast, belly and undertail-coverts with greener gloss."
    "iris bright orange-yellow.",
    "bill and legs black.", 
    "Sexes alike.",
    "Juvenile is dull-plumaged, with matt black underparts, iris initially grey.",
    "at three months iris dull yellow, acquiring adult colour after six months",
]
        

In [5]:
# URL
URL = 'https://en.wikipedia.org/wiki/Glossary_of_bird_terms'
# Get the page
page = requests.get(URL, timeout=5)
soup = BeautifulSoup(page.content, "lxml", from_encoding="iso-8859-1")   
# Find embedded glossary
glossaries = soup.find_all('dt', {'class': 'glossary'})
parts = [part.text.lower().strip() for part in glossaries]
# Get additional anchors ("also know as...")
glossaries_other = soup.find_all('span', {'class': 'anchor'})
parts_other = [part['id'].lower().strip() for part in glossaries_other]
# Append and drop duplicates
parts = list(set((parts + parts_other)))
# Replace underscore with space
glossary = [part.replace('_', ' ') for part in parts]

In [None]:
with open('../data/description/01_URLS_0020000-0025000_PLANTS.pkl', 'wb') as f:
    pickle.dump(data_links, f)

In [6]:
# A few helpers
additions = [
    'legs', 'beak', 'head', 'wingspan', 'eye', 'forecrown', 'underpart',
]

glossary += additions

compound_list = [
    'fertile', 'sterile',
    'male', 'female', 'bisexual', 'hermaphroditic', 
    'basal', 'developed', 
    'primary', 'secondary', 'main',
    'upper', 'lower', 'greater', 'dorsal', 'alternate', 'lesser', 'apex', 'outer',
    'central', 'outermost', 'outer', 'inner', 'uppermost', 'median', 'dorsal', 'central', 'lateral',
    'young', 'mature', 'individual', 
    'opposite', 'single', 'paired',
    'upper', 'lower', 'dorsal', 'central', 'outermost', 'upperwing', 'underwing',
    'outer', 'sexual',
]

rubbish = [
    '.', ',', '-', '..', '...',
]

measurements = [
    'mm', 'cm', 'm', 'km',
    'milimeter', 'centimeter', 'meter', 'kilometer',
    'milimetre', 'centimetre', 'metre', 'kilometre',
    'inch', 'foot', 'yard', 'mile',
    'wide', 'long', 'broad', 'tall',
    'length', 'form',
]

In [7]:
#glossary

In [8]:
def check_existance(t, doc):
    if t.lemma_ in glossary:
        return t.lemma_
    elif t.text in glossary:
        return t.lemma_

    
def compound_reconstructor(token, doc):
    if token.i == 0:
        trait = token
    elif doc[token.i - 1].pos_ == 'DET':
        trait = token
    elif doc[token.i - 3].dep_ == 'compound':
        trait = doc[token.i - 3: token.i + 1]
    elif doc[token.i - 3].text.lower() in compound_list or doc[token.i - 3].lemma_.lower() in compound_list:
        trait = doc[token.i - 3: token.i + 1]
    elif doc[token.i - 2].dep_ == 'compound':
        trait = doc[token.i - 2: token.i + 1]
    elif doc[token.i - 2].text.lower() in compound_list or doc[token.i - 3].lemma_.lower() in compound_list:
        trait = doc[token.i - 2: token.i + 1]
    elif doc[token.i - 1].dep_ == 'compound':
        trait = doc[token.i - 1: token.i + 1]
    elif doc[token.i - 1].text.lower() in compound_list or doc[token.i - 3].lemma_.lower() in compound_list:
        trait = doc[token.i - 1: token.i + 1]
    else:
        trait = token
    if ','  in trait.lemma_:
        trait = token
    return trait.lemma_       
    
def text_preparation(species, text):
    cleaners = [(r'(?<!\d)\.(?!\d)', ' '),
                (r'\s×\s', ' times '),
                (r'\s+c\s+', ' '),
                (r'â\x80\x93', ' to '),
                (r'\xa0', ' '),
                (r'\x97', ''),
                (r'\s{2,}', ' '),
                (r'(\D)(\.)', r'\1 '),
                (r'(\d)(\.)(\D)', r'\1 \3'),
                (r'(long,)', r'long and'),
                (r'(wide,)', r'wide and'),
               ]
    
    species_parts = species.split()
    candidates = [' '.join(species_parts[:idx+1]) for idx, _ in enumerate(species_parts)]
    if len(species_parts) > 1:
        candidates += [
            f'{species_parts[0][0]}. {species_parts[1]}'
        ]
    else:
        pass
    candidates.reverse()
    for candidate in candidates:
        try:
            text = re.sub(candidate, 'the species', text)
        except:
            continue # Skip species with brackets for now
    for (cleaner, replacement) in cleaners:
        text = re.sub(cleaner, replacement, text)    
    text = f'{text.strip()}.'
    return text.capitalize()


def extract_modifiers(t, doc):
    if t.text.lower() not in compound_list:
        if t.dep_ in ['amod', 'nummod', 'appos', 'acl','prep', 'conj']:
            return doc[t.left_edge.i : t.right_edge.i + 1]
    
        
def create_relation(t):
    relation = 'has property'
    #if t in measurements or list(set(t.split()) & set(measurements)):
    #    relation = 'measures'  #'measurement'
    #elif t.isdigit():
    #    relation = 'has number'
    #elif color_check(t) or color_check(t.split()[-1]) or color_check(t.split('-')[-1]):
    #    relation = 'has color'
    #else:
    #    relation = 'is' # Property
        
    return relation
        
def clean_object(t):
    
    #print(chunk_1, chunk_2)
    if t.root.pos_ == 'NOUN' and t.root.lemma_ not in measurements:
        objects = t.root.lemma_
    else:
        if len(t) > 1:
            objects =  t.text
        elif t.root.pos_ == 'VERB':
            objects = t.text
        else:
            objects = t.lemma_

    objects =  re.split(',| and | or | with ', objects)

    return [obj.strip() for obj in objects if obj if obj not in rubbish]

def extract_verb(t, doc):
    if t.dep_  == 'nsubj' or t.dep_ == 'nsubjpass':
        return next((parent for parent in t.ancestors if parent.pos_ == 'VERB' or parent.pos_ == 'AUX'), None)

def extract_verbal_modifier(t, doc):
    if t.text.lower() not in compound_list:
        if child.dep_ in ["acomp", "dobj", "prep", 'attr']:
            return doc[child.left_edge.i : child.right_edge.i + 1]    
    
def create_main_triples(part, trait, obj):
    triples = []
    triples.append(('species', 'has main part', part.lower()))
    triples.append((part.lower(), f'has part', trait.lower()))
    for o in obj:
        rel = create_relation(o)
        triples.append((trait.lower(), rel.lower(), o.lower()))
    return triples

def create_sub_triples(sub, obj):
    triples = []
    for o in obj:
        rel = create_relation(o)
        triples.append((sub.lower(), rel.lower(), o.lower()))
    return triples

def noun_check(t):
    if t.root.pos_ == 'NOUN' and t.root.lemma_ not in measurements and not color_check(t.root.lemma_):
        return True
    else:
        return True

In [12]:
descriptions = collections.defaultdict(list)

for species in tqdm_notebook(list(data.keys())[0:]):
    if species not in BOW_list:
        continue
    for idx, text in enumerate(data[species][0:]):
        #print(text)
        triples = []
        text = text_preparation(species, text)
        doc = nlp(text)
        
        # TEMP ESCAPE
        try:
            if doc[-2].text in rubbish and doc[-3].text in rubbish:
                continue
            spaces = [t for t in doc if t.pos_ == 'SPACE']
            if len(spaces) > 1 and 'species' in doc.text.lower():
                continue
            if doc[-1].text in ['..', '...']:
                continue
        except:
            pass

        
        for t in doc:

            if t.pos_ == 'NOUN' or t.pos_ == 'PROPN' or t.pos_ == 'PRON':
                if t.dep_ == 'compound':
                    continue
                part = check_existance(t, doc)
                if part: 
                    #print(part)
                    # Trait
                    trait = compound_reconstructor(t, doc)
                    ## ADJs and NOUNs
                    #print(t)
                    for child in t.children:
                        obj_tok  = extract_modifiers(child, doc)
                        if obj_tok:
                            obj = clean_object(obj_tok)
                            triples += create_main_triples(part, trait, obj)
                            # modifiers of NOUNS
                            if noun_check(obj_tok):
                                for child in obj_tok.root.children:
                                    obj_tok = extract_modifiers(child, doc)
                                    if obj_tok:
                                        obj_new = clean_object(obj_tok)
                                        triples += create_sub_triples(obj[0], obj_new)
                                        
                    ## VERBs
                    verb = extract_verb(t, doc)
                    if verb:
                        for child in verb.children:
                            obj_tok  = extract_verbal_modifier(child, doc)
                            #print(obj_tok)
                            if obj_tok:
                                obj = clean_object(obj_tok)
                                #print(obj)
                                triples += create_main_triples(part, trait, obj)
                            
                        
        #print(text)
        #print(idx, triples)
        #print('\n')
        descriptions[species] += triples

  0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
#descriptions['Tree Swallow']

In [16]:
#len(descriptions.keys())

In [18]:
information = collections.defaultdict(list)

for bird in tqdm_notebook(descriptions.keys()):
    #if bird != 'Tree Swallow':
    #    continue
    for (sub, rel, obj) in descriptions[bird][0:]:
        
        if rel == 'has part':
            continue
        if sub == obj:
            continue
        if sub == 'species':
            continue
        x = nlp(sub)    
        y = nlp(obj)
        if len(x) == 1:  
            if len(y) == 1:
                if x[0].pos_ == 'NOUN':
                    if y[0].pos_ == 'NOUN':
                        continue
        text = f'{sub} {obj}.'.capitalize()
        information[bird].append(text)

  0%|          | 0/200 [00:00<?, ?it/s]

In [19]:
information_set = collections.defaultdict(list)

for bird in information.keys():
    #for information[bird]:
    information_set[bird] = list(set(information[bird]))

In [21]:
#information_set['Tree Swallow']

In [None]:
#information

In [22]:
df = pd.DataFrame.from_dict(information_set, orient='index')

In [23]:
df.to_csv('bird_traits.csv')

columns

In [24]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,736,737,738,739,740,741,742,743,744,745
Tree Swallow,Feather nape.,Feather bold.,Orange at edge.,Inner secondary covert.,Feather few to extensive.,Sexual dimorphism limited.,Plumage streaked underparts.,Underwing dark.,Primary feather nine.,Usually slightly duller than that of males tha...,...,,,,,,,,,,
American Redstart,Molt prealternate.,Part of a bird.,The yellow coloration to the tail averages les...,Juvenile early formative.,Male but rectrix especially on r3.,Molt to a prealternate molt.,Upperwe secondary covert yellow patch.,Plumage in both sexes.,Plumage similar to that of definitive basic fe...,Plumage other.,...,,,,,,,,,,
Bewick's Wren,Partly edged with dull gray.,Feather fresh.,Rump with large concealed subterminal white sp...,Upperpart bold.,Feather edge.,Undertail covert pale buff.,Plain white plain.,Primary covert paler.,Eyeline white.,Secondary covert primary-.,...,,,,,,,,,,
Belted Kingfisher,Crest shaggy.,White more.,Band across the lower breast.,Plumage rivers earlier than diving for fish.,Molt first.,Body plumage breast.,Breast decreased.,Side of head.,Call of adults.,Rectrix secondary.,...,,,,,,,,,,
Cape May Warbler,Olive-green olive.,Feather on underparts.,Feather retained.,Streaking ventral.,Crown overall appearing blackish.,Crown gray.,Eyeline white.,Feather edging on side throat.,Outer 2 feather on distal portion of inner web.,Plumage other.,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yellow-billed Cuckoo,Primary flight feather red.,Bill tarsus.,Tail tarsus.,Hair plain.,"""first basic"" ""basic i.",Hair olive.,Plumage mostly gray.,Indistinguishable from formative plumage from ...,Bill with yellow especially on the lower mandi...,Underpart black.,...,,,,,,,,,,
Gray-crowned Rosy-Finch,Area upper mandible.,Molt prealternate.,Feather of forecrown.,Body-feather but no tertials.,Tail 65.8.,Feather edge.,Gray in some races.,Crown gray.,Depth 7.23.,Outer juvenile median yellow-edged.,...,,,,,,,,,,
Scissor-tailed Flycatcher,Down sparse.,Inner rectrix six.,Tertial covert.,Upperwing secondary covert grayish white.,Feather retained.,Gray dark.,Margined terminally white.,Most all.,"""first basic"" ""basic i.",Body plumage similar to definitive basic female.,...,,,,,,,,,,
Arctic Tern,Rump contrasting.,Scapula fringed brown.,Variably gray pattern.,Wing scaly.,Uniformly pale-gray pale.,Beat stiff.,Wing more rounded.,Outer primary darker gray.,Wing loading.,Molt extent.,...,,,,,,,,,,
