In [1]:
import spacy
import pickle
from spacy import displacy
nlp = spacy.load('en_core_web_trf')
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm as tqdm_notebook
import collections
import re
import pandas as pd
import numpy as np
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Source page: Wikipedia's glossary of bird terms.
URL = 'https://en.wikipedia.org/wiki/Glossary_of_bird_terms'
# Fetch the page and fail loudly on an HTTP error instead of silently
# parsing an error page into an empty glossary.
page = requests.get(URL, timeout=5)
page.raise_for_status()
# Let BeautifulSoup detect the encoding from the document itself; forcing
# iso-8859-1 would mis-decode any non-ASCII characters in the terms.
soup = BeautifulSoup(page.content, "lxml")
# Glossary entries: <dt class="glossary"> elements
glossaries = soup.find_all('dt', {'class': 'glossary'})
parts = [part.text.lower().strip() for part in glossaries]
# Additional anchors ("also known as ..."): <span class="anchor" id="...">.
# Anchors without an id are skipped rather than raising KeyError.
glossaries_other = soup.find_all('span', {'class': 'anchor'})
parts_other = [part['id'].lower().strip() for part in glossaries_other if part.get('id')]
# Merge and drop duplicates; sorted so the result is identical on every run
parts = sorted(set(parts + parts_other))
# Anchor ids use underscores where the term has spaces
glossary = [part.replace('_', ' ') for part in parts]

In [142]:
# Hand-picked terms to match in addition to the scraped glossary.
additions = [
    'legs',
    'beak',
    'head',
    'wingspan',
    'eye',
]

# Only append terms that are not present yet, so re-running this cell does
# not keep growing `glossary` with duplicates.
glossary.extend(term for term in additions if term not in glossary)

In [126]:
# Load the description data. Later cells use `data` as a mapping from bird
# name to a sequence of description strings.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open('../../data/description/04_TRAIN_0000000-0002000_BIRDS.pkl', 'rb') as f:
    # The context manager closes the file handle once the data is loaded.
    data = pickle.load(f)


In [135]:
# Serialize `attribution_words` into the processed-data folder.
# NOTE(review): `attribution_words` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel — confirm where it comes from.
with open('../../data/processed/PoS_descriptions_common_birds_v2.pkl', mode='wb') as out_file:
    pickle.dump(attribution_words, out_file)

In [5]:
## COMMON BIRDS
# Semicolon-separated table of matched birds.
common_birds = pd.read_csv('../../data/external/birds_matched.txt', delimiter=';')

In [147]:
# Modifier lemmas. When one of these directly precedes a glossary term, the
# two tokens are merged into a single trait name; when one appears as a child
# of a glossary term it is skipped instead of being reported as a property.
compounds = [
    'upper',
    'lower',
    'dorsal',
    'central',
    'outermost',
    'upperwing',
    'underwing',
    'tail',
    'outer',
    'sexual',
]

# NOTE(review): not used anywhere in this cell; kept in case other cells rely
# on the name — confirm before removing.
sex_determ = [
    'juvenile',
    'male',
    'female',
    'adult',
]

# Caps that keep a trial run short; set to None to process everything.
MAX_BIRDS = 5
MAX_STRINGS_PER_BIRD = 20


def _extract_triples(doc, glossary_terms, modifiers):
    """Collect (subject, relation, object) triples for glossary terms in `doc`.

    Parameters
    ----------
    doc : parsed spaCy document (the result of `nlp(text)`).
    glossary_terms : container of lower-case terms; a token is considered when
        its lower-cased lemma or text is in it.
    modifiers : container of lemmas treated as trait modifiers (see `compounds`).

    Returns
    -------
    list of 3-tuples, in order of discovery and possibly with duplicates.
    Every property triple `(trait, relation, value)` is preceded by the link
    `('bird', 'has', trait)`.
    """
    triples = []

    for token in doc:
        lemma = token.lemma_.lower()
        if lemma not in glossary_terms and token.text.lower() not in glossary_terms:
            continue

        ### CREATE TRAIT ###
        trait = lemma
        # `token.i > 0` guard: for the first token, `doc[token.i - 1]` would be
        # `doc[-1]`, i.e. the *last* token of the text, not a preceding one.
        if token.i > 0 and doc[token.i - 1].lemma_ in modifiers:
            trait = doc[token.i - 1: token.i + 1].lemma_.lower()

        has_trait = ('bird', 'has', trait)

        ### NORMAL SUBJECTS ###
        if token.dep_ in ('nsubj', 'nsubjpass'):
            # First ancestor of the subject; None if there is none (avoids an
            # uncaught StopIteration).
            parent = next(token.ancestors, None)
            if parent is not None:
                if parent.lemma_ == 'be':
                    # "<trait> is <acomp>"
                    for child in parent.children:
                        if child.dep_ == 'acomp':
                            triples += [has_trait, (trait, 'be', child.text)]
                else:
                    for child in parent.children:
                        if child.dep_ == 'advmod':
                            # "<trait> <verb> <adverb>"
                            triples += [has_trait, (trait, parent.text, child.lemma_)]
                        elif child.dep_ == 'auxpass':
                            # Passive construction: report the verb itself
                            triples += [has_trait, (trait, 'be', parent.text)]

        ### GET ADJs and VERBs ###
        for child in token.children:
            # Skip location modifiers and 'acl' children
            if child.lemma_ in modifiers or child.dep_ == 'acl':
                continue

            if child.pos_ == 'VERB':
                triples += [has_trait, (trait, 'be', child.text)]
            elif child.pos_ == 'ADJ':
                triples += [has_trait, (trait, 'be', child.lemma_)]

            # Disabled: numeric children
            # if child.pos_ == 'NUM':
            #     triples += [has_trait, (trait, 'number', child.lemma_)]

    return triples


descriptions = collections.defaultdict(list)
# Set for O(1) membership tests; `glossary` itself is a list.
glossary_terms = set(glossary)

for bird in tqdm_notebook(list(data.keys())[:MAX_BIRDS], desc='bird'):

    #if bird not in common_birds['BOWO'].values:
    #if bird != 'Gray Catbird':
    #    continue

    for string in data[bird][:MAX_STRINGS_PER_BIRD]:
        # `doc` is deliberately a top-level name: later cells inspect the last
        # parsed document.
        doc = nlp(string)
        descriptions[bird].extend(_extract_triples(doc, glossary_terms, compounds))

    # Deduplicate once per bird, keeping first-seen order so the result is
    # reproducible. Birds without any string get no entry.
    if bird in descriptions:
        descriptions[bird] = list(dict.fromkeys(descriptions[bird]))


bird:   0%|          | 0/5 [00:00<?, ?it/s]

In [137]:
#with open('../../data/processed/PoS_descriptions_common_birds.pkl', 'wb') as f:
#    pickle.dump(textual_list, f)

In [146]:
# Inspect the triples collected per bird.
descriptions

defaultdict(list,
            {'Gray Catbird': [('eye', 'be', 'grayish'),
              ('wing', 'be', 'short'),
              ('covert', 'be', 'buffy'),
              ('tail', 'be', 'long'),
              ('bird', 'has', 'tail'),
              ('bird', 'has', 'wing'),
              ('bird', 'has', 'eye'),
              ('bird', 'has', 'covert'),
              ('eye', 'be', 'juvenile')]})

In [139]:
# `doc` is the variable left over from the extraction loop, i.e. the last
# text that was parsed there; this cell only works after that loop has run.
doc

When this display is given as threat to potential nest predator, it is accompanied by soft, shrill notes .

In [104]:
# Visualize the parse of the leftover `doc` with displaCy.
displacy.render(doc)

In [78]:
# Flatten the triples of every bird, keeping only those where all three
# elements are truthy and no element is exactly '-'.
valid_triples = [
    triple
    for bird_triples in descriptions.values()
    for triple in bird_triples
    if all(triple) and '-' not in triple
]

# Split the triples into the three parallel columns of the edge table.
source   = [subj for subj, _, _ in valid_triples]
relation = [rel for _, rel, _ in valid_triples]
target   = [obj for _, _, obj in valid_triples]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relation})

In [None]:
# Triples of one bird; `melt` is used by the plotting cell to highlight the
# nodes that belong to it.
# `.get()` instead of `descriptions['Kioea']`: indexing the defaultdict would
# insert an empty 'Kioea' entry into `descriptions` if the bird was never
# processed.
kioea_triples = [triple for triple in descriptions.get('Kioea', []) if all(triple)]

source   = [triple[0] for triple in kioea_triples]
relation = [triple[1] for triple in kioea_triples]
target   = [triple[2] for triple in kioea_triples]

kg_df_Kioea = pd.DataFrame({'source':source, 'target':target, 'edge':relation})
# Long format: the `value` column holds every source/target/edge string.
melt = pd.melt(kg_df_Kioea)

In [None]:
# Birds that currently have an entry in `descriptions`.
descriptions.keys()

In [None]:
# Directed multigraph with one edge per row of `kg_df`; the remaining column
# (`edge`) is stored as an edge attribute.
G = nx.from_pandas_edgelist(kg_df, "source", "target",
                            edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(30, 30))
# Edge labels keyed by (source, target). When several rows share the same
# pair, only the last relation is kept as the label.
labels = dict(zip(zip(kg_df.source, kg_df.target),
                  kg_df['edge'].tolist()))

# Compute the layout once and share it between nodes and edge labels so both
# are guaranteed to use the same coordinates (previously graphviz was invoked
# twice and an unused spring layout was computed as well).
pos = graphviz_layout(G)

# Nodes that occur in the single-bird table (`melt`) are highlighted.
highlighted = set(melt['value'])
color_map = ['orange' if node in highlighted else 'grey' for node in G]

nx.draw(G,
        node_color=color_map,
        with_labels=True,
        node_size=2500,
        pos=pos, )
nx.draw_networkx_edge_labels(G,
                             pos=pos,
                             edge_labels=labels,
                             font_color='red', )
#plt.show()
#plt.savefig('plot.pdf', format='pdf', dpi=1200, bbox_inches='tight')

In [None]:
# Show the edge table (source, target, edge) built from the triples.
kg_df