In [14]:
import numpy as np
import pandas as pd
import torch
import pickle
import re
import requests
from matplotlib import cm
import matplotlib
from bs4 import BeautifulSoup
import collections
from itertools import chain
from collections import Counter
import torch.nn as nn
import glob
import random
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset, random_split
from tqdm import tqdm
import time
import urllib.parse

import spacy
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans
nlp = spacy.load("en_core_web_lg")

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

import sys
sys.path.insert(0, '../src/models/')
sys.path.insert(0, '../src/features/')
#sys.path.insert(0, '../src/visualization/')

import predict_model
#import visualize as vis

model = predict_model.loadBERT("../models/", 'saved_weights_inf_FIXED.pt')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

%matplotlib inline

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CPU Success


In [22]:
def search_Duck(query):
    
    """
    Queries DuckDuckGo and returns a URL list.
    """
    
    # Get results 
    page = requests.get('https://duckduckgo.com/html/?q={0}'.format(query), 
                        headers={'user-agent': 'Descriptor/0.0.1'})
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all('a', attrs={'class':'result__a'}, href=True)
    # Init list
    links = []
    # Clean results
    for link in results:
        url = link['href']
        o = urllib.parse.urlparse(url)
        d = urllib.parse.parse_qs(o.query)
        links.append(d['uddg'][0])
    return links

def search_Bing(query):
    
    """
    Queries Bing and returns a URL list.
    """
    
    # Get results
    page = requests.get('https://www.bing.com/search?form=MOZLBR&pc=MOZI&q={0}'.format(query), 
                        headers={'user-agent': 'Descriptor/0.0.1'})
    soup = BeautifulSoup(page.content, 'html.parser')
    # Init list
    links = [] 
    # Clean results
    for i in soup.find_all('a', attrs={'h':re.compile('ID=SERP.+')}, href=True):
        link = i['href']
        if link.startswith('http') and 'microsoft' not in link and 'bing' not in link:
            links.append(link)        
    return links

def SpanPredictor(span, pred_values=False):
    
    """
    Uses a trained bert classifier to see if a span
    belongs to a species description or otherwise.
    """
         
    with torch.no_grad():
        # Tokenize input
        inputs = tokenizer(span, return_tensors="pt", truncation=True)
        # Predict class
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        # Get prediction values
        exps = torch.exp(outputs)
        # Get class
        span_class = exps.argmax(1).item()

        # Print the prediction values
        if pred_values:
            return span_class, exps[0]
        else:
            return span_class
        
def text_cleaner(soup, per_sent=True):
    
    """
    Cleans the contents of a bs4 object and uses SpaCy to return single sentences.
    """    
    
    regexes = [
        (r'\(\d+.+?Close\n\t\n\)', ''),
        (r'\(.+?\)', ''),
        (r'\[.+?\]', ''),
        (r'cm\.', 'centimeters'),
        (r'm\.', 'meters'),
        (r'ft\.', 'feet'),
        (r'\.\.\.', '.'),
        (r'\.\s*\.', '.'),
    ]

    # Get text
    dirty_text = soup.get_text(". ", strip=True)
    # Clean text
    # Clean text
    for regex, replace in regexes:
        dirty_text = re.sub(regex, replace, dirty_text)
    # Clean stuff
    text = dirty_text.replace('\r', "")\
                 .replace('\n', "")\
                 .replace('\t', "")\
                 .strip()
                 #.encode("ascii", "ignore")\
                 #.decode()\
    
    #nlp
    doc = nlp(text)
    sents = [i for i in doc.sents]
    
    sents_clean = []
    # Clean non English
    for sentence in sents:
        # Skip short stuff
        if len(sentence) <= 5:
            continue
        # Create ratio
        non_eng = [token.is_oov for token in sentence].count(True)
        # Continue if the ratio is bad (non English jibberisch)
        if non_eng != 0:
            if non_eng / len(doc) > .2:
                continue
        sents_clean.append(sentence)
    
    sents_clean = list(set(sents_clean))
    
    if per_sent:
        return sents_clean
    else:
        return doc

def VisualizeDoc(text, per_sentence=False, save=False):
    
    """
    Creates and HTML file (can be rendered in a notebook) by using the SpaCy 
    Displacy.
    
    per_sentence : Returns the visualization per sentence instead of a whole doc.
    save         : If True returns the html string to save.
    """
    
    # nlp the text
    doc = nlp(text)
    # Extract the sents
    sentences = [i for i in doc.sents]
    # Init color map
    cmap = cm.get_cmap('Spectral')
    # Init color dict
    colors = {}
    # Init option dict
    options = {"ents": [],
               "colors": colors,
               "distance": 75}
    # Init matcher
    matcher = PhraseMatcher(nlp.vocab)
    # Loop over the sentences
    for idx, sentence in enumerate(sentences):
        
        # Get the prediction values    
        prediction = SpanPredictor(str(sentence), pred_values=True)[1][1].numpy().item()
        
        # String ID            
        #text = '#{0} - {1:.2f}'.format(idx, prediction)
        text = f'{prediction:.3f}'
        # Add the patterns        
        pattern = nlp(str(sentence))
        matcher.add(text, None, pattern)

        # Colorize the strings
        if prediction > .5:
            colors[text] = matplotlib.colors.rgb2hex(cmap(prediction))
        else:
            colors[text] = matplotlib.colors.rgb2hex(cmap(prediction)) + '60'
        # Add the new ENTS to the doc
        options["ents"].append(text)

    # Match the enitities in the doc
    matches = matcher(doc)
    # Reset the current ENTS
    doc.ents = ()
    # Loop over the matches
    for match_id, start, end in matches:
        # Add the sentencen as a ENT
        span = Span(doc, start, end, label=match_id)
        #doc.ents = filter_spans(doc.ents)
        try:
            doc.ents = list(doc.ents) + [span]
        except:
            continue
            
    # Set title
    #doc.user_data["title"] = "Description Predictor"
    sentence_spans = list(doc.sents)
    
    if save and per_sentence:
        return displacy.render(sentence_spans, style='ent', options=options)
    elif save and not per_sentence:
        return displacy.render(doc, style='ent', options=options)
    elif not save and per_sentence:
        displacy.render(sentence_spans, style='ent', options=options)
    elif not save and not per_sentence:
        displacy.render(doc, style='ent', options=options)

In [16]:
# Create lists
plants_dict = pickle.load(open('../data/processed/train_dataPOWO.pkl', 'rb'))
birds_dict = pickle.load(open('../data/processed/descriptions_web_birds_bow.pkl', 'rb'))

plants_list = [keys for keys, values in plants_dict.items()]
birds_list = [keys for keys, values in birds_dict.items()]

plants_list_r = random.sample(plants_list, len(birds_list))

In [17]:
species_list = plants_list_r + birds_list
random.shuffle(species_list)

In [18]:
len(species_list)

17872

In [23]:
# Init dict
data = collections.defaultdict(list)
# Init dict

# DEBUGGING
data_link = collections.defaultdict(list)
data_with_source = collections.defaultdict(list)

query = 'description'

for count, species in enumerate(tqdm(species_list[0:2])):
    
    # Empty list
    search_links = []
    # create q
    species_q = species.replace(' ', '+')
    species_q = '"{0}"+{1}'.format(species_q, query)
    # species_q = f'"{species_q}"+{query}'
    
    search_links += search_Duck(species_q)
    search_links += search_Bing(species_q)

    # Drop duplicates
    search_links = list(set(search_links))
    # DEBUGGING
    data_link[species] += search_links
    
    # Loop over the URLs
    for URL in search_links:
        # Skip google archives
        if 'google' in URL:
            continue
        # PDF and TXT
        if URL.endswith('txt') or URL.endswith('pdf'):
            
            """
            Continue for now, insert the pdf processor here
            """
            continue
            
        try:
            #print(URL)
            page = requests.get(URL, timeout=5)
            # Skip PDF files for now
            if page.headers['Content-Type'].startswith('application/pdf'):
                
                """
                Continue for now, insert the pdf processor here
                """
                continue
                
            # Soup the result
            soup = BeautifulSoup(page.content, 'html.parser')
            # Skip Embedded PDF's
            if 'pdf' in soup.title.text.lower():
                continue

            # Clean the soup and break into sents
            sentences = text_cleaner(soup)
            # Loop over the individual sentences
            for sentence in sentences:                    
                # Create string object
                sentence_str = str(sentence)
                
                if SpanPredictor(sentence_str):
                    #print(URL)
                    data[species].append(sentence_str)
                    data_with_source[species].append(tuple([sentence_str, URL]))
        except: 
            continue

100%|█████████████████████████████████████████████| 2/2 [02:08<00:00, 64.41s/it]


In [20]:
data.keys()

dict_keys(['Oriolus flavocinctus', 'Sicalis citrina'])

In [25]:
data_with_source['Sicalis citrina']

[('Orange fronted Yellow finch Sicalis columbiana regnum =',
  'https://es-academic.com/dic.nsf/eswiki/1086796'),
 ('Animalia phylum = Chordata classis =',
  'https://es-academic.com/dic.nsf/eswiki/1086796'),
 ('Animalia phylum = Chordata classis =',
  'https://es-academic.com/dic.nsf/eswiki/1086796'),
 ('Sicalis. — flaveola ….', 'https://es-academic.com/dic.nsf/eswiki/1086796'),
 ('– cordés, cordado, chordates.',
  'https://itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=562993'),
 ('Golden Bellied Sicalis Lemon Sicalis Dwarf Sicalis Saffron sicalis , or saffron finchPatagonian Sicalis Yellow Sicalis Yellow-headed Sicalis Short-billed Sicalis Olive',
  'https://www.wikipe.wiki/wiki/ru/Sicalis'),
 ('yellow, citrine \xa0< L. citrus.',
  'https://birdsoftheworld.org/bow/species/styfin1/cur/introduction'),
 ('Black-spotted Bare-eye .',
  'https://www.peruaves.org/thraupidae/stripe-tailed-yellow-finch-sicalis-citrina/'),
 ('Brown-rumped Foliage-gleaner .',
  'https://www

In [None]:
#data_with_source[species]
#data_link[species]
print(len(data_with_source['Spatula querquedula']))
_ = list(set(data_with_source['Spatula querquedula']))
print(len(_))

In [None]:
URL = 'http://db.worldagroforestry.org//species/properties/Enterolobium_cyclocarpum' 
#URL = 'http://powo.science.kew.org/taxon/757855-1'
#URL = 'https://birdsoftheworld.org/bow/species/gargan/cur/introduction'
page = requests.get(URL, timeout=5)
soup = BeautifulSoup(page.content, 'html.parser')
text = str(text_cleaner(soup, per_sent=False))
VisualizeDoc(text, per_sentence=False)