In [2]:
import numpy as np
import pandas as pd
import torch
import pickle
import re
import requests
from matplotlib import cm
import matplotlib
from bs4 import BeautifulSoup
import collections
from itertools import chain
from collections import Counter
import torch.nn as nn
import glob
import random
from pathlib import Path
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset, random_split
from tqdm import tqdm
import time
import urllib.parse
from sklearn.metrics.pairwise import cosine_similarity

import spacy
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans
nlp = spacy.load("en_core_web_lg")

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

import sys
sys.path.insert(0, '../src/models/')
sys.path.insert(0, '../src/features/')
#sys.path.insert(0, '../src/visualization/')
import predict_model
from build_features import text_cleaner
#import visualize as vis

%matplotlib inline

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Load the sentence classifier used by SpanPredictor from the saved weights file
# (presumably a fine-tuned DistilBERT classifier -- confirm in predict_model.loadBERT).
model = predict_model.loadBERT("../models/", 'saved_weights_inf_FIXED_boot.pt')
# Load the model used by similarity_matrix to produce per-sentence hidden states.
sim_model = predict_model.load_simBERT()
# Tokenizer shared by both models above.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

CPU Success


In [20]:
def search_Duck(query, timeout=10):
    
    """
    Queries DuckDuckGo and returns a URL list.

    query   : Search string, inserted verbatim into the request URL. It must
              already be URL-ready (the callers in this notebook join the
              terms with '+').
    timeout : Seconds to wait for DuckDuckGo before `requests` raises a
              timeout error, so a stalled search cannot hang the run.

    Returns a list with the target URL of every usable result anchor, in
    page order. Network errors from `requests` propagate to the caller.
    """
    
    # Get results 
    page = requests.get('https://duckduckgo.com/html/?q={0}'.format(query), 
                        headers={'user-agent': 'Descriptor/0.0.1'},
                        timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all('a', attrs={'class':'result__a'}, href=True)
    # Init list
    links = []
    # Clean results
    for link in results:
        parsed = urllib.parse.urlparse(link['href'])
        # Result anchors are redirect links; the real target is carried in
        # the 'uddg' query parameter.
        targets = urllib.parse.parse_qs(parsed.query).get('uddg')
        if targets:
            links.append(targets[0])
        elif parsed.scheme in ('http', 'https') and 'duckduckgo.com' not in parsed.netloc:
            # Already a direct external link: keep it as is.
            links.append(link['href'])
        # Any other anchor (no 'uddg', not an external link) is skipped
        # instead of raising KeyError and discarding the whole result page.
    return links

def search_Bing(query, timeout=10):
    
    """
    Queries Bing and returns a URL list.

    query   : Search string, inserted verbatim into the request URL. It must
              already be URL-ready (the callers in this notebook join the
              terms with '+').
    timeout : Seconds to wait for Bing before `requests` raises a timeout
              error, so a stalled search cannot hang the run.

    Returns a list of absolute result URLs in page order. Links containing
    'microsoft' or 'bing' are dropped. Network errors from `requests`
    propagate to the caller.
    """
    
    # Get results
    page = requests.get('https://www.bing.com/search?form=MOZLBR&pc=MOZI&q={0}'.format(query), 
                        headers={'user-agent': 'Descriptor/0.0.1'},
                        timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Only anchors whose 'h' attribute matches this pattern are treated as results
    serp_anchor = re.compile('ID=SERP.+')
    # Init list
    links = [] 
    # Clean results
    for anchor in soup.find_all('a', attrs={'h': serp_anchor}, href=True):
        link = anchor['href']
        if link.startswith('http') and 'microsoft' not in link and 'bing' not in link:
            links.append(link)        
    return links

def SpanPredictor(span, pred_values=False):
    
    """
    Classifies a text span with the trained BERT classifier to decide
    whether it belongs to a species description or not.

    span        : Text to classify.
    pred_values : If True, also return the exponentiated model outputs
                  for the span.

    Returns the index of the highest-scoring class, or the tuple
    (class index, exponentiated outputs of the first batch row) when
    `pred_values` is True.
    """
    
    # Inference only, no gradients needed
    with torch.no_grad():
        encoded = tokenizer(span, return_tensors="pt", truncation=True)
        # Exponentiate the raw model outputs
        # (presumably log-probabilities -- confirm against the model head)
        scores = torch.exp(model(**encoded))
        label = scores.argmax(1).item()

    if not pred_values:
        return label
    return label, scores[0]
        

def similarity_matrix(sentence_list):
    
    """
    Calculates the pairwise cosine similarity between sentences.

    Every sentence is tokenized (truncated / padded to 512 tokens), passed
    through `sim_model`, and its hidden states are summed along axis 1 to
    get one vector per sentence.

    sentence_list : Iterable of sentence strings.

    Returns a square numpy array (values rounded to 5 decimals) where entry
    [i, j] is the cosine similarity between sentence i and sentence j. An
    empty input returns an empty (0, 0) array.
    """
    
    sentences = list(sentence_list)
    # torch.stack cannot handle an empty list, so answer the empty case directly
    if not sentences:
        return np.empty((0, 0))

    input_ids = []
    attention_masks = []
    for sentence in sentences:
        # encode each sentence
        new_tokens = tokenizer.encode_plus(sentence, max_length=512,
                                           truncation=True, 
                                           padding='max_length',
                                           return_tensors='pt')
        # Drop the batch dimension
        input_ids.append(new_tokens['input_ids'][0])
        attention_masks.append(new_tokens['attention_mask'][0])
    
    # Reformat list of tensors into single tensor
    tokens = {'input_ids': torch.stack(input_ids),
              'attention_mask': torch.stack(attention_masks)}
    
    # Inference only: skip building the autograd graph for the whole batch
    with torch.no_grad():
        # Get vectors
        hiddenstates = sim_model(**tokens)
        # Sum along first axis
        # NOTE(review): if sim_model returns per-token states, padded positions
        # are part of this sum -- confirm whether masking is intended.
        summed_hs = torch.sum(hiddenstates, 1)
    # Detach
    summed_hs_np = summed_hs.detach().numpy()
    # Get the matrix
    return cosine_similarity(summed_hs_np, summed_hs_np).round(5)
        
        
    
def VisualizeDoc(text, per_sentence=False, save=False):
    
    """
    Creates an HTML visualization (can be rendered in a notebook) by using
    the SpaCy Displacy entity renderer. Every sentence is marked as an
    entity whose label is its prediction value (3 decimals) and whose color
    is taken from the 'Spectral' color map.
    
    text         : Text to visualize.
    per_sentence : Returns the visualization per sentence instead of a whole doc.
    save         : If True returns the html string (full page) to save,
                   otherwise the visualization is rendered and nothing is
                   returned.
    """
    
    # nlp the text
    doc = nlp(text)
    # Init color map
    cmap = cm.get_cmap('Spectral')
    # Init color dict
    colors = {}
    # Init option dict
    options = {"ents": [],
               "colors": colors,
               "distance": 75}
    # Init matcher
    matcher = PhraseMatcher(nlp.vocab)
    # Loop over the sentences
    for sentence in doc.sents:
        sentence_text = str(sentence)
        
        # Get the prediction value stored at index 1 of the model outputs
        prediction = SpanPredictor(sentence_text, pred_values=True)[1][1].numpy().item()
        
        # The label doubles as matcher key and as displayed entity name.
        # A separate name keeps the `text` argument intact.
        label = f'{prediction:.3f}'
        # Phrase patterns only need tokenization, so skip the full pipeline
        matcher.add(label, None, nlp.make_doc(sentence_text))

        # Colorize the label
        colors[label] = matplotlib.colors.rgb2hex(cmap(prediction))
        # Add the new ENTS to the doc
        options["ents"].append(label)

    # Match the entities in the doc
    matches = matcher(doc)
    # Reset the current ENTS
    doc.ents = ()
    # Loop over the matches
    for match_id, start, end in matches:
        # Add the sentence as an ENT
        span = Span(doc, start, end, label=match_id)
        try:
            doc.ents = list(doc.ents) + [span]
        except ValueError:
            # spaCy rejects overlapping entity spans; skip those matches
            continue
            
    # Render either the individual sentences or the whole doc
    target = list(doc.sents) if per_sentence else doc
    
    if save:
        # Standalone page returned as a string so the caller can write it to disk
        return displacy.render(target, style='ent', options=options, page=True, jupyter=False, minify=False)
    displacy.render(target, style='ent', options=options)

In [5]:
# Create lists
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# The context managers make sure the file handles are closed again.
with open('../data/processed/train_dataPOWO.pkl', 'rb') as f:
    plants_dict = pickle.load(f)
with open('../data/processed/descriptions_web_birds_bow.pkl', 'rb') as f:
    birds_dict = pickle.load(f)

# The dictionary keys are the species names
plants_list = list(plants_dict)
birds_list = list(birds_dict)

# Sample as many plants as there are birds
# (raises ValueError when there are fewer plants than birds)
plants_list_r = random.sample(plants_list, len(birds_list))

In [6]:
# Combine the sampled plants with all birds and shuffle the result in place
species_list = [*plants_list_r, *birds_list]
random.shuffle(species_list)

In [9]:
# Sentences predicted to be description text, per species
data = collections.defaultdict(list)

# DEBUGGING
# All search result links per species
data_link = collections.defaultdict(list)
# (sentence, source URL) tuples per species
data_with_source = collections.defaultdict(list)

query = 'description'

for species in tqdm(species_list[0:10]):
#for species in tqdm(plants_list[0:4]):
    
    # create q: "Genus+species"+description
    species_q = species.replace(' ', '+')
    species_q = f'"{species_q}"+{query}'

    # Query each engine on its own, so a failure of one engine does not
    # throw away the results of the other
    search_links = []
    for search_engine in (search_Duck, search_Bing):
        try:
            search_links += search_engine(species_q)
        except Exception:
            # Skip connection problems (best effort). `Exception` instead of
            # a bare except keeps Ctrl-C / kernel interrupt working.
            continue
    # Drop duplicates, keeping the original result order
    search_links = list(dict.fromkeys(search_links))
    # DEBUGGING
    data_link[species] += search_links
    
    # Loop over the URLs
    for URL in search_links:
        # Skip google archives
        if 'google' in URL:
            continue
        # PDF and TXT
        # Continue for now, insert the text/pdf processor here
        if URL.endswith(('txt', 'pdf')):
            continue
            
        try:
            page = requests.get(URL, timeout=5)
            # Skip PDF files for now, insert the pdf processor here
            if page.headers.get('Content-Type', '').startswith('application/pdf'):
                continue
                
            # Soup the result
            soup = BeautifulSoup(page.content, 'html.parser')
            # Pages without a <title> cannot be matched to the species
            if soup.title is None:
                continue
            title = soup.title.text
                
            # Skip Embedded PDF's
            if 'pdf' in title.lower():
                continue
            
            # Check if species exists somewhere within title: at least one
            # word of the species name has to be a (case-sensitive) word of
            # the title
            if not set(species.split()).intersection(title.split()):
                continue

            # Get text
            dirty_text = soup.get_text(". ", strip=True)
            # Clean the soup and break into sents
            sentences = text_cleaner(dirty_text)
            # Loop over the individual sentences
            for sentence in sentences:                    
                # Create string object
                sentence_str = str(sentence)
                # Check if description
                if SpanPredictor(sentence_str):
                    data[species].append(sentence_str)
                    data_with_source[species].append((sentence_str, URL))
                            
        except Exception:
            # Best effort: a page that cannot be fetched or processed is skipped
            continue

100%|███████████████████████████████████████████| 10/10 [03:52<00:00, 23.21s/it]


In [23]:
# Drop double sentences
# Two sentences count as doubles when their cosine similarity exceeds this value
SIMILARITY_THRESHOLD = 0.98

# Iterate over a snapshot, because `data` is updated inside the loop
for key, values in tqdm(list(data.items())):
    # Nothing to compare with fewer than two sentences
    if len(values) < 2:
        continue
    # Get similarity matrix
    matrix = similarity_matrix(values)
    # Upper triangle without the diagonal: entry [i, j] is True when the
    # later sentence j is a double of the earlier sentence i
    later_double = np.triu(matrix > SIMILARITY_THRESHOLD, k=1)
    # A sentence is dropped when it doubles ANY earlier sentence, so the
    # first sentence of every group of doubles is always kept
    is_double = later_double.any(axis=0)
    # Replace data (selection by position, so identical strings are handled too)
    data[key] = [sent for sent, double in zip(values, is_double) if not double]

100%|█████████████████████████████████████████████| 8/8 [01:56<00:00, 14.56s/it]


In [None]:
# Persist the collected description sentences (species -> list of sentences)
with open('data_species_descriptions.pkl', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)

# Visualization

In [None]:
# Other pages that can be used for this demo:
#URL = 'http://powo.science.kew.org/taxon/757855-1'
#URL = 'https://birdsoftheworld.org/bow/species/gargan/cur/introduction'
URL = 'http://db.worldagroforestry.org//species/properties/Enterolobium_cyclocarpum'

# Download and parse the page
page = requests.get(URL, timeout=5)
soup = BeautifulSoup(page.content, 'html.parser')
# Extract the visible text, pieces separated by ". "
dirty_text = soup.get_text(". ", strip=True)
# Clean it as one text (not split per sentence)
text = str(text_cleaner(dirty_text, per_sent=False))
# Render the whole doc inline (per_sentence defaults to False)
VisualizeDoc(text)

In [None]:
URL = 'http://www.llifle.com/Encyclopedia/TREES/Family/Apocynaceae/12217/Pachypodium_lealii' 
#URL = 'http://powo.science.kew.org/taxon/757855-1'
#URL = 'https://birdsoftheworld.org/bow/species/gargan/cur/introduction'
page = requests.get(URL, timeout=5)
soup = BeautifulSoup(page.content, 'html.parser')
# text_cleaner is fed the extracted page text everywhere else in this
# notebook, so extract the text here as well instead of passing the soup object
dirty_text = soup.get_text(". ", strip=True)
text = str(text_cleaner(dirty_text, per_sent=False))
VisualizeDoc(text, per_sentence=False)

In [27]:
# Sample text (Ceiba pentandra) used as input for the visualization below
doc = """
Ceiba pentandra, is a lofty tropical deciduous tree with a very straight buttressed trunk up to 3 m in diameter that usually grows to an average of 18-20 meters, with old trees up to 65-70 meters in very favourable wet tropical weather and is said to be the largest tree of the West African region and occurs throughout. It produces rose-coloured or white flowers followed by a capsule which, when ripe, contains white fibres like cotton. Its trunk bears spikes to deter attacks by animals. Kapok is the most used common name for the tree and may also refer to the cotton-like fluff obtained from its seed pods. The tree is also known as the Java cotton, Java kapok, silk-cotton or ceiba both of this names may also refer to Bombax ceiba.
Ceiba pentandra, is quite easily grown from seed and is planted in parks and on roadsides as an avenue and shade tree. In built-up areas it will prove to be a troublesome one as the roots effect forceful entry into cracks in buildings, roads, drains, etc., and pass through or under and disturb foundations. It grows best in subtropical climate and heavy rainfalls but fairly drought-resistant too. 
Trunk, very straight, bole up to 35 m tall, usually cylindrical, 2-3 m in diameter, usually with large plank-like buttresses up to 3(-8) m high extending 1-2 m from the bole and with more or less horizontal main branches and often bracketed below to the stem. Branches usually in whorls of 3. The trunk and many of the larger branches are often crowded with large conical thorns 1-1.5 cm long, at least when young; bark smooth pale grey; young branches glabrous or pubescent.
"""
# Clean the sample and join the pieces returned by text_cleaner into one
# space-separated string (assumes text_cleaner yields strings here -- confirm)
sents = ' '.join(text_cleaner(doc))

In [28]:
# save=True makes VisualizeDoc return the HTML page as a string; with
# save=False it only renders and returns None, leaving `html` empty
html = VisualizeDoc(sents, per_sentence=True, save=True)

In [None]:
# Write the rendered page to disk; write_text opens, writes and closes the
# file, so no file handle is left open
output_path = Path("europeanrobin.html")
output_path.write_text(html, encoding="utf-8")