In [9]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import glob
import transformers
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import requests
import re
import random
import time
from pathlib import Path
from pylab import *
import matplotlib
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm import tqdm
import collections
from statistics import mean
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset

import spacy
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans
nlp = spacy.load("en_core_web_sm")

import sys
sys.path.insert(0, '../../src/models/')
sys.path.insert(0, '../../src/features/')
import predict_model
#from build_features import random_text_splitter as split_text

# Load BERT
model = predict_model.loadBERT("../../models/", 'saved_weights_inf_FIXED_boot.pt')
# Load the BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

%matplotlib inline

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CPU Success


In [2]:
def SpanPredictor(span, pred_values=False):
    
    """
    Uses a trained bert classifier to see if a span
    belongs to a species description or otherwise.
    """
         
    with torch.no_grad():
        # Tokenize input
        inputs = tokenizer(span, return_tensors="pt", truncation=True)
        # Predict class
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        # Get prediction values
        exps = torch.exp(outputs)
        # Get class
        span_class = exps.argmax(1).item()

        # Print the prediction values
        if pred_values:
            return span_class, exps[0]
        else:
            return span_class    

In [32]:
## REQUESTS
# Get URL
URL = 'http://db.worldagroforestry.org//species/properties/Enterolobium_cyclocarpum'
#URL = 'https://birdsoftheworld.org/bow/species/barbro1/cur/appearance'
page = requests.get(URL, timeout=5)
# Soup the result
soup = BeautifulSoup(page.content, 'html.parser')

## CLEANING
# Get the cleaned strings
text_list = [text for text in soup.stripped_strings]
# Init
text_list_cleaned = []
# Loop over current list and clean more
for idx, text in enumerate(text_list):
    # Replace rubbish
    text = text.replace('\n', "")
    text = text.replace('\r', "")
    # Get Chapter titles (MIGHT NEEDS ADJUSTMENST)
    try:
        if len(text.split()) > 8:
            text_list_cleaned.append(text)
        if len(text_list[idx+1].split()) > 8:
            text_list_cleaned.append('Chapter - ' + text + '. ')
    except:
        continue
# Create string
cleaned_text = ''.join(text_list_cleaned)

## SPACY
# nlp the text
doc = nlp(cleaned_text)
# Extract the sents
sentences = [i for i in doc.sents]
# Init color map
cmap = cm.get_cmap('Spectral')
# Init color dict
colors = {}
# Init option dict
options = {"ents": [],
           "colors": colors}
# Init matcher
matcher = PhraseMatcher(nlp.vocab)
# Loop over the sentences
for idx, sentence in enumerate(sentences):
    # String ID            
    text = '{0}'.format(idx)
    # Add the patterns        
    pattern = nlp(str(sentence))
    matcher.add(text, None, pattern)
    # Get the prediction values    
    prediction = SpanPredictor(str(sentence), pred_values=True)[1][1].numpy().item()
    # Colorize the strings
    if str(sentence).startswith('Chapter'):
        colors[text] = '#80808090'
    elif prediction > .5:
        colors[text] = matplotlib.colors.rgb2hex(cmap(prediction))
    else:
        colors[text] = matplotlib.colors.rgb2hex(cmap(prediction)) + '50'
    # Add the new ENTS to the doc
    options["ents"].append(text)

# Match the enitities in the doc
matches = matcher(doc)
# Reset the current ENTS
doc.ents = ()
# Loop over the matches
for match_id, start, end in matches:
    # Add the sentencen as a ENT
    span = Span(doc, start, end, label=match_id)
    #doc.ents = filter_spans(doc.ents)
    try:
        doc.ents = list(doc.ents) + [span]
    except:
        continue
    
doc.user_data["title"] = "DescriptionPredictor"

sentence_spans = list(doc.sents)

displacy.render(sentence_spans, style='ent', options=options)

In [9]:
sentence_spans = list(doc.sents)

In [11]:
#displacy.render(sentence_spans, style='ent', options=options)

In [6]:
html = displacy.render(doc, style='ent', options=options, page=True, jupyter=False, minify=False)
output_path = Path("wholedoc.html")
output_path.open("w", encoding="utf-8").write(html)

26676

In [7]:
def DescriptionResults(URL, per_sentence=True, save=False):
    
    """
    Creates and HTML file (can be rendered in a notebook) by using the SpaCy 
    Displacy.
    
    per_sentence: Returns the visualization per sentence instead of a whole doc.
    save:         If True returns the html string to save.
    
    
    """
    
    ## REQUESTS
    page = requests.get(URL, timeout=5)
    # Soup the result
    soup = BeautifulSoup(page.content, 'html.parser')

    ## CLEANING
    # Get the cleaned strings
    text_list = [text for text in soup.stripped_strings]
    #text_list = list(set(text_list))
    # Init
    text_list_cleaned = []
    # Loop over current list and clean more
    for idx, text in enumerate(text_list):
        # Replace rubbish
        text = text.replace('\n', "")
        text = text.replace('\r', "")
        text = text.replace('...', "")
        text_list_cleaned.append(text)
        
        '''
        # Get Chapter titles (NEEDS ADJUSTMENST)
        try:
            if len(text.split()) > 8:
                text_list_cleaned.append(text)
            if len(text_list[idx+1].split()) > 8:
                text_list_cleaned.append('Chapter - ' + text + '. ')
        except:
            continue
        '''
        
    # Create string
    cleaned_text = '. '.join(text_list_cleaned)

    ## SPACY
    # nlp the text
    doc = nlp(cleaned_text)
    # Extract the sents
    sentences = [i for i in doc.sents]
    # Init color map
    cmap = cm.get_cmap('Spectral')
    # Init color dict
    colors = {}
    # Init option dict
    options = {"ents": [],
               "colors": colors}
    # Init matcher
    matcher = PhraseMatcher(nlp.vocab)
    # Loop over the sentences
    for idx, sentence in enumerate(sentences):
        
        # Get the prediction values    
        prediction = SpanPredictor(str(sentence), pred_values=True)[1][1].numpy().item()
        
        # String ID            
        #text = '#{0} - {1:.2f}'.format(idx, prediction)
        text = f'{prediction:.3f}'
        # Add the patterns        
        pattern = nlp(str(sentence))
        matcher.add(text, None, pattern)

        # Colorize the strings
        if str(sentence).startswith('Chapter'):
            colors[text] = '#80808090'
        elif prediction > .5:
            colors[text] = matplotlib.colors.rgb2hex(cmap(prediction))
        else:
            colors[text] = matplotlib.colors.rgb2hex(cmap(prediction)) + '60'
        # Add the new ENTS to the doc
        options["ents"].append(text)

    # Match the enitities in the doc
    matches = matcher(doc)
    # Reset the current ENTS
    doc.ents = ()
    # Loop over the matches
    for match_id, start, end in matches:
        # Add the sentencen as a ENT
        span = Span(doc, start, end, label=match_id)
        #doc.ents = filter_spans(doc.ents)
        try:
            doc.ents = list(doc.ents) + [span]
        except:
            continue
            
    # Set title
    doc.user_data["title"] = "Description Predictor"
    
    if save:
        return displacy.render(doc, style='ent', options=options)
    
    if per_sentence:
        sentence_spans = list(doc.sents)
        displacy.render(sentence_spans, style='ent', options=options)
    else:
        displacy.render(doc, style='ent', options=options)

In [8]:
URL = 'http://db.worldagroforestry.org//species/properties/Enterolobium_cyclocarpum' 
DescriptionResults(URL, per_sentence=False)

In [60]:
def long_strings(string):
    return string is not None and len(string) < 10

only_long_strings = SoupStrainer(string=long_strings)

In [61]:
## REQUESTS
page = requests.get(URL, timeout=5)
# Soup the result
soup = BeautifulSoup(page.content, 'html.parser', parse_only=only_long_strings)

## CLEANING
# Get the cleaned strings
text_list = [text for text in soup.stripped_strings]
# Init
text_list_cleaned = []
# Loop over current list and clean more
for idx, text in enumerate(text_list):
    # Replace rubbish
    text = text.replace('\n', "")
    text = text.replace('\r', "")
    text = text.replace('...', "")
    text_list_cleaned.append(text)

In [78]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

only_a_tags = SoupStrainer("a")

only_tags_with_id_link2 = SoupStrainer(id="link2")

def is_short_string(string):
    return string is not None and len(string) < 10

only_short_strings = SoupStrainer(string=is_short_string)


soup = BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings)

In [79]:
soup

