In [2]:
import numpy as np
import pandas as pd
import torch
import pickle
import re
import requests
from selenium import webdriver
from IPython.display import display, HTML
from matplotlib import cm
import matplotlib
from bs4 import BeautifulSoup
import collections
from itertools import chain
from collections import Counter
import torch.nn as nn
import glob
import random
from pathlib import Path
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset, random_split
from tqdm import tqdm
import time
import urllib.parse
from sklearn.metrics.pairwise import cosine_similarity

import spacy
from spacy import displacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans
# spaCy pipeline shared by the helpers below (sentence splitting and displaCy rendering)
nlp = spacy.load("en_core_web_lg")

from torch import cuda
# Device string for torch.
# NOTE(review): `device` is not referenced by any cell visible in this notebook — confirm it is still needed.
device = 'cuda' if cuda.is_available() else 'cpu'

import sys
# Make the project-local modules importable; the paths are relative to the notebook's working directory
sys.path.insert(0, '../src/models/')
sys.path.insert(0, '../src/features/')
#sys.path.insert(0, '../src/visualization/')
# Project-local modules: model loading helpers and the text cleaning routine used by the crawler
import predict_model
from build_features import text_cleaner
#import visualize as vis

%matplotlib inline

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Load the models and tokenizer

In [3]:
# Sentence classifier used by SpanPredictor. The loader lives in the project-local
# `predict_model` module, so how the weights file is restored is not visible here.
model = predict_model.loadBERT("../models/", 'saved_weights_inf_FIXED_boot_beta80.pt')
# NOTE(review): `sim_model` is not referenced by any cell visible in this notebook — confirm it is still needed.
sim_model = predict_model.load_simBERT()
# Tokenizer used by SpanPredictor to encode each sentence before it is passed to `model`
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

CPU Success


## Functions used in the crawler

In [4]:
def search_Duck(query, timeout=10):

    """
    Queries DuckDuckGo (HTML endpoint) and returns a URL list.

    query   : Query string. It is inserted verbatim into the request URL, so the
              caller is responsible for encoding it (e.g. '+' instead of spaces).
    timeout : Seconds to wait for the server before `requests` raises an exception.

    Returns the target URLs of the result anchors, in page order. Anchors whose
    href carries no `uddg` parameter are skipped.
    Raises whatever `requests.get` raises on connection problems or timeouts.
    """

    # Get results (a timeout keeps an unresponsive server from stalling the crawl)
    page = requests.get('https://duckduckgo.com/html/?q={0}'.format(query),
                        headers={'user-agent': 'Descriptor/0.0.1'},
                        timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all('a', attrs={'class': 'result__a'}, href=True)
    # Init list
    links = []
    # Clean results
    for link in results:
        # The real destination is stored in the `uddg` query parameter of the href
        parsed = urllib.parse.urlparse(link['href'])
        targets = urllib.parse.parse_qs(parsed.query).get('uddg')
        # Skip anchors without a destination instead of failing the whole search
        if targets:
            links.append(targets[0])
    return links

def search_Bing(query, timeout=10):

    """
    Queries Bing and returns a URL list.

    query   : Query string. It is inserted verbatim into the request URL, so the
              caller is responsible for encoding it (e.g. '+' instead of spaces).
    timeout : Seconds to wait for the server before `requests` raises an exception.

    Returns the absolute http(s) links of the result anchors, in page order,
    excluding links that point at Bing/Microsoft hosts.
    Raises whatever `requests.get` raises on connection problems or timeouts.
    """

    # Hosts (and their subdomains) that are search-engine plumbing, not results
    excluded_domains = ('bing.com', 'bingj.com', 'microsoft.com', 'microsofttranslator.com')

    # Get results (a timeout keeps an unresponsive server from stalling the crawl)
    page = requests.get('https://www.bing.com/search?form=MOZLBR&pc=MOZI&q={0}'.format(query),
                        headers={'user-agent': 'Descriptor/0.0.1'},
                        timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Init list
    links = []
    # Clean results
    for anchor in soup.find_all('a', attrs={'h': re.compile('ID=SERP.+')}, href=True):
        link = anchor['href']
        if not link.startswith('http'):
            continue
        # Filter on the host name only: a substring test on the whole URL would
        # also drop unrelated pages such as ".../climbing-plants"
        host = (urllib.parse.urlparse(link).hostname or '').lower()
        if any(host == domain or host.endswith('.' + domain) for domain in excluded_domains):
            continue
        links.append(link)
    return links

def SpanPredictor(span, pred_values=False):

    """
    Runs the trained BERT classifier on a span of text to decide whether
    it belongs to a species description or not.

    span        : Text to classify.
    pred_values : If True, also return the exponentiated model outputs
                  for the span next to the predicted class.
    """

    # Inference only, gradients are not needed
    with torch.no_grad():
        # Encode the text (truncated by the tokenizer when too long)
        encoded = tokenizer(span, return_tensors="pt", truncation=True)
        # Exponentiate the model outputs
        # (presumably log-probabilities — confirm against the model definition)
        scores = torch.exp(model(**encoded))
        # Index of the highest scoring class
        predicted_class = scores.argmax(1).item()

    if not pred_values:
        return predicted_class
    # Class together with the prediction values of the span
    return predicted_class, scores[0]
    
def VisualizeDoc(text, per_sentence=False, save=False):

    """
    Creates an HTML visualization (can be rendered in a notebook) by using the
    SpaCy Displacy. Every sentence is marked as an entity whose label is the
    second prediction value returned by SpanPredictor (formatted with three
    decimals) and whose color is taken from the 'Spectral' color map.

    text         : Text to visualize.
    per_sentence : Returns the visualization per sentence instead of a whole doc.
    save         : If True returns the html string, otherwise the result is
                   rendered directly and nothing is returned.
    """

    # nlp the text
    doc = nlp(text)
    # Init color map (attribute access also works on matplotlib versions
    # where cm.get_cmap is no longer available)
    cmap = cm.Spectral
    # Init color dict
    colors = {}
    # Init option dict
    options = {"ents": [],
               "colors": colors,
               "distance": 75}
    # Init matcher
    matcher = PhraseMatcher(nlp.vocab)
    # Loop over the sentences
    for sentence in doc.sents:

        # Get the prediction value
        prediction = SpanPredictor(str(sentence), pred_values=True)[1][1].numpy().item()

        # The formatted prediction doubles as match key and entity label
        # (kept in its own variable so the `text` argument is not overwritten)
        label = f'{prediction:.3f}'
        # Add the pattern
        # NOTE(review): this is the spaCy v2 call signature; spaCy v3 expects
        # matcher.add(key, [pattern]) — confirm against the installed version
        pattern = nlp(str(sentence))
        matcher.add(label, None, pattern)

        # Colorize the label
        colors[label] = matplotlib.colors.rgb2hex(cmap(prediction))
        # Register the new entity label
        options["ents"].append(label)

    # Match the entities in the doc
    matches = matcher(doc)
    # Reset the current ENTS
    doc.ents = ()
    # Loop over the matches
    for match_id, start, end in matches:
        # Add the sentence as an ENT
        span = Span(doc, start, end, label=match_id)
        try:
            doc.ents = list(doc.ents) + [span]
        except ValueError:
            # spaCy rejects overlapping entities; skip the conflicting match
            continue

    # Render either the individual sentences or the whole doc
    target = list(doc.sents) if per_sentence else doc

    if save:
        # Full standalone HTML page, returned as a string
        return displacy.render(target, style='ent', options=options, page=True, jupyter=False, minify=False)
    displacy.render(target, style='ent', options=options)
        
def colorize_prediction(sentence_list, tex=False):

    """
    Builds an HTML string in which every sentence is highlighted with a color
    that reflects its prediction value, followed by the value itself.

    sentence_list : List of sentence strings; each one is passed to SpanPredictor.
    tex           : If True also prints the matching LaTeX color definitions
                    and highlight commands.

    Returns the HTML string. The colors are scaled between the lowest and the
    highest prediction value within `sentence_list`.
    """

    # Local stdlib import so the function does not rely on an extra notebook import
    from html import escape

    # Get prediction values
    sentence_pred = [SpanPredictor(sent, pred_values=True)[1][1].item() for sent in sentence_list]
    # Get color map
    sentence_cmap = matplotlib.cm.BuGn

    template = """  <mark class="entity" style="
    background: {}; 
    padding: 0.4em 0.0em; 
    margin: 0.0em; 
    line-height: 2; 
    border-radius: 0.75em;
    ">{}    
    <span style="
    font-size: 0.8em; 
    font-weight: bold; 
    line-height: 1; 
    border-radius: 0.75em;
    text-align: justify;
    text-align-last:center;
    vertical-align: middle;
    margin-left: 0rem">
    </span>\n</mark>"""

    colored_string = ''

    # Tex lists
    tex_colors = []
    tex_text = []

    # Map the values
    normalized_and_mapped = matplotlib.cm.ScalarMappable(cmap=sentence_cmap).to_rgba(sentence_pred)
    # Color overlay the values
    for idx, (sentence, color, prediction) in enumerate(zip(sentence_list, normalized_and_mapped, sentence_pred)):

        sentence = f'{sentence} < {prediction:.3f} >'
        color = matplotlib.colors.rgb2hex(color)
        # The sentences come from scraped pages, so escape them before they
        # are placed inside the markup
        colored_string += template.format(color, escape(sentence, quote=False))

        ## TEX PART
        if tex:
            # Raw f-strings keep the LaTeX backslashes literal
            tex_colors.append(rf'\definecolor{{color{idx+1}}}{{HTML}}{{{color[1:]}}}')
            tex_text.append(rf'\sethlcolor{{color{idx+1}}}\hl{{{sentence}}}')

    if tex:
        print('Copy paste this in the .tex file')
        print('\n'.join(tex_colors))
        print('\n'.join(tex_text))

    return colored_string

## Load the data
Check which plant species have the most sentences and use these species for scraping

In [14]:
# Load POWO
#plants_list = pickle.load(open('../data/external/species_plants.pkl', 'rb'))

# Load keys
# NOTE: pickle.load can execute arbitrary code, only use it on files from a trusted source
# The context manager closes the file handle once the data is loaded
with open('../data/processed/descriptions_powo_PLANTS.pkl', 'rb') as plants_file:
    plants_dict = pickle.load(plants_file)
# The keys of the dict are used as search terms by the crawl loop
plants_list = list(plants_dict.keys())

In [16]:
#len(plants_list)

35198

In [20]:
# Sentences classified as description, per species: [(sentence, URL), ...]
data = collections.defaultdict(list)

# DEBUGGING: all search result URLs found per species
data_link = collections.defaultdict(list)
#data_with_source = collections.defaultdict(list)

query = 'description'

# The loop variable has to be `species`, the name used throughout the body
for species in tqdm(plants_list[10:12]):

    # Empty list
    search_links = []
    # Create the query: the exact species name (quoted) plus the query word
    species_q = species.replace(' ', '+')
    species_q = f'"{species_q}"+{query}'
    try:
        search_links += search_Duck(species_q)
        search_links += search_Bing(species_q)
    except Exception:
        # Skip the species when a search fails (e.g. connection timeout).
        # `Exception` instead of a bare except keeps the loop interruptible.
        continue
    # Drop duplicates
    search_links = list(set(search_links))
    # DEBUGGING
    data_link[species] += search_links

    # Loop over the URLs
    for URL in search_links:
        # Skip google archives
        if 'google' in URL:
            continue
        # PDF and TXT
        # TODO: continue for now, insert the text/pdf processor here
        if URL.endswith('txt') or URL.endswith('pdf'):
            continue

        try:
            page = requests.get(URL, timeout=5)
            # Skip PDF files for now (and responses that do not state a content type)
            # TODO: insert the pdf processor here
            content_type = page.headers.get('Content-Type')
            if content_type is None or content_type.startswith('application/pdf'):
                continue

            # Soup the result
            soup = BeautifulSoup(page.content, 'html.parser')

            # Pages without a title cannot be matched against the species name
            if soup.title is None:
                continue
            title_text = soup.title.text

            # Skip Embedded PDF's
            if 'pdf' in title_text.lower():
                continue

            # Only use the page if a word of the species name occurs in the title
            if not set(species.split()).intersection(title_text.split()):
                continue

            # Get text
            dirty_text = soup.get_text(". ", strip=True)
            # Clean and break into sents
            sentences = text_cleaner(dirty_text)
            # Loop over the individual sentences
            for sentence in sentences:
                # Create string object
                sentence_str = str(sentence)
                # Keep the sentence (with its source) when it is classified as description
                if SpanPredictor(sentence_str):
                    data[species].append((sentence_str, URL))

        except Exception:
            # Best effort: a page that fails to download or parse is skipped
            continue

100%|██████████████████████████████████████████| 2/2 [00:21<00:00, 10.65s/it]


In [27]:
# Debugging cell: list the https links found in `results`.
# NOTE(review): `results` is never assigned at top level in the visible cells (it only
# exists as a local inside search_Duck), so this presumably relied on leftover
# interactive state — confirm or remove before a Restart & Run All.
[result['href'] for result in results if result['href'].startswith('https')]

['https://robertvandevlasakker.com/',
 'https://robertvandevlasakker.medium.com/',
 'https://nl.linkedin.com/in/robertvandevlasakker',
 'https://robertvandevlasakker.medium.com/list/50c8b547dc29',
 'https://robertvandevlasakker.medium.com/followers',
 'https://medium.com/nerd-for-tech/cartoonize-images-with-python-10e2a466b5fb',
 'https://www.linkedin.com/in/robertvandewalle',
 'https://creepypasta.fandom.com/wiki/Robert_the_Doll',
 'https://au.linkedin.com/in/robert-van-de-berg-90362312',
 'https://uk.linkedin.com/in/robert-van-der-meer-1b13162']

In [36]:
# Debugging cell: show the de-duplicated result URLs as last assigned by the crawl loop
search_links

[]

In [None]:
#with open('../data/processed/scrapeddata_train_species_description_random_0-1000_PLANTS.pkl', 'wb') as f:
#    pickle.dump(data, f)

In [None]:
# `data_with_source` is never created (its definition is commented out); the
# (sentence, URL) tuples are stored in `data`. `.get` avoids adding an empty
# entry to the defaultdict when the species was not crawled.
data.get('Dombeya shupangae', [])

In [None]:
# `data_with_source` is never created (its definition is commented out); the
# (sentence, URL) tuples are stored in `data`. `.get` avoids adding an empty
# entry to the defaultdict when the species was not crawled.
data.get('Forpus passerinus', [])

In [None]:
# Spot check on a single page: download it, split it into sentences and
# highlight every sentence by its prediction value
URL = 'https://independent-travellers.com/namibia/c39_c43/41.php'
#URL = 'http://db.worldagroforestry.org//species/properties/Enterolobium_cyclocarpum'
page = requests.get(URL, timeout=5)
soup = BeautifulSoup(page.content, 'html.parser')
dirty_text = soup.get_text(". ", strip=True)
# Convert to plain strings, as the crawl loop does, before they reach the tokenizer
# (assumes text_cleaner may yield non-string sentence objects — confirm)
sentences = [str(sentence) for sentence in text_cleaner(dirty_text)]
# The highlighting helper in this notebook is called colorize_prediction
colored_string = colorize_prediction(sentences)
display(HTML(colored_string))