In [1]:
import collections
import glob
import pickle
import re
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import pdfplumber
import requests
import torch
import torch.nn as nn
import transformers
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel

import sys

sys.path.insert(0, '../src/models/')
import predict_model

## Initialize Model

In [2]:
# Build the classifier through predict_model.loadBERT (directory, weights file)
weights_dir = "../models/"
weights_file = 'model_weights_splitted_reducednegatives.pt'
model = predict_model.loadBERT(weights_dir, weights_file)

# Fetch the pretrained DistilBERT (uncased) tokenizer used to encode spans
pretrained_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CPU Success


In [3]:
def SpanPredictor(span, pred_values=False, threshold=False):
    """
    Classify a text span with the notebook's BERT classifier.

    The span is first cleaned of bracketed/parenthesised/tag-like noise and
    then run through the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    span : str
        Raw text to classify.
    pred_values : bool, optional
        When true, also return the exponentiated model outputs for the span.
    threshold : optional
        Not used by this function; kept so existing call sites stay valid.

    Returns
    -------
    int or tuple
        Index of the highest-scoring class (the crawler cells in this
        notebook keep spans whose class is 1), or
        ``(class_index, exps[0])`` when ``pred_values`` is true.
    """
    # Raw strings: these patterns contain regex escapes (\(, \d, \[) that are
    # not valid *string* escapes and raise a SyntaxWarning on recent Python.
    # As raw strings the regex engine resolves \n and \t itself, so the
    # patterns match exactly the same text as before.
    text_cleaners = [
        r'\(\d+.+?Close\n\t\n\)',  # "(<digits> ... Close<newline><tab><newline>)" blocks
        r'\[\d*\]',                # bracketed numbers such as [12]
        r'\([^)]*\)',              # any parenthesised text
        r'<[^<]+>',                # angle-bracket tags
    ]
    for pattern in text_cleaners:
        span = re.sub(pattern, '', span, flags=re.DOTALL)

    # Inference only: no gradients needed
    with torch.no_grad():
        # Encode the span; truncation keeps long spans within the model limit
        inputs = tokenizer(span, return_tensors="pt", truncation=True)
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        # presumably the model emits log-probabilities, hence exp() - verify
        # against predict_model
        exps = torch.exp(outputs)
        span_class = exps.argmax(1).item()

    if pred_values:
        return span_class, exps[0]
    return span_class

## Crawler

In [None]:
# Smoke test: one hand-written sentence to run through the classifier.
string = """
The European has a black bill and orange breast.
"""

# pred_values=True makes SpanPredictor return (class, exps[0]) rather than
# only the class, so the cell output shows the scores as well.
SpanPredictor(string, pred_values=True)

### Web part

#### LLIFLE

In [None]:
# URLs of the paginated LLIFLE tree-species index, pages 1-7
# (the trailing /100/ is presumably the page size - confirm against the site)
tree_links_index = [
    'http://www.llifle.com/Encyclopedia/TREES/Species/all/{0}/100/'.format(i)
    for i in range(1, 8)
]

# Absolute species-page URLs gathered from every index page
tree_links = []

for index_page in tqdm(tree_links_index):
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl
        page = requests.get(index_page, timeout=30)
        # Do not parse error pages as if they were index pages
        page.raise_for_status()
    except requests.RequestException as err:
        # Best effort: report the failed index page and move on
        print('Skipping {0}: {1}'.format(index_page, err))
        continue

    soup = BeautifulSoup(page.content, 'html.parser')

    # hrefs on the index are site-relative; keep the ones under
    # /Encyclopedia/TREES/Family/ and prefix the host to make them absolute
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is not None and href.startswith('/Encyclopedia/TREES/Family/'):
            tree_links.append('http://www.llifle.com' + href)

In [None]:
# species name -> list of description lists (one inner list per crawled page)
data = collections.defaultdict(list)

for tree_link in tqdm(tree_links):
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl
        page = requests.get(tree_link, timeout=30)
        page.raise_for_status()
    except requests.RequestException as err:
        # Best effort: a single bad page must not discard everything collected so far
        print('Skipping {0}: {1}'.format(tree_link, err))
        continue

    soup = BeautifulSoup(page.content, 'html.parser')

    # The page title is used as the species key; without a <title> there is
    # nothing to key on, so skip instead of failing on None
    if soup.title is None:
        continue
    species = soup.title.text.replace('\n', '')

    # One candidate span per line of page text; blank and whitespace-only
    # lines are dropped so they are not sent through the classifier
    page_list = [span for span in soup.getText().split('\n') if span.strip()]

    # Keep the spans the classifier labels as class 1 (description)
    descriptions = [span for span in page_list if SpanPredictor(span) == 1]

    # Only record species for which at least one description was found
    if descriptions:
        data[species].append(descriptions)

with open('../data/processed/descriptions_trees_llifle.pkl', 'wb') as f:
    pickle.dump(data, f)

#### POWO

In [None]:
# POWO taxon URLs harvested from the IPNI search result pages
powo_links = []

# Init driver
browser = webdriver.Safari()

try:
    time.sleep(0.5)

    # Loop over search pages
    for i in tqdm(range(1, 2573)):
        # The page number must be the *value* of the `page` parameter.
        # The earlier 'page{0}=' produced 'page1=', 'page2=', ... so the
        # requested page number was never sent under the name `page`.
        page_url = 'https://www.ipni.org/?perPage=500&page={0}&q=species%3A*'.format(i)

        # open webpage and give it a moment to load before reading anchors
        browser.get(page_url)
        time.sleep(0.5)

        # find_elements(By.TAG_NAME, ...) replaces find_elements_by_tag_name,
        # which no longer exists in Selenium >= 4.3. Each get_attribute call
        # is a round trip to the browser, so read every href exactly once.
        hrefs = (elem.get_attribute("href")
                 for elem in browser.find_elements(By.TAG_NAME, 'a'))
        powo_links += [href for href in hrefs
                       if href is not None
                       and href.startswith('http://powo.science.kew.org/taxon')]
finally:
    # Always release the Safari session, even if the crawl is interrupted
    browser.quit()

with open('../data/processed/intermediate_powo_links.pkl', 'wb') as f:
    pickle.dump(powo_links, f)

  5%|█▋                                 | 122/2572 [1:03:01<21:17:04, 31.28s/it]

In [None]:
# species name -> list of <dd> texts the classifier accepted
data_web = collections.defaultdict(list)

for powo_link in tqdm(powo_links):
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl
        page = requests.get(powo_link, timeout=30)
        page.raise_for_status()
    except requests.RequestException as err:
        # Best effort: report the failed page and keep what was collected
        print('Skipping {0}: {1}'.format(powo_link, err))
        continue

    soup = BeautifulSoup(page.content, 'html.parser')

    # The species key is the part of the page title before ' | '; pages
    # without a <title> are skipped instead of failing on None
    if soup.title is None:
        continue
    species = soup.title.text.split(' | ')[0]

    # Candidate spans are the <dd> elements of the page
    for dd in soup.find_all('dd'):
        span = dd.text.strip()
        # Empty entries carry no text to classify
        if span and SpanPredictor(span):
            data_web[species].append(span)

# Persist the collected descriptions. The earlier version dumped
# `powo_links` here, so the descriptions were never written to disk.
with open('../data/processed/description_web_powo.pkl', 'wb') as f:
    pickle.dump(data_web, f)

In [None]:
# Disabled code: everything below sits inside one string literal, so none of it
# executes; the cell merely evaluates to that string.
# The archived code describes this pipeline: collect family names from a
# Wikipedia list, harvest ipni.org name links with Selenium, rewrite them as
# POWO taxon URLs, then classify each POWO page's text lines with SpanPredictor.
# NOTE(review) - issues to address before reviving it:
#   - str.strip('/wiki/') and str.strip('https://www.ipni.org/n/') remove any of
#     the listed characters from both ends, not a prefix, so family names and
#     ids can be mangled.
#   - the query string 'page{0}=' formats to 'page1=' rather than 'page=1', and
#     the family name is not part of the active URL.
#   - results are appended to `data`, the same dict the LLIFLE cell fills.
'''
# Get tree families
URL = 'https://en.wikipedia.org/wiki/List_of_trees_and_shrubs_by_taxonomic_family'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

# Find all wikiparts
wiki_links = soup.find_all('a')
# Create links 
tree_families = [pages.get('href') for pages in wiki_links 
                       if pages.get('href') != None 
                       if pages.get('href').startswith('/wiki/')
                       if pages.get('href').endswith('eae')]

# Drop duplicates
tree_families = list(set(tree_families))
# Clean list
tree_families = [trees.strip('/wiki/') for trees in tree_families]

# Init links
ipni_links = []

# Init driver
browser = webdriver.Safari()

time.sleep(0.5)

for tree_familie in tqdm(tree_families):
    # Loop over search pages
    for i in range(1, 2):
        # create URL base
        #page_url = 'https://www.ipni.org/?perPage=500&page={0}&q=family%3A{1}%2Cspecies%3A*'.format(i, tree_familie)
        page_url = 'https://www.ipni.org/?perPage=500&page{0}=&q=species%3A*'.format(i)
        
        # open webpage
        browser.get(page_url)
        
        time.sleep(0.5)
        
        # Extract links on the page
        links = [elem.get_attribute("href") for elem in browser.find_elements_by_tag_name('a') 
                    if elem.get_attribute("href") != None 
                    if elem.get_attribute("href").startswith('https://www.ipni.org/n/')]
        
        if not links:
            break
        # Append to ipni links
        ipni_links += links


browser.close()

with open('../data/processed/intermediate_ipni_links_trees.pkl', 'wb') as f:
    pickle.dump(ipni_links, f)
    
ipni_links = pickle.load(open('../data/processed/intermediate_ipni_links_trees.pkl', 'rb'))

# Extract the indices
ipni_index = [link.strip('https://www.ipni.org/n/') for link in ipni_links]
# Create powo links
powo_links = ['http://powo.science.kew.org/taxon/urn:lsid:ipni.org:names:' + index for index in ipni_index]


# Init driver
browser = webdriver.Safari()

# Loop over the links
for powo_link in tqdm(powo_links):
    # Navigate to page
    browser.get(powo_link)
    
    time.sleep(0.1)
    
    # Get title page
    page_title = browser.title
    # Create species
    species = page_title.split(' | ')[0]
    #print(species)
    # Get text page and clean it
    page_text = browser.find_element_by_xpath("/html/body").text
    # Clean
    text_list = page_text.split('\n')
    text_list = [text.strip() for text in text_list 
                         if text.strip() != '' 
                         if len(text.strip().split(' ')) > 1]
    # Get prediction with BERT
    predictions = [SpanPredictor(span) for span in text_list]
    # Extract data that match description
    descriptions = [span for span, pred in zip(text_list, predictions) if pred == 1]
    # If data found add to dict
    if descriptions:
        # Add data
        data[species].append(descriptions)
    else:
        continue
            
browser.close()   
with open('../data/processed/descriptions_plants_powo.pkl', 'wb') as f:
    pickle.dump(data, f)    
'''