In [1]:
import numpy as np
import pandas as pd
import torch
import pickle
import torch.nn as nn
import glob
import transformers
from bs4 import BeautifulSoup
import requests
import re
import time
import pdfplumber
from tqdm import tqdm
import collections
from selenium import webdriver
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset

## Initialize Model

In [2]:
# Choose where tensors and the model live: the GPU when CUDA is usable,
# the CPU otherwise
from torch import cuda
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

In [3]:
# Fetch the tokenizer and the pretrained encoder from the same
# 'distilbert-base-uncased' checkpoint (the two loads are independent)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
class BERT(nn.Module):
    """Two-class span classifier: an encoder followed by a small dense head.

    The head takes the hidden state at position 0 of every sequence
    (768 features), maps it to 512 units with ReLU and dropout, and then
    to 2 outputs that are returned as log-probabilities.

    Parameters
    ----------
    bert : module
        Encoder called as ``bert(sent_id, attention_mask=mask)``; element 0
        of its result is indexed as ``[:, 0]`` and fed to a 768-input
        linear layer (a DistilBertModel in this notebook).
    """

    def __init__(self, bert):
        super().__init__()

        # Encoder handed in by the caller
        self.bert = bert

        # Classification head. The attribute names are kept as they are
        # because they form the keys of the saved state dict.
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)
        # LogSoftmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return the log-probabilities of the two classes per sequence."""
        encoder_out = self.bert(sent_id, attention_mask=mask)
        # Element 0 holds the hidden states; keep position 0 of each sequence
        first_token = encoder_out[0][:, 0]

        hidden = self.dropout(self.relu(self.fc1(first_token)))
        return self.softmax(self.fc2(hidden))

In [5]:
model = BERT(bert)
# push the model to the selected device (GPU when available, else CPU)
model = model.to(device)

# Load trained weights: try the Colab Google Drive mount first, then the
# local models folder.
model_save_name = 'model_weights.pt'
weight_sources = [
    ('Google Success', F"/content/gdrive/My Drive/{model_save_name}"),
    ('Local Success', "../models/" + model_save_name),
]

for success_message, path in weight_sources:
    try:
        # map_location lets weights that were saved on a GPU load on a
        # CPU-only machine (and the other way round)
        state_dict = torch.load(path, map_location=torch.device(device))
    except OSError:
        # Only a missing/unreadable file moves on to the next location.
        # Any other failure (corrupt file, mismatching keys) is a real
        # error and must not be reported as "no model found".
        continue
    model.load_state_dict(state_dict)
    print(success_message)
    break
else:
    # No candidate file could be opened; the head keeps its random init
    print('No pretrained model found.')

# Switch to inference mode (disables dropout); last expression shows the model
model.eval()

Local Success


BERT(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, 

## Crawler

In [6]:
def SpanPredictor(span, pred_values=False, threshold=False):

    """
    Uses a trained bert classifier to see if a span
    belongs to a species description or otherwise.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    span : str
        Text to classify. It is cleaned with a few regular expressions
        before being tokenized.
    pred_values : bool, default False
        When true, also return the per-class probabilities.
    threshold : bool, default False
        Accepted for backward compatibility; currently not used.

    Returns
    -------
    int or (int, torch.Tensor)
        Index of the most probable class (callers in this notebook keep
        spans whose class is 1). With ``pred_values`` the class
        probabilities of the span are returned as a second element.
    """

    # Clean text. Raw strings keep the regex escapes intact without relying
    # on Python passing unknown string escapes through.
    TextCleaner = (
        r'\(\d+.+?Close\n\t\n\)',  # "(<digits> ... Close" block ending in newline, tab, newline, ")"
        r'\[\d*\]',                # bracketed digits such as [12]
        r'\([^)]*\)',              # any parenthesised text
        r'<[^<]+>',                # tag-like text between < and >
    )

    for Cleaner in TextCleaner:
        span = re.sub(Cleaner, '', span, flags=re.DOTALL)

    # The tokenizer returns CPU tensors; the model may have been moved to a
    # GPU, so the inputs have to follow it.
    model_device = next(model.parameters()).device

    with torch.no_grad():
        # Tokenize input
        inputs = tokenizer(span, return_tensors="pt", truncation=True)
        # Predict class (the model returns log-probabilities)
        outputs = model(inputs['input_ids'].to(model_device),
                        inputs['attention_mask'].to(model_device))
        # Turn log-probabilities into probabilities
        exps = torch.exp(outputs)
        # Most probable class
        span_class = exps.argmax(1).item()

    if pred_values:
        # Hand the probabilities back on the CPU so callers can print/store them
        return span_class, exps[0].cpu()
    return span_class

In [7]:
# Quick sanity check of the classifier on one hand-written sentence
string = "The European robin has an orange bill and black wings.\n"

SpanPredictor(string, pred_values=True)

(0, tensor([0.8258, 0.1742]))

### Web part

#### LLIFLE

In [None]:
# URLs of the LLIFLE tree index pages (pages 1 to 7, 100 entries each)
tree_links_index = [
    'http://www.llifle.com/Encyclopedia/TREES/Species/all/{0}/100/'.format(i)
    for i in range(1, 8)
]

# Absolute links gathered from all index pages
tree_links = []

for index_pages in tqdm(tree_links_index):
    # Download and parse the index page
    URL = index_pages
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Every anchor on the page; the hrefs are site-relative
    tree_links_half = soup.find_all('a')
    hrefs = (anchor.get('href') for anchor in tree_links_half)

    # Keep the hrefs under /Encyclopedia/TREES/Family/ and make them absolute
    tree_links_temp = ['http://www.llifle.com' + href for href in hrefs
                       if href is not None
                       and href.startswith('/Encyclopedia/TREES/Family/')]
    tree_links.extend(tree_links_temp)

In [None]:
# Maps the page title (used as species name) to the spans classified as
# description text
data = collections.defaultdict(list)

# Loop over URLS
for tree_link in tqdm(tree_links):
    # Download and parse the page
    URL = tree_link
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    # A page without a <title> (e.g. an error page) has no usable species
    # name; skip it instead of aborting the whole crawl with an
    # AttributeError before anything has been saved.
    if soup.title is None:
        continue
    # Get species name
    species = soup.title.text.replace('\n', '')
    # Split the visible text into lines and drop empty / whitespace-only
    # ones, which carry nothing to classify
    page_list = [spans for spans in soup.getText().split('\n')
                 if spans.strip() != '']
    # Get prediction with BERT
    predictions = [SpanPredictor(span) for span in page_list]
    # Keep the spans predicted as class 1 (description)
    descriptions = [span for span, pred in zip(page_list, predictions) if pred == 1]
    # Only store species for which something was found
    if descriptions:
        data[species] = descriptions

with open('../data/processed/descriptions_trees_llifle.pkl', 'wb') as f:
    pickle.dump(data, f)

#### POWO

In [8]:
# Get tree families
URL = 'https://en.wikipedia.org/wiki/List_of_trees_and_shrubs_by_taxonomic_family'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

# Find all anchors on the page
wiki_links = soup.find_all('a')
wiki_prefix = '/wiki/'
hrefs = (anchor.get('href') for anchor in wiki_links)

# Keep article links whose name ends in 'eae' and cut off the '/wiki/'
# prefix. The prefix is removed by slicing: str.strip('/wiki/') would strip
# any of the characters '/', 'w', 'i', 'k' from BOTH ends of the string.
# The set drops duplicates.
tree_families = {href[len(wiki_prefix):] for href in hrefs
                 if href is not None
                 and href.startswith(wiki_prefix)
                 and href.endswith('eae')}

# Sort so the order is the same on every run (set order is arbitrary, and
# later cells address the derived links by position)
tree_families = sorted(tree_families)

In [21]:
# Init links
ipni_links = []
ipni_name_prefix = 'https://www.ipni.org/n/'

# Init driver
browser = webdriver.Safari()

try:
    time.sleep(0.5)

    for tree_familie in tqdm(tree_families):
        # Loop over search pages (at most 99 pages of 500 results per family)
        for i in range(1, 100):
            # create URL base
            page_url = 'https://www.ipni.org/?perPage=500&page={0}&q=family%3A{1}%2Cspecies%3A*'.format(i, tree_familie)
            # open webpage
            browser.get(page_url)

            # Give the page time to render its results
            time.sleep(0.5)

            # Read each href once (every get_attribute call is a round trip
            # to the browser). find_elements('tag name', ...) is the generic
            # locator API; the find_elements_by_* helpers were removed in
            # Selenium 4.
            hrefs = (elem.get_attribute("href")
                     for elem in browser.find_elements('tag name', 'a'))
            # Extract the name-record links on the page
            links = [href for href in hrefs
                     if href is not None
                     and href.startswith(ipni_name_prefix)]

            # An empty result page means the family is exhausted
            if not links:
                break
            # Append to ipni links
            ipni_links += links
finally:
    # Always end the browser session, also when the crawl fails half-way
    browser.quit()

with open('../data/processed/intermediate_ipni_links_trees.pkl', 'wb') as f:
    pickle.dump(ipni_links, f)

'\n# Init links\nipni_links = []\n\n# Init driver\nbrowser = webdriver.Safari()\n\ntime.sleep(0.5)\n\nfor tree_familie in tqdm(tree_families):\n    # Loop over search pages\n    for i in range(1, 100):\n        # create URL base\n        page_url = \'https://www.ipni.org/?perPage=500&page={0}&q=family%3A{1}%2Cspecies%3A*\'.format(i, tree_familie)\n        # open webpage\n        browser.get(page_url)\n        \n        time.sleep(0.5)\n        \n        # Extract links on the page\n        links = [elem.get_attribute("href") for elem in browser.find_elements_by_tag_name(\'a\') \n                    if elem.get_attribute("href") != None \n                    if elem.get_attribute("href").startswith(\'https://www.ipni.org/n/\')]\n        \n        if not links:\n            break\n        # Append to ipni links\n        ipni_links += links\n\nbrowser.close()\n\nwith open(\'../data/processed/intermediate_ipni_links_trees.pkl\', \'wb\') as f:\n    pickle.dump(ipni_links, f)\n'

In [30]:
# Maps "Genus species" (first two words of the page title) to the spans
# classified as description text
data = collections.defaultdict(list)

# Extract the IPNI ids. The URL prefix is removed by slicing:
# str.strip(prefix) would strip any of the prefix's CHARACTERS from both
# ends of the link instead of removing the prefix itself.
ipni_prefix = 'https://www.ipni.org/n/'
ipni_index = [link[len(ipni_prefix):] if link.startswith(ipni_prefix) else link
              for link in ipni_links]
# Create powo links
powo_links = ['http://powo.science.kew.org/taxon/urn:lsid:ipni.org:names:' + index for index in ipni_index]

# Init driver
browser = webdriver.Safari()

try:
    # Loop over the links (only the 100 links at positions 301100-301199)
    for powo_link in tqdm(powo_links[301100:301200]):
        # Navigate to page
        browser.get(powo_link)
        # Species name = first two words of the page title
        page_title = browser.title
        species = ' '.join(page_title.split(' ')[0:2])
        # Visible text of the whole page. find_element('xpath', ...) is the
        # generic locator API; find_element_by_xpath was removed in Selenium 4.
        page_text = browser.find_element('xpath', "/html/body").text
        # Split into lines, trim them, keep lines with more than one word
        stripped_lines = (text.strip() for text in page_text.split('\n'))
        text_list = [text for text in stripped_lines
                     if text != ''
                     and len(text.split(' ')) > 1]
        # Get prediction with BERT
        predictions = [SpanPredictor(span) for span in text_list]
        # Keep the spans predicted as class 1 (description)
        descriptions = [span for span, pred in zip(text_list, predictions) if pred == 1]
        # Only store species for which something was found
        if descriptions:
            data[species] = descriptions

        # Pause after EVERY page so the site is not hammered; previously the
        # pause was skipped whenever a page yielded no description
        time.sleep(0.1)
finally:
    # Always end the browser session, also when the crawl fails half-way
    browser.quit()

with open('../data/processed/descriptions_plants_powo.pkl', 'wb') as f:
    pickle.dump(data, f)

100%|████████| 100/100 [03:11<00:00,  1.92s/it]


In [31]:
# List the species names that ended up in the result dict
data.keys()

dict_keys(['Ceiba pentandra', 'Ceiba rubriflora', 'Ceiba speciosa'])

In [33]:
# Show the spans stored for one species as a spot check
data['Ceiba rubriflora']

['Tree up to 20\xa0m tall, foliage caducous when flowered; trunk ventricose, swollen near the base, when young presenting longitudinal green stripes, provided with stout conical woody prickles to 20\xa0mm long; vegetative branches short, aculeate, with leaves clustered toward the apex; flowering branches short, diverging from larger branches at an angle of c. 90°',
 'Leaves (3 –) 5-foliolate; petiole 25 – 75\xa0mm long, slightly widened at the base, covered by whitish wax at the ends; leaflets sessile, glabrous, narrowly elliptic, oblong-elliptic, obovate-oblong or ovate-lanceolate, 3 – 3.8 × longer than wide, apex acute, c. 12°, provided with c. 4 mm long, caducous aristae, base cuneate, c. 11°, margin entire, midrib prominent beneath, inconspicuous above; two basal leaflets 20 – 35 × 6 – 10\xa0mm, shorter than the three distal leaflets, these 45 – 85 × 11 – 25\xa0mm',
 'Stipules c. 3 × 1\xa0mm, triangular, early caducous',
 'Flowers 48 – 53\xa0mm long; calyx 17 – 20 (23) × 11 – 15\xa

### PDF part

In [None]:
# Open a PDF file
with pdfplumber.open("../data/external/Trees of Peru.pdf") as pdf:
    # Get page (index 29)
    page = pdf.pages[29]
    page_width = float(page.width)
    page_height = float(page.height)

    # Split the page into a left and a right half over its full height
    left = page.crop((0, 0.0, 0.5 * page_width, page_height))
    right = page.crop((0.5 * page_width, 0.0, page_width, page_height))

    # Extract text of the left half; fall back to '' when nothing is
    # returned so the string operations below cannot fail on None
    text = left.extract_text() or ''
    # Split on \n
    text_list = text.split('\n')
    # Join the lines with a space: joining with '' glues the last word of a
    # line to the first word of the next and hides sentence ends that fall
    # on a line break from the '. ' split below
    text_page = ' '.join(line.strip() for line in text_list)
    # Split on points
    span_list = text_page.split('. ')
    

In [None]:
# Inspect the lines extracted from the left half of the page
text_list

In [None]:
# Page text with every line break removed
text1 = ''.join(text.split('\n'))

In [None]:
# Inspect the page text without line breaks
text1

In [None]:
# Time the classification of all spans. perf_counter is a monotonic,
# high-resolution clock meant for measuring intervals; time.time() can jump
# when the system clock is adjusted.
start = time.perf_counter()
predictions = [SpanPredictor(span, pred_values=True) for span in span_list]
end = time.perf_counter()
print("Time consumed in working: ", end - start)

In [None]:
# Inspect the (class, probabilities) result for every span
predictions

In [None]:
# Pair every span with its prediction as (span, prediction) tuples
data = list(zip(span_list, predictions))

In [None]:
# Inspect the (span, prediction) pairs
data