In [2]:
import numpy as np
import pickle
import glob
from bs4 import BeautifulSoup
import re
import time
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
import transformers
import torch
import pdfplumber
from tqdm import tqdm
import collections
from transformers import DistilBertTokenizer

import sys

sys.path.insert(0, '../src/models/')
import predict_model

In [3]:
# Load the classifier through the project helper `predict_model.loadBERT`,
# passing the model directory and the weights file name. `model` is read as a
# global by SpanPredictor.
# NOTE(review): presumably this restores fine-tuned DistilBERT weights for
# inference — confirm against src/models/predict_model.py.
model = predict_model.loadBERT("../models/", 'model_weights_splitted_reducednegatives.pt')
# Load the pretrained uncased DistilBERT tokenizer; also read as a global by
# SpanPredictor.
# NOTE(review): assumed to be the tokenizer the weights were trained with — confirm.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CPU Success


In [4]:
def SpanPredictor(span, pred_values=False, threshold=False):
    """
    Uses a trained bert classifier to see if a span
    belongs to a species description or otherwise.

    The span is first stripped of bracketed references, parenthesised
    remarks and markup tags, then tokenized with the global `tokenizer`
    and scored with the global `model`.

    Parameters
    ----------
    span : str
        Raw text to classify.
    pred_values : bool, optional
        When true, the per-class scores are returned next to the class.
    threshold : optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    int
        Index of the highest scoring class (callers in this notebook
        treat a truthy value as "is a description").
    tuple of (int, torch.Tensor)
        Returned instead when `pred_values` is true: the class index and
        the exponentiated model output for the span.
    """

    # Clean text. Raw strings keep the regex escapes (\(, \d, \[) away from
    # Python's own string-escape handling, which warns on them otherwise.
    TextCleaner = [
        r'\(\d+.+?Close\n\t\n\)',
        r'\[\d*\]',
        r'\([^)]*\)',
        r'<[^<]+>',
    ]

    for Cleaner in TextCleaner:
        span = re.sub(Cleaner, '', span, flags=re.DOTALL)

    # Inference only: no gradients needed
    with torch.no_grad():
        # Tokenize input; over-long spans are truncated by the tokenizer
        inputs = tokenizer(span, return_tensors="pt", truncation=True)
        # Predict class
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        # Get prediction values
        # NOTE(review): exp() assumes the model emits log-probabilities — confirm.
        exps = torch.exp(outputs)
        # Get class
        span_class = exps.argmax(1).item()

    # Optionally hand back the prediction values as well
    if pred_values:
        return span_class, exps[0]
    return span_class

In [10]:
# Smoke test: classify one hand-written sentence and display the predicted
# class together with the per-class prediction values.
string = """
The European has a black bill and orange breast.
"""
SpanPredictor(string, pred_values=True)

(0, tensor([0.9602, 0.0398]))

## Gentry Woody Plants

In [None]:
# Init regex pattern: the species-count marker that follows a genus name,
# e.g. "(12 spp...)". Raw string so the regex escapes are not string escapes.
pattern = r'\(\d+\sspp[^)]*\)'
# Init dict: genus name -> list of spans classified as descriptions
data_pdf = collections.defaultdict(list)

# Open a PDF file
with pdfplumber.open("../data/external/Gentry_woodyplants.pdf") as pdf:

    # Skip first pages; stop at page 200 or at the end of a shorter document
    #for i in tqdm(range(4, len(pdf.pages) -1)):
    for page_number in tqdm(range(4, min(200, len(pdf.pages)))):
        # Get page
        page = pdf.pages[page_number]
        half_width = 0.5 * float(page.width)
        full_height = float(page.height)

        # Split the page into its left and right half
        left = page.crop((0, 0, half_width, full_height))
        right = page.crop((half_width, 0, page.width, full_height))

        for part in (left, right):

            # Extract the text of this half only (not of the whole page,
            # which would process every page twice)
            text = part.extract_text()
            if not text:
                # Nothing extractable in this half
                continue
            # Drop the line breaks and join the lines
            text_page = ''.join(text.split('\n'))
            # Search for the patterns and index
            split_index = [m.start(0) for m in re.finditer(pattern, text_page)]
            if not split_index:
                # No genus marker in this half
                continue
            # Insert zero so the text before the first marker is kept
            split_index.insert(0, 0)
            # Split on the found indices
            text_page_indices = [text_page[start:stop].strip()
                                 for start, stop in zip(split_index, split_index[1:] + [None])]
            # Extract species: the last word in front of each marker.
            # None marks a block without any word (marker at the very start).
            species_list = []
            for text_block in text_page_indices[:-1]:
                words = text_block.split()
                species_list.append(words[-1] if words else None)
            # Clean the text: drop the marker and leading dashes/spaces
            text_cleaned = [re.sub(pattern, '', span).lstrip('- ')
                            for span in text_page_indices[1:]]
            # Remove species: every span but the last one ends with the name
            # that belongs to the next marker
            text_cleaned = [
                span.replace(next_species, '')
                for span, next_species in zip(text_cleaned[:-1], species_list[1:])
            ] + text_cleaned[-1:]

            for span, species in zip(text_cleaned, species_list):
                if species is None:
                    # No name found for this span
                    continue
                # Get predictions; stay best-effort but report what is skipped
                try:
                    is_description = SpanPredictor(span)
                except Exception as error:
                    tqdm.write(f"page {page_number}: skipped span of '{species}' ({error})")
                    continue
                if is_description:
                    data_pdf[species].append(span)

with open('../data/processed/description_pdf_Gentry_woodyplants_trees.pkl', 'wb') as f:
    pickle.dump(data_pdf, f)

In [62]:
# Init list
book_list = []

# Vertical distance above which two characters count as different lines
# NOTE(review): values are in the units of pdfplumber's "top" — presumably PDF points.
LINE_GAP = 5
# Vertical distance from which a line gap counts as a paragraph break ("enter")
PARAGRAPH_GAP = 20

# Open a PDF file
with pdfplumber.open("../data/external/Gentry_woodyplants.pdf") as pdf:

    # Only a small sample of pages is processed here
    #for i in tqdm(range(4, len(pdf.pages) -1)):
    for page_number in tqdm(range(19, 21)):
        # Get page
        page = pdf.pages[page_number]

        # Clip top and split page
        left = page.crop((0, 0.1 * float(page.height), 0.5 * float(page.width), 1.0 * float(page.height)))
        right = page.crop((0.5 * float(page.width), 0.1 * float(page.height), page.width, 1.0 * float(page.height)))

        # Loop over left and right.
        for page_part in (left, right):
            # Extract text; an empty half yields no text at all
            text_part = page_part.extract_text() or ''

            # Collect every distinct (integer) vertical position in first-seen order
            line_tops = []
            seen_tops = set()
            for each_char in page_part.chars:
                top = int(each_char["top"])
                if top not in seen_tops:
                    seen_tops.add(top)
                    line_tops.append(top)

            # Get the difference between consecutive positions, keeping only
            # real line gaps (the first line is no longer compared to the last)
            pos_diff = [below - above
                        for above, below in zip(line_tops, line_tops[1:])
                        if below - above > LINE_GAP]
            # Add end number: the final line never closes with a paragraph break
            pos_diff.append(LINE_GAP)

            # Check enters: glue the lines together until a paragraph gap follows
            span_list = []
            current_span = ''
            for position, text in zip(pos_diff, text_part.split('\n')):
                current_span += text
                if position >= PARAGRAPH_GAP:
                    span_list.append(current_span)
                    current_span = ''
            # Keep whatever is left after the last paragraph break
            span_list.append(current_span)

            # Remove empties and single words, then append to booklist
            book_list += [span.strip() for span in span_list if len(span.split()) > 1]

100%|█████████████████████████████████████████████| 2/2 [00:00<00:00,  2.35it/s]


In [None]:
# Display the spans collected from the sample pages for a visual check
book_list

In [None]:
#Gentry_woodyplants = pickle.load(open('../data/processed/description_pdf_Gentry_woodyplants_trees.pkl', 'rb'))

#len(Gentry_woodyplants.keys())
#Gentry_woodyplants.keys()
#Gentry_woodyplants['Agave'] 

## Trees of Peru

In [63]:
# Init list
book_list_cleaned = []
# Init counter
contents_counter = 0
# Init patterns (raw strings keep the regex escapes out of Python's string escapes)
TextCleaner = [
    r'\(\d+.+?Close\n\t\n\)',
    r'\[\d*\]',
    r'\([^)]*\)',
    r'<[^<]+>',
]
# Numbered family/genus heading, e.g. "12. PALMAE"; group 1 is the name.
# [A-Za-z] instead of [A-z], which also matches [ \ ] ^ _ and the backtick.
heading_pattern = re.compile(r'\d+\.\s*([A-Za-z]+)')
# Debugging aid: also store the prediction values of rejected spans.
# Set to False to keep accepted descriptions only.
KEEP_REJECTED_SCORES = True

with pdfplumber.open("../data/external/Trees of Peru.pdf") as pdf:

    #print(len(pdf.pages))

    book_list = []

    for i in tqdm(range(29, 31)):
        page = pdf.pages[i]

        # Cut off the top and bottom margin and split into two parts at 60% width
        left = page.crop((0, 0.05 * float(page.height), 0.6 * float(page.width), 0.85 * float(page.height)))
        right = page.crop((0.6 * float(page.width), 0.05 * float(page.height), page.width, 0.85 * float(page.height)))

        # Extract text; an empty part yields no text at all
        text_left = left.extract_text() or ''
        text_right = right.extract_text() or ''
        text = text_left + '\n' + text_right
        text_list = text.split('\n')

        # Append to booklist
        book_list += text_list

for sentence in book_list:
    # Clean the text
    for Cleaner in TextCleaner:
        sentence = re.sub(Cleaner, '', sentence, flags=re.DOTALL)

    # Drop useless figures and content pages
    if sentence.startswith('Fig'):
        # Figure caption: replaced by a placeholder
        book_list_cleaned.append('-_-')
    elif re.match(r'Key to the genera', sentence):
        # Start of a key: blank out the lines that follow while the counter is positive
        contents_counter = 4
        book_list_cleaned.append('-_-')
    elif re.match(r'1\.', sentence) and contents_counter != 3:
        # A line starting with "1." ends the blanking, unless it directly
        # follows the key title (counter is 3 at that point)
        contents_counter = 0
        book_list_cleaned.append(sentence)
    elif contents_counter > 0:
        # Check if still in contents page
        if re.match(r'\d+\.', sentence):
            # Another numbered entry: extend the blanking
            contents_counter += 4
        book_list_cleaned.append('')
    else:
        book_list_cleaned.append(sentence)
    # Update counter
    contents_counter -= 1

# Init dict: family/genus name -> accepted spans (plus scores when debugging)
data_pdf = collections.defaultdict(list)

# Index the found families
name_index = [count for count, span in enumerate(book_list_cleaned)
              if heading_pattern.match(span)]

# counter: position of the next heading in name_index
current = 0
# Name of the family/genus the current spans belong to
name = None
# Loop over the spans
for idx, span in enumerate(tqdm(book_list_cleaned)):
    # Check the index; the bounds check replaces the former catch-all except
    if current < len(name_index) and idx == name_index[current]:
        # Clean the family/genus name
        name = heading_pattern.match(span).group(1).capitalize()
        # Update counter
        current += 1

    # Skip no family/genus
    if name is None:
        continue

    # One classifier call gives both the class and the prediction values
    span_class, scores = SpanPredictor(span, pred_values=True)
    # Add if descriptions
    if span_class:
        data_pdf[name].append(span)
    elif KEEP_REJECTED_SCORES:
        ### DEBUGGING
        data_pdf[name].append(scores)

100%|█████████████████████████████████████████████| 2/2 [00:00<00:00,  2.02it/s]
100%|█████████████████████████████████████████| 166/166 [00:10<00:00, 15.94it/s]


In [64]:
# Display the family/genus names that were collected as keys
data_pdf.keys()

dict_keys(['Palmae', 'Chelyocarpus', 'Itaya', 'Mauritia'])

In [65]:
# Display everything stored for one genus; while the debugging branch is
# active this mixes accepted spans with the score tensors of rejected spans
data_pdf['Itaya'] 

[tensor([0.9835, 0.0165]),
 tensor([0.9969, 0.0031]),
 tensor([0.9919, 0.0081]),
 tensor([0.9942, 0.0058]),
 'Unarmed trees, stems solitary; leaf sheath densely woolly and longitudinally split, peti-',
 'ole long, with smooth margins, leaves palmate, 11-25, blade orbicular in outline and split',
 'into wedge-shaped induplicate segments with serrate tips; inflorescence axillary, pendulous,',
 'branched to 2orders, subtended by woolly bracts, inflorescence branches numerous; flowers',
 'solitary, bisexual; sepals 3, fused; petals 3, fused to halfway, valvate; stamens 18-24; ovary',
 tensor([0.5749, 0.4251]),
 tensor([0.9932, 0.0068]),
 tensor([0.9975, 0.0025]),
 'The genus isclose to Chelyocarpus but differs inthe split leaf sheath, partly fused sepals',
 tensor([0.6710, 0.3290]),
 tensor([0.9601, 0.0399]),
 'orbicular in outline, c2m dia., split to the base into 10-15 segments, undersurface whitish,',
 'flowers creamish white, fruit 2-2,5 cm long, greenish yellow; known only from near I

In [None]:
'''
def replace_ending(sentence):
    
    """
    Replace the ending of a string.
    """
    
    if sentence.endswith('.'):
        return sentence[:-len('.')] + ' KAAAAS'
    return sentence

pattern = r"[A-Z][A-Z]+AE"

# Init
data_pdf = collections.defaultdict(list)

with pdfplumber.open("../data/external/Trees of Peru.pdf") as pdf:
    
    #print(len(pdf.pages))
    
    book_list_cleaned = []
    
    for i in tqdm(range(27, 40)):
        page = pdf.pages[i]
        
    
        left = page.crop((0, 0.05 * float(page.height), 0.6 * float(page.width), 0.85 * float(page.height)))
        right = page.crop((0.6 * float(page.width), 0.05 * float(page.height), page.width, 0.85 * float(page.height)))


        # Extract text
        text_left = left.extract_text()
        text_right = right.extract_text()
        text = text_left + '\n' + text_right
        
        # Clean the text
        text_newline_replaced = text.replace('\n', ' \n ')
        # Rejoin multilines
        text_joined_multilines = text_newline_replaced.replace('- \n ', '')
        # Split into list
        text_list = text_joined_multilines.split(' \n ')
        # Clean list
        text_list_cleaned = [line for line in text_list 
                             if not line.startswith('Fig.')]
                             #if not len(re.split('- |, ', line)) == 2
                             #if len(line.split(' ')) > 1]

        # Replace the ending to get spans
        text_list_endings_replaced = [replace_ending(line) for line in text_list_cleaned]
        # Rejoin for spans
        text_cleaned = ' '.join(text_list_endings_replaced).split(' KAAAAS')
    
        # Remove contents
        book_list_cleaned += [span.strip() for span in text_cleaned if span.count('.') < 10]
        
    # Index the found families
    family_index = [count for count, span in enumerate(book_list_cleaned) if re.findall(pattern, span)]

    # counter
    current = 0 
    # Loop over the spans
    for idx, span in enumerate(tqdm(book_list_cleaned)):
        # Skip no family
        if idx < family_index[0]:
            continue
        try:
            # Check the index
            if idx == family_index[current]:
                # Clean the family name
                family_name = re.findall(pattern, book_list_cleaned[idx])[0].lower().capitalize()
                # Update counter
                current += 1
            # Add if descriptions
            #if SpanPredictor(span):
            #    data_pdf[family_name].append(span)

            ##### TESTING
            data_pdf[family_name].append(tuple([span, SpanPredictor(span, pred_values=True)]))
        # Catch the end exeption
        except:
            #if SpanPredictor(span):
            #    data_pdf[family_name].append(span)
            
            ##### TESTING
            data_pdf[family_name].append(tuple([span, SpanPredictor(span, pred_values=True)]))
'''