# Extraction of plant species names from seedlists in PDF format

In [1]:
import json
import pandas as pd
import re
import os
import anthropic
from pypdf import PdfReader
from IPython.display import display, HTML

In [2]:
# Create the API client.
# The key is read from the ANTHROPIC_API_KEY environment variable so that the
# secret is never stored in the notebook source or its version history.
client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
)

In [3]:
# Show complete cell contents and every row when a dataframe is rendered
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

In [4]:
# Input seedlist file path
input_path = "../data/raw/LI/LI_7.pdf"

# Output path to store extracted species names in csv format
output_path = '../data/processed/claude/pdf'

# Create output directory if it doesn't exist.
# exist_ok=True makes this a single idempotent call, so there is no window
# between checking for the directory and creating it.
os.makedirs(output_path, exist_ok=True)

In [5]:
# Extract filename
# basename/splitext honour the platform's path separators and strip only the
# final extension, so a name such as "LI.7.pdf" yields "LI.7" rather than "LI".
filename_with_extension = os.path.basename(input_path)
filename = os.path.splitext(filename_with_extension)[0]

In [6]:
# Instructions for Claude.
# This text is sent verbatim as the first part of the prompt; the seedlist page
# text is appended after it. It asks for a JSON list of objects with the single
# key "species", or an empty list when no species are present.
instructions = '''
You are an expert botanist with deep knowledge of Latin names in binomial nomenclature of plant species.
You have knowledge of plant family names, subspecies, varieties and forms.

The species name in binomial nomenclature consists of two parts:
the genus name starting with a capital letter,
followed by the epithet name which normally starts with a small letter.

This is often followed by a subspecies name (starting with "subsp. " or "ssp. "),
or variety name (starting with "var."), or form (starting with "f.") and the author name(s).
Sometimes this is followed by a synonym (starting with "syn.").

Some species names are also followed by a cultivar name (almost always within single quotation marks),
or by the abbreviation "CV" if the cultivar has no formal name.

The author name is often simply "L.", for "Linnaeus". For other authors it is often an abbreviation
describing the author name.

Thus the species name consists of the genus, epithet, subspecies name (if present), variety name (if present),
form name (if present), cultivar name (if present), author name(s) (if present) and synonym (if present).

You are given the text from a page from a seedlist of plants from a botanical garden.
Determine if there are any Latin names of plant species present in this text.
Find each and every plant species entry present in the text.
Do not skip or miss any entry.

If any plant species names are present in this text, find, for each plant species present in the text,
the species name in Latin, consisting of the genus, epithet, subspecies name (if present),
variety name (if present), form name (if present), cultivar name (if present), author name(s) (if present),
and synonym (if present).

Return the result for each plant species in valid JSON object format, with the single key "species".
Provide the complete results as a list of valid JSON objects.
If there is no information about plant species on this page, print an empty list, like this: []
Print only the complete list of valid JSON objects, and no other code, text, or explanation.
'''

In [7]:
# Open the PDF and pull the text of its first page
reader = PdfReader(input_path)
first_page = reader.pages[0]
seedlist_text = first_page.extract_text()

In [8]:
# Text of seedlist: echo the extracted page text as the cell output so it can
# be inspected
seedlist_text

'776Asteraceae Liatris elegans Michx. XX-0-LI-7086\n77Asteraceae Liatris punctata Hook. XX-0-LI-10164\n78Asteraceae Liatris spicata Willd. XX-0-LI-7094\n79Asteraceae Liatris spicata Willd. XX-0-LI-10268\n80Asteraceae Onopordum algeriense Pomel XX-0-LI-6994\n81Asteraceae Onopordum bracteatumBoiss. \n& Heldr.XX-0-\nSTGAL-130/1981\n82Asteraceae Osteospermum hyoseroides(DC.) \nNorl.XX-0-LI-14339\n83Asteraceae Pilosella aurantiaca(L.) F.W.\nSchultz & \nSch.Bip.XX-0-LI-11633\n84Asteraceae Scorzonerasuberosa \nssp. cariensis(Boiss.)\nD.F.\nChamb.XX-0-LI-13055\n85Asteraceae Senecio alpinus Scop. XX-0-LI-7426\n86Asteraceae Silphium perfoliatum L. XX-0-LI-10168\n87Asteraceae Solidago virgaurea L. XX-0-LI-14222\n88Asteraceae Stockesia laevis Greene XX-0-LI-7111\n89Asteraceae Tanacetum corymbosum(L.) \nSch.Bip.IT-0-SIENA-A-10\n90Asteraceae Telekia speciosa(Schreb.) \nBaumg.XX-0-LI-8899A\n91Berberidaceae Berberis koreana PALIB. XX-0-LI-7898\n92Bignoniaceae Incarvillea delavayiBureau \net FranchXX-0

In [9]:
# Build the prompt: the instructions, a separator label, then the page text
prompt = ''.join((instructions, ' Seedlist text: ', seedlist_text))

In [10]:
# Set up function for API call
def get_completion(prompt, model="claude-3-opus-20240229", max_tokens=4096,
                   temperature=0.0, api_client=None):
    """Send a single-turn user prompt to the Messages API and return the reply text.

    Args:
        prompt: Text sent as the content of the single user message.
        model: Model identifier passed to the API.
        max_tokens: Token limit passed to the API.
        temperature: Sampling temperature passed to the API.
        api_client: Object exposing ``messages.create``. When omitted, the
            notebook-level ``client`` is used.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    # Resolve the client lazily so the notebook-level `client` is only
    # required when no explicit client is supplied.
    active_client = client if api_client is None else api_client
    message = active_client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return message.content[0].text

In [11]:
%%time

# Number of runs
num_runs = 3

# List to store result dataframes
list_df = []

for run in range(num_runs):

    print(f'Starting run {run} ...')
    
    # Call the API
    response = get_completion(prompt)

    # Post-processing
    result = re.search(r'\[.*\]', response, re.DOTALL)
    json_data_string = result.group()

    # Read JSON data into dataframe
    data = json.loads(json_data_string)

    # Output dataframe from this run
    df = pd.DataFrame(data)

    # Save dataframe to csv file
    df.to_csv(f'{output_path}/{filename}_run_{run}.csv', index=False)

    # Append to list of dataframes
    list_df.append(df)
    
    print(f'Finished run {run}.')

Starting run 0 ...
Finished run 0.
Starting run 1 ...
Finished run 1.
Starting run 2 ...
Finished run 2.
CPU times: user 31.1 ms, sys: 10 ms, total: 41.1 ms
Wall time: 57.3 s


In [12]:
# Stack the transposed per-run frames as rows with a fresh integer index, then
# transpose back so each run's results end up side by side
transposed_runs = [run_df.transpose() for run_df in list_df]
df_concat = pd.concat(transposed_runs, ignore_index=True).transpose()

In [13]:
# Display the results from all runs (the combined frame built from the
# per-run dataframes) as the cell output
df_concat

Unnamed: 0,0,1,2
0,Liatris elegans Michx.,Liatris elegans Michx.,Liatris elegans Michx.
1,Liatris punctata Hook.,Liatris punctata Hook.,Liatris punctata Hook.
2,Liatris spicata Willd.,Liatris spicata Willd.,Liatris spicata Willd.
3,Liatris spicata Willd.,Liatris spicata Willd.,Liatris spicata Willd.
4,Onopordum algeriense Pomel,Onopordum algeriense Pomel,Onopordum algeriense Pomel
5,Onopordum bracteatum Boiss. & Heldr.,Onopordum bracteatum Boiss. & Heldr.,Onopordum bracteatum Boiss. & Heldr.
6,Osteospermum hyoseroides (DC.) Norl.,Osteospermum hyoseroides (DC.) Norl.,Osteospermum hyoseroides (DC.) Norl.
7,Pilosella aurantiaca (L.) F.W. Schultz & Sch.Bip.,Pilosella aurantiaca (L.) F.W. Schultz & Sch.Bip.,Pilosella aurantiaca (L.) F.W. Schultz & Sch.Bip.
8,Scorzonera suberosa ssp. cariensis (Boiss.) D.F. Chamb.,Scorzonera suberosa ssp. cariensis (Boiss.) D.F. Chamb.,Scorzonera suberosa ssp. cariensis (Boiss.) D.F. Chamb.
9,Senecio alpinus Scop.,Senecio alpinus Scop.,Senecio alpinus Scop.
