# Extraction of plant species names from seedlists in PDF format

In [1]:
import json
import pandas as pd
import re
import os
import anthropic
from pypdf import PdfReader

In [4]:
# Create the Anthropic API client.
# The key is taken from the ANTHROPIC_API_KEY environment variable so that the
# secret does not have to be typed into (and saved with) this notebook.
client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
)

In [3]:
# Show complete cell contents and every row when a dataframe is displayed
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

In [4]:
# Input seedlist file path
input_path = "../data/raw/WU/WU_7.pdf"

# Output path to store extracted species names in csv format
output_path = '../data/processed/claude/pdf'

# Create the output directory (and any missing parents).
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between a separate existence check and the creation.
os.makedirs(output_path, exist_ok=True)

In [5]:
# Extract the file name of the input seedlist, with and without its extension.
# os.path handles the platform's path separators, and splitext removes only
# the final extension, so a name such as "WU.2019.pdf" keeps its inner dots.
filename_with_extension = os.path.basename(input_path)
filename = os.path.splitext(filename_with_extension)[0]

In [6]:
# Prompt instructions for Claude.
# The text describes how a Latin species name is composed (genus, epithet,
# optional subspecies / variety / form / cultivar, author names and synonym)
# and asks for every species entry found in the page text to be returned as a
# list of JSON objects with the single key "species", or as an empty list
# ("[]") when the page contains no species. No other output is requested.
instructions = '''
You are an expert botanist with deep knowledge of Latin names in binomial nomenclature of plant species.
You have knowledge of plant family names, subspecies, varieties and forms.
 
The species name in binomial nomenclature consists of two parts: 
the genus name starting with a capital letter, 
followed by the epithet name which normally starts with a small letter. 

This is often followed by a subspecies name (starting with "subsp. " or "ssp. "), 
or variety name (starting with "var."), or form (starting with "f.") and the author name(s). 
Sometimes this is followed by a synonym (starting with a syn.).

Some species names are also followed by a cultivar name (almost always within with single quotation marks 
or with the abbreviation "CV" if the cultivar has no formal name. 

The author name is often simply "L.", for "Linnaeus". For other authors it is often an abbreviation 
describing the author name.

Thus the species name consists of the genus, epithet, subspecies name (if present), variety name (if present), 
form name (if present), cultivar name (if present), author name(s) (if present) and synonym (if present).  
                                                         
You are given the text from a page from a seedlist of plants from a botanical garden. 
Determine if there are any Latin names of plant species present in this text. 
Find each and every plant species entry present in the text.
Do not skip or miss any entry.
 
If any plant species names are present in this text, find, for each plant species present in the text, 
the species name in Latin, consisting of the genus, epithet, subspecies name (if present), 
variety name (if present), form name (if present), cultivar name (if present), author name(s) (if present), 
and synonym (if present).  

Return the result for each plant species in valid JSON object format, with the single key "species".
Provide the complete results as a list of valid JSON objects.
If there is no information about plant species on this page, print an empty list, like this: []
Print only the complete list of valid JSON objects, and no other code, text, or explanation.
'''

In [7]:
# Open the seedlist PDF and extract the text of its first page
reader = PdfReader(input_path)
first_page = reader.pages[0]
seedlist_text = first_page.extract_text()

In [8]:
# Display the raw text extracted from the seedlist page
seedlist_text

'Bestell - \nNr. /  \nOrder  \nNr. Name  \n  Herkunft u. Art \nder Diasporen /  \norigin  and type  of \ndiaspores          IPEN  Aufsammlungs -\ndaten / \ncollecting data  \n \n Asteraceae (cont.)  \n30  Centaurea jacea L.        WS     AT-0-WU-0030521    42 \n31  -- jacea subsp . angustifolia (DC.) Gremli     WS     AT-0-WU-0031475    117 \n32  -- pseudophrygia C. A. Mey.       WS     AT-0-WU-0027271    125 \n33  -- scabiosa L.         WS     AT-0-WU-0030447    37 \n34  -- scabiosa L.         WS     AT-0-WU-0031441    95 \n35  Cirsium canum (L.) All.        WS     AT-0-WU-0031471    121 \n36  -- eriophorum (L.) Scop.        WKS     AT-0-WU-0007975    125 \n37  -- erisithales (Jacq.) Scop.        WS     AT-0-WU-0031498    114 \n38  -- pannonicum (L. f.) Link        WS     AT-0-WU-0030486    37 \n39  -- spinosissimum (L.) Scop.        WS     AT-0-WU-0030577    53 \n40  Crepis pyrenaica (L.) Greuter       WS     AT-0-WU-0030527    45 \n41  -- pyrenaica (L.) Greuter        WS     AT-0-WU

In [9]:
# Build the prompt: the instructions, a separator label, then the page text
prompt_parts = (instructions, ' Seedlist text: ', seedlist_text)
prompt = ''.join(prompt_parts)

In [10]:
# Set up function for API call
def get_completion(prompt, *, model="claude-3-opus-20240229", max_tokens=4096,
                   temperature=0.0):
    """Send a single-turn user prompt to the API and return the reply text.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Identifier of the model to query.
        max_tokens: Value passed as ``max_tokens`` to the API call.
        temperature: Value passed as ``temperature`` to the API call.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    # Only the first content block is read, as in the single-text-block case.
    return message.content[0].text

In [11]:
%%time

# Number of runs
num_runs = 3

# List to store result dataframes
list_df = []

for run in range(num_runs):

    print(f'Starting run {run} ...')

    # Call the API
    response = get_completion(prompt)

    # Post-processing: keep everything from the first '[' to the last ']',
    # which drops any text the model may have added around the JSON list.
    result = re.search(r'\[.*\]', response, re.DOTALL)
    if result is None:
        # Fail with a readable message instead of an AttributeError on None
        raise ValueError(
            f'Run {run}: no JSON list found in the model response: '
            f'{response[:200]!r}'
        )
    json_data_string = result.group()

    # Read JSON data; report which run produced the malformed output
    try:
        data = json.loads(json_data_string)
    except json.JSONDecodeError as err:
        raise ValueError(
            f'Run {run}: the extracted list is not valid JSON: {err}'
        ) from err

    # Output dataframe from this run
    df = pd.DataFrame(data)

    # Save dataframe to csv file
    df.to_csv(os.path.join(output_path, f'{filename}_run_{run}.csv'), index=False)

    # Append to list of dataframes
    list_df.append(df)

    print(f'Finished run {run}.')

Starting run 0 ...
Finished run 0.
Starting run 1 ...
Finished run 1.
Starting run 2 ...
Finished run 2.
CPU times: user 32.3 ms, sys: 4.41 ms, total: 36.7 ms
Wall time: 1min 54s


In [12]:
# Put the runs side by side: each run's dataframe is transposed, the
# transposed frames are stacked with a fresh integer index, and the stack is
# transposed back so that every run ends up in its own numbered column.
transposed_runs = [run_df.transpose() for run_df in list_df]
stacked_runs = pd.concat(transposed_runs, ignore_index=True)
df_concat = stacked_runs.transpose()

In [13]:
# Display the combined dataframe holding the results of all runs
df_concat

Unnamed: 0,0,1,2
0,Centaurea jacea L.,Centaurea jacea L.,Centaurea jacea L.
1,Centaurea jacea subsp. angustifolia (DC.) Gremli,Centaurea jacea subsp. angustifolia (DC.) Gremli,Centaurea jacea subsp. angustifolia (DC.) Gremli
2,Centaurea pseudophrygia C. A. Mey.,Centaurea pseudophrygia C. A. Mey.,Centaurea pseudophrygia C. A. Mey.
3,Centaurea scabiosa L.,Centaurea scabiosa L.,Centaurea scabiosa L.
4,Centaurea scabiosa L.,Centaurea scabiosa L.,Centaurea scabiosa L.
5,Cirsium canum (L.) All.,Cirsium canum (L.) All.,Cirsium canum (L.) All.
6,Cirsium eriophorum (L.) Scop.,Cirsium eriophorum (L.) Scop.,Cirsium eriophorum (L.) Scop.
7,Cirsium erisithales (Jacq.) Scop.,Cirsium erisithales (Jacq.) Scop.,Cirsium erisithales (Jacq.) Scop.
8,Cirsium pannonicum (L. f.) Link,Cirsium pannonicum (L. f.) Link,Cirsium pannonicum (L. f.) Link
9,Cirsium spinosissimum (L.) Scop.,Cirsium spinosissimum (L.) Scop.,Cirsium spinosissimum (L.) Scop.
