# Extraction of plant species names from seedlists in PDF format

In [1]:
import json
import pandas as pd
import re
import os
import anthropic
from pypdf import PdfReader
from IPython.display import display, HTML

In [2]:
# Create the Anthropic API client.
# The key is taken from the ANTHROPIC_API_KEY environment variable so that the
# secret never has to be pasted into (and accidentally committed with) the
# notebook. If the variable is unset, this falls back to an empty string.
client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY", ""),
)

In [3]:
# Show full cell contents and every row when a dataframe is displayed
# (None removes the respective limit)
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

In [4]:
# Input seedlist file path
input_path = "../data/raw/bgr02/bgr02_2.pdf"

# Output path to store extracted species names in csv format
output_path = '../data/processed/claude/pdf'

# Create the output directory (and any missing parents); exist_ok=True makes
# this a no-op when it already exists and avoids the check-then-create race
os.makedirs(output_path, exist_ok=True)

In [5]:
# Extract the file name without directory and without its extension.
# splitext removes only the final extension, so names containing extra dots
# (e.g. "list.2020.pdf" -> "list.2020") stay distinct and do not overwrite
# each other's output files.
filename_with_extension = os.path.basename(input_path)
filename = os.path.splitext(filename_with_extension)[0]

In [6]:
# Instructions for Claude: background on botanical nomenclature, the
# extraction task for one page of seedlist text, and the required output
# format (a JSON list of objects with the single key "species", nothing else).
# The post-processing step relies on the reply containing such a JSON list.
instructions = '''
You are an expert botanist with deep knowledge of Latin names in binomial nomenclature of plant species.
You have knowledge of plant family names, subspecies, varieties and forms.

The species name in binomial nomenclature consists of two parts:
the genus name starting with a capital letter,
followed by the epithet name which normally starts with a small letter.

This is often followed by a subspecies name (starting with "subsp. " or "ssp. "),
or variety name (starting with "var."), or form (starting with "f.") and the author name(s).
Sometimes this is followed by a synonym (starting with "syn.").

Some species names are also followed by a cultivar name (almost always within single quotation marks,
or with the abbreviation "CV" if the cultivar has no formal name).

The author name is often simply "L.", for "Linnaeus". For other authors it is often an abbreviation
describing the author name.

Thus the species name consists of the genus, epithet, subspecies name (if present), variety name (if present),
form name (if present), cultivar name (if present), author name(s) (if present) and synonym (if present).

You are given the text from a page from a seedlist of plants from a botanical garden.
Determine if there are any Latin names of plant species present in this text.
Find each and every plant species entry present in the text.
Do not skip or miss any entry.

If any plant species names are present in this text, find, for each plant species present in the text,
the species name in Latin, consisting of the genus, epithet, subspecies name (if present),
variety name (if present), form name (if present), cultivar name (if present), author name(s) (if present),
and synonym (if present).

Return the result for each plant species in valid JSON object format, with the single key "species".
Provide the complete results as a list of valid JSON objects.
If there is no information about plant species on this page, print an empty list, like this: []
Print only the complete list of valid JSON objects, and no other code, text, or explanation.
'''

In [7]:
# Open the seedlist PDF and pull the text of its first page (index 0)
reader = PdfReader(input_path)
first_page = reader.pages[0]
seedlist_text = first_page.extract_text()

In [8]:
# Show the raw text extracted from the first PDF page as the cell output, so
# the extraction quality (spacing, line breaks) can be checked by eye
seedlist_text

'PARS I    \nSEMINA  PLANTARUM  \nIN  LOCO  NATALI  LECTA  \n \nMAGNOLIOPHYTA  \nMAGNOLIOPSIDA  \n \nADOXACEAE   \n1. Viburnum opulus  L. – Vitosha Mts, near Zlatni \nmostove locality, 19. 09.2020 . \n2. Sambucus racemosa  L. – Rila Mt, near Borovetz \nresort, 13.08. 2020 . \nANACARDIACEAE  \n3. Pistacia terebinthus  L. – Manage d reserve \nIzgor yaloto Gjune , near Krichim town,  12.09.2020 . \nASTERACEAE  \n4. Inula ens ifolia L. - Nature Park Zlatni p yasatsi, near \nVarna town,  28.08.2020 . \n5. Jacobaea paludosa  (L.) G.Gaertn., B.Mey. & \nScherb. (syn. Senecio paludosus  L.) – Rodopi \nMt, Tsigov chark locality, 16.10.2019.  \n6. Tanacetum corymbosum (L.) Sch.Bip.  – Nature \nPark Zlatni pyasatsi, near Varna town,  28.08.2020 . \nBRASSICACEAE  \n7. Alliaria petiolata  (M.Bieb.) Cavara & Grande – \nnear Sotir ya village,  Stara Planina Mt. , 05.09. 2020 . \n8. Lunaria annua  L. – Managed reserve Izgor yaloto \nGjune , near Krichim town,  12.09.2020 . \nCAMPANULAC EAE  \n9. Campan

In [9]:
# Build the full prompt: the instructions, a separator label, then the page text
prompt = ''.join((instructions, ' Seedlist text: ', seedlist_text))

In [10]:
# Set up function for API call
def get_completion(prompt, model="claude-3-opus-20240229", max_tokens=4096,
                   temperature=0.0):
    """Send one user message to the model and return the text of its reply.

    Uses the module-level ``client``.

    Args:
        prompt: Full text sent as the single user message.
        model: Model identifier passed to the API.
        max_tokens: Upper bound on the number of tokens in the reply.
        temperature: Sampling temperature passed to the API.

    Returns:
        The ``text`` of the first content block of the response.

    Raises:
        ValueError: If the response contains no content blocks.
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    # Fail with a clear message rather than an IndexError on an empty reply
    if not message.content:
        raise ValueError("The API response contains no content blocks.")
    return message.content[0].text

In [11]:
%%time

# Number of runs (the same prompt is sent repeatedly so the runs can be compared)
num_runs = 3

# List to store result dataframes, one per run
list_df = []

for run in range(num_runs):

    print(f'Starting run {run} ...')

    # Call the API
    response = get_completion(prompt)

    # Post-processing: take everything from the first '[' to the last ']'
    # (greedy match, DOTALL so the list may span several lines), which drops
    # any text the model may have added around the JSON list
    result = re.search(r'\[.*\]', response, re.DOTALL)
    if result is None:
        # Without this check the failure would be an opaque AttributeError
        raise ValueError(
            f'Run {run}: no JSON list found in model response: {response[:200]!r}'
        )
    json_data_string = result.group()

    # Parse the JSON list (raises json.JSONDecodeError if it is malformed)
    data = json.loads(json_data_string)

    # Output dataframe from this run
    df = pd.DataFrame(data)

    # Save dataframe to csv file named after the input file and the run number
    df.to_csv(os.path.join(output_path, f'{filename}_run_{run}.csv'), index=False)

    # Append to list of dataframes
    list_df.append(df)

    print(f'Finished run {run}.')

Starting run 0 ...
Finished run 0.
Starting run 1 ...
Finished run 1.
Starting run 2 ...
Finished run 2.
CPU times: user 33.7 ms, sys: 1.4 ms, total: 35.2 ms
Wall time: 1min 8s


In [12]:
# Combine the runs into one table: transpose each run's dataframe, stack the
# transposed frames with a fresh 0..n-1 index, then transpose back so that
# every run ends up as one numbered column
transposed_runs = [run_df.transpose() for run_df in list_df]
df_concat = pd.concat(transposed_runs, ignore_index=True).transpose()

In [13]:
# Display the results from all runs side by side (one column per run), so
# differences between runs can be spotted by eye
df_concat

Unnamed: 0,0,1,2
0,Viburnum opulus L.,Viburnum opulus L.,Viburnum opulus L.
1,Sambucus racemosa L.,Sambucus racemosa L.,Sambucus racemosa L.
2,Pistacia terebinthus L.,Pistacia terebinthus L.,Pistacia terebinthus L.
3,Inula ensifolia L.,Inula ensifolia L.,Inula ensifolia L.
4,"Jacobaea paludosa (L.) G.Gaertn., B.Mey. & Scherb. (syn. Senecio paludosus L.)","Jacobaea paludosa (L.) G.Gaertn., B.Mey. & Scherb. (syn. Senecio paludosus L.)","Jacobaea paludosa (L.) G.Gaertn., B.Mey. & Scherb. (syn. Senecio paludosus L.)"
5,Tanacetum corymbosum (L.) Sch.Bip.,Tanacetum corymbosum (L.) Sch.Bip.,Tanacetum corymbosum (L.) Sch.Bip.
6,Alliaria petiolata (M.Bieb.) Cavara & Grande,Alliaria petiolata (M.Bieb.) Cavara & Grande,Alliaria petiolata (M.Bieb.) Cavara & Grande
7,Lunaria annua L.,Lunaria annua L.,Lunaria annua L.
8,Campanula sibirica L.,Campanula sibirica L.,Campanula sibirica L.
9,Scabiosa argentea L.,Scabiosa argentea L.,Scabiosa argentea L.
