# Extraction of plant species names from seedlists in OCR text format

In [1]:
import json
import pandas as pd
import re
import os
import anthropic
from pypdf import PdfReader
from IPython.display import display, HTML

In [2]:
# Create the Anthropic client.
# The API key is read from the ANTHROPIC_API_KEY environment variable so the secret
# never has to be typed into (or saved together with) this notebook.
client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
)

In [3]:
# Show complete cell contents and every row when a dataframe is displayed
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

In [4]:
# Input seedlist file path
input_path = "../data/raw/ocr/GOET_1970_7.txt"

# Output path to store extracted species names in csv format
output_path = '../data/processed/claude/ocr'

# Create the output directory (including missing parents).
# exist_ok=True makes this a no-op when the directory is already there, without a
# separate existence check that could race with another process.
os.makedirs(output_path, exist_ok=True)

In [5]:
# Extract the file name without directory and extension; it is used to name the output CSV files.
# os.path handles the platform's path separator and strips only the final extension,
# so a name with inner dots (e.g. "GOET.1970.txt") is not cut at the first dot.
filename_with_extension = os.path.basename(input_path)
filename = os.path.splitext(filename_with_extension)[0]

In [6]:
# Instructions for Claude.
# This text is sent verbatim as the first part of the prompt. It describes what a full
# species name consists of and asks for the answer as a list of JSON objects with the
# single key "species", which is the format the post-processing code parses.
instructions = '''
You are an expert botanist with deep knowledge of Latin names in binomial nomenclature of plant species.
You have knowledge of plant family names, subspecies, varieties and forms.

The species name in binomial nomenclature consists of two parts:
the genus name starting with a capital letter,
followed by the epithet name which normally starts with a small letter.

This is often followed by a subspecies name (starting with "subsp. " or "ssp. "),
or variety name (starting with "var."), or form (starting with "f.") and the author name(s).
Sometimes this is followed by a synonym (starting with "syn.").

Some species names are also followed by a cultivar name (almost always within single quotation marks)
or by the abbreviation "CV" if the cultivar has no formal name.

The author name is often simply "L.", for "Linnaeus". For other authors it is often an abbreviation
describing the author name.

Thus the species name consists of the genus, epithet, subspecies name (if present), variety name (if present),
form name (if present), cultivar name (if present), author name(s) (if present) and synonym (if present).

You are given the text from a page from a seedlist of plants from a botanical garden.
Determine if there are any Latin names of plant species present in this text.
Find each and every plant species entry present in the text.
Do not skip or miss any entry.

If any plant species names are present in this text, find, for each plant species present in the text,
the species name in Latin, consisting of the genus, epithet, subspecies name (if present),
variety name (if present), form name (if present), cultivar name (if present), author name(s) (if present),
and synonym (if present).

Return the result for each plant species in valid JSON object format, with the single key "species".
Provide the complete results as a list of valid JSON objects.
If there is no information about plant species on this page, print an empty list, like this: []
Print only the complete list of valid JSON objects, and no other code, text, or explanation.
'''

In [7]:
# Read seedlist text.
# The OCR output contains non-ASCII characters, so the encoding is given explicitly
# instead of relying on the platform default.
# NOTE(review): assumes the OCR files are stored as UTF-8 - confirm for other inputs.
with open(input_path, 'r', encoding='utf-8') as file:
    seedlist_text = file.read()

In [8]:
# Echo the raw seedlist text read from input_path so it can be inspected
# (shown as a Python string literal, i.e. with \n and \t escapes visible)
seedlist_text

"1d7 Chenopodium capitatum (L„) Aschers»\n168\t-\t*\tficifolium Sn.\n■169\t_\tfoliosum Asciesse.\n170\t~\topulifolium Schaad»\n171\t-\tschraderianum Schuit»\n172\t-\turbicum L»\n175 Corispermum leptopterum (Asch.) Iljin\n17^ Kochia scoparia (L») Schrad»\nCista c e.-a e\n175 Cistus crispus L»'\n176\t-\thirsutus Lam»\n177\t-\tmonspeliensis L»..,\n178 Heliantheoum apenninum (L») Mill»\n179\t-\t- var» roseum Grossor\n180\t-\tnummularium (L») Mill»\n181 Tuberaria guttata (L») Fourr»\nC n e o ra c e a e\n182 Cneorum tricoccon L»\nC o m m e 1 i na c e a _e\n185 Aneilena papuanum Warburg\n18^ Commelina communis L»\n185\t-\ttuberosa L»\n186 Palisota manniï C»B»Clarke\n187 Tinantia erecta (Jacq») Schlechtend»\n+ 188 Tradescantia crassifolia Cav».\nCompositae (Asteraceao)\n189 Achillea ageratifolia (Sibth» & Smith)\nBoiss »\n190\t-\tmacrophylla L»\n! 191\t-\tmillefolium L» (NG)\n192 Ammobium alatum R»Br.\n195 Antennaria howcllii.Greene\n19^\t-\tplantaginea R»Br»\n195 Anthemis arver.sis L»\n196\t-

In [9]:
# Build the full prompt: the instructions, a short label, then the seedlist text
prompt = ''.join([instructions, ' Seedlist text: ', seedlist_text])

In [10]:
# Set up function for API call
def get_completion(prompt, model="claude-3-opus-20240229", max_tokens=4096, temperature=0.0):
    """Send one user message to the Messages API and return the reply text.

    Uses the module-level ``client`` created earlier in the notebook.

    Parameters
    ----------
    prompt : str
        Text sent as the content of the single user message.
    model : str, optional
        Model identifier passed to the API.
    max_tokens : int, optional
        Upper limit on the number of tokens in the reply.
    temperature : float, optional
        Sampling temperature passed to the API.

    Returns
    -------
    str
        The ``text`` attribute of the first content block of the response.
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return message.content[0].text

In [11]:
%%time

# Number of runs
num_runs = 3

# List to store result dataframes
list_df = []

for run in range(num_runs):

    print(f'Starting run {run} ...')
    
    # Call the API
    response = get_completion(prompt)

    # Post-processing
    result = re.search(r'\[.*\]', response, re.DOTALL)
    json_data_string = result.group()

    # Read JSON data into dataframe
    data = json.loads(json_data_string)

    # Output dataframe from this run
    df = pd.DataFrame(data)

    # Save dataframe to csv file
    df.to_csv(f'{output_path}/{filename}_run_{run}.csv', index=False)

    # Append to list of dataframes
    list_df.append(df)
    
    print(f'Finished run {run}.')

Starting run 0 ...
Finished run 0.
Starting run 1 ...
Finished run 1.
Starting run 2 ...
Finished run 2.
CPU times: user 29.1 ms, sys: 7.77 ms, total: 36.9 ms
Wall time: 1min 25s


In [12]:
# Combine the runs side by side: transpose each run's dataframe, stack the
# transposed frames as rows with a fresh 0..n-1 index, then transpose back so
# that each run ends up as one numbered column
transposed_runs = [run_df.T for run_df in list_df]
stacked_runs = pd.concat(transposed_runs, ignore_index=True)
df_concat = stacked_runs.T

In [13]:
# Display the results from all runs (one numbered column per run) for side-by-side comparison
df_concat

Unnamed: 0,0,1,2
0,Chenopodium capitatum (L.) Aschers.,Chenopodium capitatum (L.) Aschers.,Chenopodium capitatum (L.) Aschers.
1,Chenopodium ficifolium Sm.,Chenopodium ficifolium Sm.,Chenopodium ficifolium Sm.
2,Chenopodium foliosum Aschers.,Chenopodium foliosum Aschers.,Chenopodium foliosum Aschers.
3,Chenopodium opulifolium Schrad.,Chenopodium opulifolium Schrad.,Chenopodium opulifolium Schrad.
4,Chenopodium schraderianum Schult.,Chenopodium schraderianum Schult.,Chenopodium schraderianum Schult.
5,Chenopodium urbicum L.,Chenopodium urbicum L.,Chenopodium urbicum L.
6,Corispermum leptopterum (Asch.) Iljin,Corispermum leptopterum (Asch.) Iljin,Corispermum leptopterum (Asch.) Iljin
7,Kochia scoparia (L.) Schrad.,Kochia scoparia (L.) Schrad.,Kochia scoparia (L.) Schrad.
8,Cistus crispus L.,Cistus crispus L.,Cistus crispus L.
9,Cistus hirsutus Lam.,Cistus hirsutus Lam.,Cistus hirsutus Lam.
