# Extract plant species names from an OCR'd seedlist page (Göttingen University Botanical Gardens, 1970)

In [1]:
import json
import csv
import time
import pandas as pd
from openai import OpenAI

In [2]:
# Create the API client.
# The key is deliberately NOT written into the notebook: with no `api_key`
# argument the OpenAI client reads it from the OPENAI_API_KEY environment
# variable and raises immediately if it is missing, instead of failing later
# on the first request. Set the variable before starting Jupyter.
client = OpenAI()

In [3]:
# Show the full content of every cell when a DataFrame is displayed
# (None removes the column-width limit, so long species names are not cut off)
pd.options.display.max_colwidth = None

In [4]:
# Seedlist page(s) to process, one path per entry
files = [
    '../data/raw/ocr/GOET_1970_7.txt',
]

In [5]:
# Prefixes for output filenames, one per entry in `files` (matched by position)
file_prefixes = [
    'GOET',
]

In [6]:
# Create assistant
# The system instructions are held in their own variable so the create() call
# below stays short; the text itself is sent to the model verbatim.
species_instructions = '''
You are an expert botanist with deep knowledge of Latin names in binomial nomenclature of plant species.
You have knowledge of plant family names, subspecies, varieties and forms.
 
The species name in binomial nomenclature consists of two parts: 
the genus name starting with a capital letter, 
followed by the epithet name which normally starts with a small letter. 

This is often followed by a subspecies name (starting with "subsp. " or "ssp. "), 
or variety name (starting with "var."), or form (starting with "f.") and the author name(s). 
Sometimes this is followed by a synonym (starting with a syn.).

Some species names are also followed by a cultivar name (almost always within with single quotation marks 
or with the abbreviation "CV" if the cultivar has no formal name. 

The author name is often simply "L.", for "Linnaeus". For other authors it is often an abbreviation 
describing the author name.

Thus the species name consists of the genus, epithet, subspecies name (if present), variety name (if present), 
form name (if present), cultivar name (if present), author name(s) (if present) and synonym (if present).  
    '''

# temperature=0 is requested to make repeated runs as consistent as possible;
# the file_search tool lets the assistant read the attached seedlist page.
assistant = client.beta.assistants.create(
    model="gpt-4-0125-preview",
    name="Seedlist GPT4 species only",
    instructions=species_instructions,
    temperature=0,
    tools=[{"type": "file_search"}],
)

In [7]:
# Instructions for the assistant to carry out.
# This text is sent as the user message of every thread, together with the
# attached seedlist page. It asks for a bare list of JSON objects, each with
# the single key "species", so the reply can be parsed with json.loads and
# loaded straight into a DataFrame.
# NOTE(review): the prompt calls `[]` an "empty JSON object" although it is an
# empty JSON array; left untouched because rewording changes the model input.
assistant_message = '''
This file is a page from a seedlist of plants from a botanical garden. 
Determine if there are any Latin names of plant species present in this text. 
Find each and every plant species entry present in the text.
Do not skip or miss any entry.
 
If any plant species names are present in this text, find, for each plant species present in the text, 
the species name in Latin, consisting of the genus, epithet, subspecies name (if present), 
variety name (if present), form name (if present), cultivar name (if present), author name(s) (if present), 
and synonym (if present).  

Return the result for each plant species in valid JSON object format, with the single key "species".
Provide the complete results as a list of valid JSON objects.
If there is no information about plant species on this page, print an empty JSON object, like this: []
Print only the complete list of valid JSON objects, and no other code, text, or explanation.
'''

In [8]:
%%time

# Extract species name

for i in range(0, len(files)):
    # Run thrice to get an idea of the consistency of outputs
    for run_num in range(0,3):
        
        print(f'Starting run {run_num} ...')
        
        # Select a file
        filename = files[i]
        print(f'Processing file {filename} ...')

        # Upload the file
        file = client.files.create(
          file=open(filename, 'rb'),
          purpose='assistants'
        )

        # Create a thread to carry out the task
        thread = client.beta.threads.create(
          messages=[
            {
              "role": "user",
              "content": assistant_message,
              "attachments": [
                {
                  "file_id": file.id,
                  "tools": [{"type": "file_search"}]
                }
              ]
            }
          ]
        )

        # Run the thread
        run = client.beta.threads.runs.create_and_poll(
          thread_id=thread.id,
          assistant_id=assistant.id,
        )

        # Print run status after completion
        print("Run completed with status: " + run.status)

        # Save the data if run was successful
        if run.status == "completed":

            messages = client.beta.threads.messages.list(thread_id=thread.id)

            raw_data = messages.data[0].content[0].text.value
            json_data_string = raw_data.replace('json\n','').replace('```','')
            data = json.loads(json_data_string)

            df = pd.DataFrame(data)
            df = df.drop_duplicates()
            #df = df.sort_values(by=['species'])

            df.to_csv('../data/processed/assistants/validation/' + file_prefixes[i] + f'_temp_0_run_{run_num}.csv', index=False)

            print(f'Finished run {run_num}.')
    print(f'Finished processing file {filename}.\n\n')
    
# Delete the assistant    
client.beta.assistants.delete(assistant.id)    

Starting run 0 ...
Processing file ../data/raw/ocr/GOET_1970_7.txt ...
Run completed with status: completed
Finished run 0.
Starting run 1 ...
Processing file ../data/raw/ocr/GOET_1970_7.txt ...
Run completed with status: completed
Finished run 1.
Starting run 2 ...
Processing file ../data/raw/ocr/GOET_1970_7.txt ...
Run completed with status: completed
Finished run 2.
Finished processing file ../data/raw/ocr/GOET_1970_7.txt.


CPU times: user 374 ms, sys: 24.2 ms, total: 398 ms
Wall time: 1min 29s


AssistantDeleted(id='asst_bPpql6OHbyQhcXfHHVNN8KbV', deleted=True, object='assistant.deleted')

In [9]:
# Results from 3 runs

# Read the CSV written by each run and transpose it
run_frames = [
    pd.read_csv(f'../data/processed/assistants/validation/GOET_temp_0_run_{run_idx}.csv').T
    for run_idx in range(3)
]
df0, df1, df2 = run_frames

# Stack the transposed runs, renumbering the rows 0..2 (one row per run)
df = pd.concat(run_frames, ignore_index=True)

# Display extracted species names

In [10]:
# Transpose back for display: one column per run, one row per extracted entry
df.transpose()

Unnamed: 0,0,1,2
0,Chenopodium capitatum (L.) Asch.,Chenopodium capitatum (L.) Asch.,Chenopodium capitatum (L.) Asch.
1,Chenopodium ficifolium Sm.,Chenopodium ficifolium Sm.,Chenopodium ficifolium Sm.
2,Chenopodium foliosum Asch.,Chenopodium foliosum Asch.,Chenopodium foliosum Asch.
3,Chenopodium opulifolium Schrad.,Chenopodium opulifolium Schrad.,Chenopodium opulifolium Schrad.
4,Chenopodium schraderianum Schult.,Chenopodium schraderianum Schult.,Chenopodium schraderianum Schult.
5,Chenopodium urbicum L.,Chenopodium urbicum L.,Chenopodium urbicum L.
6,Corispermum leptopterum (Asch.) Iljin,Corispermum leptopterum (Asch.) Iljin,Corispermum leptopterum (Asch.) Iljin
7,Kochia scoparia (L.) Schrad.,Kochia scoparia (L.) Schrad.,Kochia scoparia (L.) Schrad.
8,Cistus crispus L.,Cistus crispus L.,Cistus crispus L.
9,Cistus hirsutus Lam.,Cistus hirsutus Lam.,Cistus hirsutus Lam.
