In [50]:
import torch
import re
import os
import json
import pickle
import openai
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from dotenv import load_dotenv

### Dotenv

In [2]:
load_dotenv()
openai.api_key = os.environ['OPENAI_API']

### DataFrames

In [3]:
root = "../../../data/OpenAI/DataFrames/"

file = "DF_Andrei.csv"
df_Andrei = pd.read_csv(F"{root}{file}", header=[0, 1], index_col=0)\
    .rename_axis('Species', axis='index')
df_Andrei_species = list(df_Andrei.index)


file = "DF_Daniel.csv"
df_Daniel = pd.read_csv(F"{root}{file}", header=[0, 1], index_col=0)\
    .rename_axis('Species', axis='index')\
    .dropna()
df_Daniel_species = list(df_Daniel.index)

file = "DF_Pierre.csv"
df_Pierre = pd.read_csv(F"{root}{file}", header=[0, 1], index_col=0)\
    .rename_axis('Species', axis='index')
df_Pierre_species = list(df_Pierre.index)

### Trait Dicts

In [37]:
folder_traits = "../../../data/OpenAI/Traits/"

with open(F"{folder_traits}Andrei.json", 'r') as f:
  caribbean_traits_dict = json.load(f)

### Paragraph Data

In [10]:
paragraph_folder = "../../../data/OpenAI/DescriptionSnippets/Paragraphs/"
sentence_folder = "../../../data/OpenAI/DescriptionSnippets/Sentences/"

caribbean_jsons = glob.glob(F"{paragraph_folder}c*")
caribbean_jsons.sort()

In [12]:
with open(caribbean_jsons[0], 'r') as f:
  caribbean_species_test = json.load(f)

In [14]:
species = list(caribbean_species_test.keys())[0]

In [39]:
def extract_present_traits(species, df):

    s = df.loc[species]
    GT_traits = list(s.where(s == 1).dropna().index)
    empty_traits = [F"{Entity}:" for (Entity, Value) in GT_traits]

    return GT_traits, empty_traits

def create_prompt(paragraph, trait_dict, present_traits, question):

    prompt = F"""
    Text:
    {paragraph}

    Python dictionary of plant traits with possible options:
    {trait_dict}

    Python trait list:
    {present_traits}

    {question}
    """

    return prompt

In [41]:
GT_traits, present_traits = extract_present_traits(species, df_Andrei)

question = "Given this Python dictionary of plant traits with possible options, can you fill in the Python trait list I provided? In case you cannot find the trait fill in 'BLANK'."

In [62]:
folder_prompts = "../../../data/OpenAI/PromptsResults/"


for idx, paragraph in (pbar := tqdm(enumerate(caribbean_species_test[species][0:]), leave=False, position=0)):
    pbar.set_description(f"{idx} {species}")

    if len(paragraph.strip()) < 500:
        continue
    prompt = create_prompt(paragraph=paragraph, 
                           trait_dict=caribbean_traits_dict, 
                           present_traits=present_traits,
                           question=question)
    
    # messages = [
    # {"role": "user", "content": prompt}
    # ]

    # # Generate a response
    # completion = openai.ChatCompletion.create(
    #     model="gpt-3.5-turbo",
    #     messages=messages,
    # )
    print(idx)
    print(prompt)

    # Create folder for prompts
    folder_prompts_species = F"{folder_prompts}{species.replace(' ', '_')}"
    try:
        os.makedirs(folder_prompts_species)
    except FileExistsError:
        # directory already exists
        pass
    
    # # Save the ChatGPT result
    # name = F"TESTING_{idx}_paragraph"
    # with open(F"{folder_prompts_species}{name}.json", 'w') as fp:
    #     json.dump(completion, fp)


                                            

4

    Text:
     Pneumatophores often rise 5–10 cm from the long horizontal roots.
Bark dark gray or brown and smooth on small trunks, becoming dark brown,
fissured, scaly, and thick.  Leaves opposite, lanceolate or narrowly
elliptical, 5–11 cm long, 2–4 cm wide, acute or blunt at tip, entire, thick,
leathery.  Fine hairs giving a grayish hue to foliage; both surfaces often with
scattered salt crystals and salty taste.  Petiole 3–15 mm long.  Spikes or
panicles headlike, upright at and near ends of twigs.  Flowers several,
crowded, sessile, 6 mm long, 10 mm across.  Calyx cup-shaped, deeply 5-lobed;
corolla tubular, hairy, white but yellowish at base, with 4 slightly unequal
spreading, rounded, or notched lobes, stamens 4, 5 mm long in notches of
corolla tube near base; pistil with imperfectly 4-celled ovary, slender style,
and 2-forked stigma. Capsule elliptical, flattened, 2.5–3 cm long, often
splitting into 2 parts. Publ

    Python dictionary of plant traits with possible options:



In [52]:
completion["usage"]

<OpenAIObject at 0x7ff4310c8770> JSON: {
  "completion_tokens": 258,
  "prompt_tokens": 1527,
  "total_tokens": 1785
}

In [55]:
print(completion["choices"][0]["message"]["content"])



['Life form:', 'Leaf position:', 'Leaf composition:', 'Leaf shape:', 'Leaf margin:', 'Leaf upper side:', 'Leaf lower side:', 'Leaf glands:', 'Thorns/spines:', 'Stipules:', 'Inflorescence type:', 'Sepals / calyx shape:', 'Sepals / calyx numer:', 'Petals / corolla shape:', 'Petals / corolla number:', 'Petals / corolla colour:', 'Stamen shape:', 'Stamen number:', 'Fruit type:', 'Fruit shape:', 'Fruit colour:', 'Seed colour:'] 

['BLANK', 'BLANK', 'BLANK', 'BLANK', 'BLANK', 'BLANK', 'BLANK', 'BLANK', 'BLANK', 'BLANK', 'BLANK', 'Sepals / calyx shape:', 'Sepals / calyx numer:', 'Petals / corolla shape:', 'Petals / corolla number:', 'Petals / corolla colour:', 'Stamen shape:', 'Stamen number:', 'Fruit type:', 'Fruit shape:', 'Fruit colour:', 'Seed colour:'] 

The trait list has not been fully completed. We need to fill in the missing traits.


In [54]:
print(prompt)


    Text:
    The heartwood is dark-brown to black, while the sapwood is yellow-brown.

    Python dictionary of plant traits with possible options:
    {'Life form': ['liana', 'tree'], 'Leaf position': ['alternate', 'alternate, opposite', 'opposite', 'opposite, whorls of 3', 'opposite, whorls of 3, alternate'], 'Leaf composition': ['3 palmate', '3-5 palmate', '3-5 pinnate', '3-5 pinnate, entire', '5-11 pinnate', '5-9 pinnate', 'bi-pinnate, 2 leaflets per jug', 'bi-pinnate, 20-40 leaflets per jug', 'bi-pinnate, 30-32 leaflets per jug', 'entire', 'pinnate, 4-6 leaflets', 'pinnate, 6-8 leaflets'], 'Leaf shape': ['elliptic', 'elliptic, elongate', 'elliptic, lanceolate', 'elliptic, obovate', 'elliptic, ovate', 'elliptic, ovate, round', 'elongate', 'elongate, elliptic, obovate', 'elongate, obovate', 'kidney-shaped, circular', 'lanceolate, elliptic', 'linear', 'linear, obovate', 'obovate', 'obovate, elliptic', 'obovate, spathulate', 'ovate', 'ovate, circular', 'ovate, elliptic', 'ovate, ell

In [48]:
df_Andrei_species = list(df_Andrei.index)

In [131]:
def extract_present_traits(species, df):

    s = df.loc[species]
    GT_traits = list(s.where(s == 1).dropna().index)
    empty_traits = [F"{Entity}:" for (Entity, Value) in GT_traits]

    return GT_traits, empty_traits

### Paragraphs

In [149]:
folder_traits = "../../../data/OpenAI/Traits/"

with open(F"{folder_traits}Andrei.json", 'r') as f:
  caribbean_traits_dict = json.load(f)

folder_descriptions = "../../../data/OpenAI/DescriptionSnippets/"

caribbean_description_paragraph_dict = pickle.load(open(F"{folder_descriptions}descriptions_paragraphs_caribbean.pkl", 'rb'))
caribbean_description_sentence_dict = pickle.load(open(F"{folder_descriptions}descriptions_sentences_caribbean.pkl", 'rb'))


### Prompting

In [97]:
paragraph_all = ' '.join(caribbean_description_paragraph_dict[df_Andrei_species[0]])
paragraph_all

'Avicennia germinans, the black mangrove,[3] is a shrub or small tree growing up to 12 meters (39 feet) in the acanthus family, Acanthaceae. The leaves often appear whitish from the salt excreted at night and on cloudy days. The heartwood is dark-brown to black, while the sapwood is yellow-brown. Leaves are simple and opposite and grow from 2 to 3 inches long. The leaf is oval and pointed, and the margins are entire. The leaves appear smooth, thick, and leathery with a dark green topside and grey to white underside. When the tree is young the bark is smooth and as it matures the bark takes on a thick and fissured texture. The flowers appear at the ends of the branches and are small, white, and fragrant with yellow centers.  Pneumatophores often rise 5–10 cm from the long horizontal roots.\r\nBark dark gray or brown and smooth on small trunks, becoming dark brown,\r\nfissured, scaly, and thick.  Leaves opposite, lanceolate or narrowly\r\nelliptical, 5–11 cm long, 2–4 cm wide, acute or b

In [134]:
GT_traits, present_traits = extract_present_traits(df_Andrei_species[0], df_Andrei)


In [151]:
question = "Given this Python dictionary of plant traits with possible options, can you fill in the Python trait list I provided? In case you cannot find the trait fill in 'BLANK'."
question

"Given this Python dictionary of plant traits with possible options, can you fill in the Python trait list I provided? In case you cannot find the trait fill in 'BLANK'."

In [152]:
prompt = F"""
Text: 
{paragraph_all}

Python dictionary of plant traits with possible options:
{caribbean_traits_dict} 

Python trait list: 
{present_traits}

{question}
"""
print(prompt)


Text: 
Avicennia germinans, the black mangrove,[3] is a shrub or small tree growing up to 12 meters (39 feet) in the acanthus family, Acanthaceae. The leaves often appear whitish from the salt excreted at night and on cloudy days. The heartwood is dark-brown to black, while the sapwood is yellow-brown. Leaves are simple and opposite and grow from 2 to 3 inches long. The leaf is oval and pointed, and the margins are entire. The leaves appear smooth, thick, and leathery with a dark green topside and grey to white underside. When the tree is young the bark is smooth and as it matures the bark takes on a thick and fissured texture. The flowers appear at the ends of the branches and are small, white, and fragrant with yellow centers.  Pneumatophores often rise 5–10 cm from the long horizontal roots.
Bark dark gray or brown and smooth on small trunks, becoming dark brown,
fissured, scaly, and thick.  Leaves opposite, lanceolate or narrowly
elliptical, 5–11 cm long, 2–4 cm wide, acute or blu

In [153]:
messages = [
    {"role": "user", "content": prompt}
    ]

# Generate a response
completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=messages,
)

In [154]:
completion["usage"]

<OpenAIObject at 0x7fe469726660> JSON: {
  "completion_tokens": 164,
  "prompt_tokens": 1960,
  "total_tokens": 2124
}

In [155]:
print(completion["choices"][0]["message"]["content"])



Life form: tree
Leaf position: opposite
Leaf composition: BLANK
Leaf shape: elliptic
Leaf margin: entire
Leaf upper side: salt crystals
Leaf lower side: scales
Leaf glands: absent
Thorns/spines: absent
Stipules: absent
Inflorescence type: panicle
Sepals / calyx shape: cup-shaped
Sepals / calyx numer: 5
Petals / corolla shape: tubular
Petals / corolla number: 4
Petals / corolla colour: white
Stamen shape: in notches of corolla tube near base
Stamen number: 4
Fruit type: capsule
Fruit shape: elliptical
Fruit colour: BLANK
Seed colour: BLANK


In [156]:
GT_traits

[('Life form', 'tree'),
 ('Leaf position', 'opposite'),
 ('Leaf composition', 'entire'),
 ('Leaf shape', 'lanceolate, elliptic'),
 ('Leaf margin', 'entire'),
 ('Leaf upper side', 'salt crystals'),
 ('Leaf lower side', 'powdery'),
 ('Leaf glands', 'absent'),
 ('Thorns/spines', 'absent'),
 ('Stipules', 'absent'),
 ('Inflorescence type', 'panicle'),
 ('Sepals / calyx shape', 'free'),
 ('Sepals / calyx numer', '5'),
 ('Petals / corolla shape', 'tubular'),
 ('Petals / corolla number', '4'),
 ('Petals / corolla colour', 'white'),
 ('Stamen shape', 'longer than corolla'),
 ('Stamen number', '4'),
 ('Fruit type', 'capsule'),
 ('Fruit shape', 'flattened ovate'),
 ('Fruit colour', 'green'),
 ('Seed colour', 'green')]

In [164]:
for gpt, GT in zip(completion["choices"][0]["message"]["content"].split('\n')[2:], GT_traits):

    GT = GT[1]    
    print(F"GPT: {gpt}. \nGT: {GT}")



GPT: Life form: tree. 
GT: tree
GPT: Leaf position: opposite. 
GT: opposite
GPT: Leaf composition: BLANK. 
GT: entire
GPT: Leaf shape: elliptic. 
GT: lanceolate, elliptic
GPT: Leaf margin: entire. 
GT: entire
GPT: Leaf upper side: salt crystals. 
GT: salt crystals
GPT: Leaf lower side: scales. 
GT: powdery
GPT: Leaf glands: absent. 
GT: absent
GPT: Thorns/spines: absent. 
GT: absent
GPT: Stipules: absent. 
GT: absent
GPT: Inflorescence type: panicle. 
GT: panicle
GPT: Sepals / calyx shape: cup-shaped. 
GT: free
GPT: Sepals / calyx numer: 5. 
GT: 5
GPT: Petals / corolla shape: tubular. 
GT: tubular
GPT: Petals / corolla number: 4. 
GT: 4
GPT: Petals / corolla colour: white. 
GT: white
GPT: Stamen shape: in notches of corolla tube near base. 
GT: longer than corolla
GPT: Stamen number: 4. 
GT: 4
GPT: Fruit type: capsule. 
GT: capsule
GPT: Fruit shape: elliptical. 
GT: flattened ovate
GPT: Fruit colour: BLANK. 
GT: green
GPT: Seed colour: BLANK. 
GT: green


In [5]:
prompt_list = []

for entity, quality_value in caribbean_traits_dict.items():
    # print(entity, quality_value)

    quality_value += ["No ifnromation in the text"]

    prompt = F"The plant property {entity}, with possible values: {quality_value}"
    prompt_list.append(prompt)

In [7]:
prompt_text = ' '.join(caribbean_description_paragraph_dict['Avicennia germinans'])
prompt_text

'Avicennia germinans, the black mangrove,[3] is a shrub or small tree growing up to 12 meters (39 feet) in the acanthus family, Acanthaceae. The leaves often appear whitish from the salt excreted at night and on cloudy days. The heartwood is dark-brown to black, while the sapwood is yellow-brown. Leaves are simple and opposite and grow from 2 to 3 inches long. The leaf is oval and pointed, and the margins are entire. The leaves appear smooth, thick, and leathery with a dark green topside and grey to white underside. When the tree is young the bark is smooth and as it matures the bark takes on a thick and fissured texture. The flowers appear at the ends of the branches and are small, white, and fragrant with yellow centers.  Pneumatophores often rise 5–10 cm from the long horizontal roots.\r\nBark dark gray or brown and smooth on small trunks, becoming dark brown,\r\nfissured, scaly, and thick.  Leaves opposite, lanceolate or narrowly\r\nelliptical, 5–11 cm long, 2–4 cm wide, acute or b

In [8]:
prompt_question = "Can you extract all the semantic triples from the text?"
prompt_question

'Can you extract all the semantic triples from the text?'

In [9]:
prompt = F"""
"{prompt_text}"

{prompt_question}
"""
print(prompt)


"Avicennia germinans, the black mangrove,[3] is a shrub or small tree growing up to 12 meters (39 feet) in the acanthus family, Acanthaceae. The leaves often appear whitish from the salt excreted at night and on cloudy days. The heartwood is dark-brown to black, while the sapwood is yellow-brown. Leaves are simple and opposite and grow from 2 to 3 inches long. The leaf is oval and pointed, and the margins are entire. The leaves appear smooth, thick, and leathery with a dark green topside and grey to white underside. When the tree is young the bark is smooth and as it matures the bark takes on a thick and fissured texture. The flowers appear at the ends of the branches and are small, white, and fragrant with yellow centers.  Pneumatophores often rise 5–10 cm from the long horizontal roots.
Bark dark gray or brown and smooth on small trunks, becoming dark brown,
fissured, scaly, and thick.  Leaves opposite, lanceolate or narrowly
elliptical, 5–11 cm long, 2–4 cm wide, acute or blunt at 

In [23]:
messages = [
    {"role": "user", "content": prompt}
    ]

messages

[{'role': 'user',
  'content': '\n"Avicennia germinans, the black mangrove,[3] is a shrub or small tree growing up to 12 meters (39 feet) in the acanthus family, Acanthaceae. The leaves often appear whitish from the salt excreted at night and on cloudy days. The heartwood is dark-brown to black, while the sapwood is yellow-brown. Leaves are simple and opposite and grow from 2 to 3 inches long. The leaf is oval and pointed, and the margins are entire. The leaves appear smooth, thick, and leathery with a dark green topside and grey to white underside. When the tree is young the bark is smooth and as it matures the bark takes on a thick and fissured texture. The flowers appear at the ends of the branches and are small, white, and fragrant with yellow centers.  Pneumatophores often rise 5–10 cm from the long horizontal roots.\r\nBark dark gray or brown and smooth on small trunks, becoming dark brown,\r\nfissured, scaly, and thick.  Leaves opposite, lanceolate or narrowly\r\nelliptical, 5–1

In [24]:
# Generate a response
completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=messages,
)

In [34]:
completion["usage"]

<OpenAIObject at 0x7fe479739d00> JSON: {
  "completion_tokens": 287,
  "prompt_tokens": 478,
  "total_tokens": 765
}

In [32]:
print(completion["choices"][0]["message"]["content"])



1. Plant species: Avicennia germinans
2. Family name: Acanthaceae
3. Tree height: up to 12 meters
4. Leaf characteristics: simple, opposite, oval, pointed, thick, leathery, dark green topside, grey to white underside
5. Bark color and texture: dark gray or brown, smooth on small trunks, becoming dark brown, fissured, scaly and thick
6. Flower characteristics: small, white, fragrant with yellow centers, several, crowded, sessile, tubular and hairy corolla, imperfectly 4-celled ovary, slender style, and 2-forked stigma
7. Fruit characteristics: Elliptical, flattened, 2.5–3 cm long, often splitting into 2 parts
8. Root characteristics: long horizontal roots with Pneumatophores often rising 5–10 cm
9. Leaf margin: entire
10. Leaf length: 2-3 inches
11. Salt excretion: leaves often appear whitish from the salt excreted at night and on cloudy days
12. Heartwood and sapwood color: dark-brown to black (heartwood), yellow-brown (sapwood)
13. Other features: fine hairs give a grayish hue to fo

In [37]:
prompt_list

["The plant property Life form, with possible values: ['liana', 'tree', 'No ifnromation in the text']",
 "The plant property Leaf position, with possible values: ['alternate', 'alternate, opposite', 'opposite', 'opposite, whorls of 3', 'opposite, whorls of 3, alternate', 'No ifnromation in the text']",
 "The plant property Leaf composition, with possible values: ['3 palmate', '3-5 palmate', '3-5 pinnate', '3-5 pinnate, entire', '5-11 pinnate', '5-9 pinnate', 'bi-pinnate, 2 leaflets per jug', 'bi-pinnate, 20-40 leaflets per jug', 'bi-pinnate, 30-32 leaflets per jug', 'entire', 'pinnate, 4-6 leaflets', 'pinnate, 6-8 leaflets', 'No ifnromation in the text']",
 "The plant property Leaf shape, with possible values: ['elliptic', 'elliptic, elongate', 'elliptic, lanceolate', 'elliptic, obovate', 'elliptic, ovate', 'elliptic, ovate, round', 'elongate', 'elongate, elliptic, obovate', 'elongate, obovate', 'kidney-shaped, circular', 'lanceolate, elliptic', 'linear', 'linear, obovate', 'obovate', 