In [1]:
import torch
import re
import os
import os.path
import json
import pickle
import openai
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from dotenv import load_dotenv

### Dotenv

In [2]:
# Load variables from a local .env file (if one is found) into the process
# environment, so the key below does not have to be hard-coded in the notebook.
load_dotenv()
# Hand the OpenAI key to the client library; a missing OPENAI_API variable
# raises KeyError here.
openai.api_key = os.environ['OPENAI_API']

### DataFrames

In [3]:
# Each CSV is read with a two-row column header and its first column as the
# index; the index axis is then named 'Species'. The index values are also
# kept as a plain list next to every frame.
root = "../../../data/OpenAI/DataFrames/"

file = "DF_Andrei.csv"
df_Andrei = (
    pd.read_csv(root + file, header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
)
df_Andrei_species = list(df_Andrei.index)

# Unlike the other two frames, rows containing missing values are dropped here.
file = "DF_Daniel.csv"
df_Daniel = (
    pd.read_csv(root + file, header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
    .dropna()
)
df_Daniel_species = list(df_Daniel.index)

file = "DF_Pierre.csv"
df_Pierre = (
    pd.read_csv(root + file, header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
)
df_Pierre_species = list(df_Pierre.index)

### Trait Dicts

In [4]:
folder_traits = "../../../data/OpenAI/Traits/"

# Mapping of trait name -> list of candidate values, read once.
# The original cell loaded the very same Andrei.json into the very same
# variable three times in a row, which repeated the file read without
# changing the result.
# NOTE(review): the triple load looks like a copy-paste left-over. If trait
# files for the other data sets were meant to be loaded as well, they still
# need their own file names and variable names -- confirm.
with open(F"{folder_traits}Andrei.json", 'r') as f:
    caribbean_traits_dict = json.load(f)

### Functions

In [15]:
def combine_words_with_capital(string):
    """Collapse a free-text label into one CamelCase-style token.

    Every character that is not a word character, whitespace or "/" is
    removed. The remaining text is broken into words at whitespace and at
    "/", each word goes through ``str.capitalize`` (first character
    upper-cased, the rest lower-cased), and the words are concatenated
    without any separator.

    Example: ``"Sepals / calyx shape"`` -> ``"SepalsCalyxShape"``.
    """
    cleaned = re.sub(r'[^\w\s/]', '', string)
    # A slash only acts as a word boundary, exactly like whitespace does.
    words = cleaned.replace('/', ' ').split()
    return ''.join(map(str.capitalize, words))

### Text Data
#### Caribbean

In [6]:
# Directories holding the description snippets at paragraph and sentence level.
paragraph_folder = "../../../data/OpenAI/DescriptionSnippets/Paragraphs/"
sentence_folder = "../../../data/OpenAI/DescriptionSnippets/Sentences/"

# Collect every entry whose name starts with "c" in each directory, in sorted
# order so that positional indexing into the lists is stable between runs.
caribbean_jsons_paras = sorted(glob.glob(paragraph_folder + "c*"))
caribbean_jsons_sents = sorted(glob.glob(sentence_folder + "c*"))

In [7]:
# Peek at the first matched paragraph file to check what the glob picked up.
caribbean_jsons_paras[0]

'../../../data/OpenAI/DescriptionSnippets/Paragraphs/caribbean_Amyris_ignea_descriptions_paragraphs.json'

In [16]:
# Directory under which one sub-folder per species is created; each sub-folder
# receives one JSON file per trait holding the raw API response.
folder_prompts = "../../../data/OpenAI/PromptsResults/"

# Loop over the paragraph JSON files. The [0:1] slice restricts the run to the
# first file only; widen it to process more species.
for idx, json_file in enumerate(caribbean_jsons_paras[0:1]):

    try:
        with open(json_file, 'r') as f:
            caribbean_species_paragraph = json.load(f)

        # The species name is the first key of the dictionary in the JSON file
        species = list(caribbean_species_paragraph.keys())[0]
    except (OSError, ValueError, AttributeError, IndexError) as err:
        # Skip files that cannot be used, but say so instead of swallowing
        # every exception silently:
        #   OSError        - file missing / unreadable
        #   ValueError     - invalid JSON or undecodable bytes
        #   AttributeError - top-level JSON value is not a dictionary
        #   IndexError     - dictionary has no keys
        tqdm.write(F"Skipping {json_file}: {err!r}")
        continue

    # Join the paragraphs for the species into a single text string
    text = " ".join(caribbean_species_paragraph[species])

    # One output directory per species, named with underscores instead of spaces
    folder_species = species.replace(' ', '_')
    species_dir = F"{folder_prompts}{folder_species}"
    os.makedirs(species_dir, exist_ok=True)

    # Ask one question per trait listed in caribbean_traits_dict
    for trait, trait_options in (pbar := tqdm(caribbean_traits_dict.items(), leave=False, position=0)):
        pbar.set_description(f"{idx}: {species}")

        # Create the question and options for the ChatGPT prompt
        question = F"Which of the following values correctly describe(s) the '{trait}' trait mentioned in the text? If none of the following values apply, please select 'None of the above'. If you find something not mentioned in the list, select 'Other:' and fill in your findings. Select all values you can find."
        example = F"\nPlease return the answers as a Python list like the example I provided:\nExample format:\n['tree', 'Other: palm-like']"

        question = question + example
        options = trait_options + ["Other: description of trait not mentioned in the list."]
        options = F"\nThe options with the '{trait}':\n{options}."
        user_content = F"{question} {options}"

        # The trait name, squashed into one CamelCase token, is the file name
        file_name = combine_words_with_capital(trait)
        result_path = F"{species_dir}/{file_name}.json"

        # Resume support: a trait that already has a result file (e.g. from a
        # run interrupted by an OpenAI outage) is not requested again.
        if os.path.exists(result_path):
            continue

        # NOTE(review): the description text is sent with the "assistant" role;
        # confirm this is intended rather than "user"/"system".
        messages = [
            {"role": "assistant", "content": text},
            {"role": "user", "content": user_content}
            ]
        # Call the ChatGPT API to generate a completion for the prompt
        completion = openai.ChatCompletion.create(
            model = "gpt-3.5-turbo",
            messages = messages,
        )

        # Serialise before opening the file: if serialisation fails, no empty or
        # truncated result file is left behind that the resume check above
        # would later mistake for a finished trait.
        payload = json.dumps(completion)
        with open(result_path, 'w') as fp:
            fp.write(payload)
            


                                                       

Life form
LifeForm
Leaf position
LeafPosition
Leaf composition
LeafComposition
Leaf shape
LeafShape
Leaf margin
LeafMargin
Leaf upper side
LeafUpperSide
Leaf lower side
LeafLowerSide
Leaf glands
LeafGlands
Leaf rachis
LeafRachis
Thorns/spines
ThornsSpines
Stipules
Stipules
Inflorescence type
InflorescenceType
Sepals / calyx shape
SepalsCalyxShape
Sepals / calyx numer
SepalsCalyxNumer
Petals / corolla shape
PetalsCorollaShape
Petals / corolla number
PetalsCorollaNumber
Petals / corolla colour
PetalsCorollaColour
Stamen shape
StamenShape
Stamen number
StamenNumber
Fruit type
FruitType
Fruit shape
FruitShape
Fruit colour
FruitColour
Aril colour
ArilColour
Seed colour
SeedColour




### Testing

In [11]:
# Show the species name; this relies on `species` still being defined by the
# extraction loop cell.
species

'Amyris ignea'

In [12]:
# Show the joined paragraph text; this relies on `text` still being defined by
# the extraction loop cell.
text

'Front. Microb. In small scales, some families can show distinct distribution patterns, as that found for Lecythidaceae in French Guyana [32]. Microb. Microb. Opin. Front. Hop flowers are densely covered by glandular trichomes, specialized structures that secrete secondary metabolites into epidermal outgrowths3. Mol. Cell. Microb. Front. 4 Altmetric'

In [22]:
# Pick the second entry of the trait dictionary (position 1) as the test case:
# its key is the trait name, its value the list of candidate values.
trait, trait_options = list(caribbean_traits_dict.items())[1]

In [23]:
# Show which trait name was selected for the test.
trait

'Leaf position'

In [24]:
# Show the candidate values stored for the selected trait.
trait_options

['alternate',
 'alternate, opposite',
 'opposite',
 'opposite, whorls of 3',
 'opposite, whorls of 3, alternate']

In [25]:
# Build the prompt for the single selected trait, with the same wording as the
# extraction loop: instructions plus an answer-format example, followed by the
# candidate values extended with a free-text "Other" entry.
question = F"Which of the following values correctly describe(s) the '{trait}' trait mentioned in the text? If none of the following values apply, please select 'None of the above'. If you find something not mentioned in the list, select 'Other:' and fill in your findings. Select all values you can find."
example = F"\nPlease return the answers as a Python list like the example I provided:\nExample format:\n['tree', 'Other: palm-like']"
question += example

options = [*trait_options, "Other: description of trait not mentioned in the list."]
options = F"\nThe options with the '{trait}':\n{options}."

# Instructions and options are separated by a single space.
user_content = " ".join((question, options))

In [26]:
# Print the description text that is sent as the first chat message.
print(text)

Front. Microb. In small scales, some families can show distinct distribution patterns, as that found for Lecythidaceae in French Guyana [32]. Microb. Microb. Opin. Front. Hop flowers are densely covered by glandular trichomes, specialized structures that secrete secondary metabolites into epidermal outgrowths3. Mol. Cell. Microb. Front. 4 Altmetric


In [27]:
# Print the assembled prompt so its embedded newlines are rendered.
print(user_content)

Which of the following values correctly describe(s) the 'Leaf position' trait mentioned in the text? If none of the following values apply, please select 'None of the above'. If you find something not mentioned in the list, select 'Other:' and fill in your findings. Select all values you can find.
Please return the answers as a Python list like the example I provided:
Example format:
['tree', 'Other: palm-like'] 
The options with the 'Leaf position':
['alternate', 'alternate, opposite', 'opposite', 'opposite, whorls of 3', 'opposite, whorls of 3, alternate', 'Other: description of trait not mentioned in the list.'].


In [28]:
# Create the messages to send to the ChatGPT API
messages = [
    {"role": "assistant", "content": text},
    {"role": "user", "content": user_content}
    ]
# Call the ChatGPT API to generate a completion for the prompt
completion = openai.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    messages = messages,
)

In [29]:
# Display the raw response object returned by the API call.
completion

<OpenAIObject chat.completion id=chatcmpl-6w5kvpkvhYUHIAg0yqb1mwl4YNlJJ at 0x7ff2c95ac360> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "\n\nNone of the above.",
        "role": "assistant"
      }
    }
  ],
  "created": 1679303169,
  "id": "chatcmpl-6w5kvpkvhYUHIAg0yqb1mwl4YNlJJ",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 6,
    "prompt_tokens": 246,
    "total_tokens": 252
  }
}