In [90]:
import torch
import re
import os
import os.path
import json
import pickle
import openai
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
import random
from dotenv import load_dotenv

### Dotenv

In [18]:
# Load variables from a .env file (if present) into the process environment,
# then hand the OpenAI key to the client library.
load_dotenv()
try:
    openai.api_key = os.environ['OPENAI_API']
except KeyError as err:
    # Same exception type as before, but with a message that says what to do.
    raise KeyError(
        "Environment variable 'OPENAI_API' is not set; define it in the "
        "environment or in the .env file before running this notebook."
    ) from err

### DataFrames

In [19]:
root = "../../../data/OpenAI/DataFrames/"


def load_species_frame(file, drop_incomplete=False):
    """Read one annotator CSV from ``root`` into a species-indexed DataFrame.

    Parameters
    ----------
    file : str
        File name inside ``root`` (e.g. ``"DF_Andrei.csv"``).
    drop_incomplete : bool, default False
        When True, rows containing any missing value are dropped.

    Returns
    -------
    pandas.DataFrame
        Frame read with the first two rows as a two-level column header and
        the first column as the index, with the index axis named 'Species'.
    """
    frame = pd.read_csv(f"{root}{file}", header=[0, 1], index_col=0)\
        .rename_axis('Species', axis='index')
    return frame.dropna() if drop_incomplete else frame


df_Andrei = load_species_frame("DF_Andrei.csv")
df_Andrei_species = list(df_Andrei.index)

# Only this frame has its incomplete rows removed, so its species list
# contains just the fully annotated species.
df_Daniel = load_species_frame("DF_Daniel.csv", drop_incomplete=True)
df_Daniel_species = list(df_Daniel.index)

df_Pierre = load_species_frame("DF_Pierre.csv")
df_Pierre_species = list(df_Pierre.index)

### Trait Dicts

In [20]:
folder_traits = "../../../data/OpenAI/Traits/"

# Load the trait -> list-of-options mapping used to build the prompts.
# NOTE(review): this cell used to read Andrei.json three times into the same
# name, which is equivalent to reading it once. If separate trait files for
# the other annotators were intended, load them into their own variables here.
with open(F"{folder_traits}Andrei.json", 'r') as f:
  caribbean_traits_dict = json.load(f)

### Functions

In [21]:
def combine_words_with_capital(string):
    """Collapse a free-text label into a single capitalised-words token.

    Every character that is not a word character, whitespace or '/' is
    removed first. The remainder is split on '/' and then on whitespace;
    each resulting word is passed through ``str.capitalize`` (first letter
    upper-case, the rest lower-case) and all words are concatenated without
    any separator.

    Example: ``"leaf shape/size (cm)"`` -> ``"LeafShapeSizeCm"``.
    """
    cleaned = re.sub(r'[^\w\s/]', '', string)

    pieces = []
    for segment in cleaned.split('/'):
        for token in segment.split():
            pieces.append(token.capitalize())

    return ''.join(pieces)

### Text Data
#### Caribbean

In [6]:
paragraph_folder = "../../../data/OpenAI/DescriptionSnippets/Paragraphs/"
sentence_folder = "../../../data/OpenAI/DescriptionSnippets/Sentences/"

# Collect every entry whose name starts with "c" in each folder, already
# sorted so both lists have a stable order.
caribbean_jsons_paras = sorted(glob.glob(F"{paragraph_folder}c*"))
caribbean_jsons_sents = sorted(glob.glob(F"{sentence_folder}c*"))

In [7]:
# Show the first matched paragraph-file path as a quick check of the glob.
caribbean_jsons_paras[0]

'../../../data/OpenAI/DescriptionSnippets/Paragraphs/caribbean_Amyris_ignea_descriptions_paragraphs.json'

In [29]:
# Define the path to the directory where the prompts and results will be saved
folder_prompts = "../../../data/OpenAI/PromptsResults/"

# Placeholder scores, used only to fill in the example answer shown in the prompt
choices = [
    1, 0,
]

# Loop over the selected JSON files in caribbean_jsons_paras
# (the [1:2] slice restricts the run to the second file only)
for idx, json_file in enumerate(caribbean_jsons_paras[1:2]):

    try:
        # Try to read in the JSON file
        with open(json_file, 'r') as f:
            caribbean_species_paragraph = json.load(f)

        # Get the name of the species from the first key of the dictionary
        species = list(caribbean_species_paragraph.keys())[0]
    except (OSError, ValueError, IndexError, AttributeError) as err:
        # Unreadable file, invalid JSON, empty dict or non-dict payload:
        # recover the species name from the file name
        # ("caribbean_<Genus>_<species>_descriptions_paragraphs.json"),
        # report the problem and move on to the next file.
        base_name = os.path.basename(json_file)
        species = base_name[len("caribbean_"):-len("_descriptions_paragraphs.json")].replace('_', ' ')
        tqdm.write(f"Skipping {species!r} ({json_file}): {err!r}")
        continue

    # Join the paragraphs for the species into a single text string
    text = " ".join(caribbean_species_paragraph[species])

    # Replace spaces in the species name with underscores
    folder_species = species.replace(' ', '_')

    # Make sure the output directory for this species exists
    os.makedirs(F"{folder_prompts}{folder_species}", exist_ok=True)

    # Loop over each trait and trait options in the caribbean_traits_dict dictionary
    for trait, trait_options in (pbar := tqdm(caribbean_traits_dict.items(), leave=False, position=0)):
        pbar.set_description(f"{idx}: {species}")

        # Combine the words in the trait name with capital letters and use this as the file name
        file_name = combine_words_with_capital(trait)
        # Skip traits that already have a saved result, before doing any
        # further work (allows resuming after an OpenAI outage)
        if os.path.exists(F"{folder_prompts}{folder_species}/{file_name}.json"):
            continue

        # Create the question and options for the ChatGPT prompt
        question = F"Which of the following values correctly describe(s) the '{trait}' trait mentioned in the text? Fill in a '1', if the trait value is likely to occur based on the text, '0' if it is unlikely to occur and 'NA' if there is no information that can be used to infer the occurrence of the trait."

        # Build the example answer from at most the first two options, so that
        # traits with a single option do not raise an IndexError. Each pair is
        # a quoted, well-formed tuple, in the same format the manual test
        # prompt in this notebook uses.
        example_options = trait_options[:2]
        example_scored = ", ".join(f"('{option}', '{random.choice(choices)}')" for option in example_options)
        example_unknown = ", ".join(f"('{option}', 'NA')" for option in example_options)
        # Named format_hint so the builtin format() is not shadowed
        format_hint = F"\nPlease ONLY return a Python list of tuples with ALL the options/value combinations from the list with options, like this: [{example_scored}] or in case there is not information: [{example_unknown}]"
        options = F"\nThe possible values for this trait are: {trait_options}."

        user_content = F"{question} {options} {format_hint}"
        # print(user_content)

        # The API call and the save step are intentionally left disabled here.
        # Create the messages to send to the ChatGPT API
        # messages = [
        #     {"role": "assistant", "content": text},
        #     {"role": "user", "content": user_content}
        #     ]
        # # Call the ChatGPT API to generate a completion for the prompt
        # completion = openai.ChatCompletion.create(
        #     model = "gpt-3.5-turbo",
        #     messages = messages,
        # )


        # # Save the completion to a JSON file with the file name in the species directory
        # with open(F"{folder_prompts}{folder_species}/{file_name}.json", 'w') as fp:
        #     json.dump(completion, fp)
            


                                                              

### Testing

In [33]:
# Display the species name currently bound by the processing loop.
species

'Avicennia germinans'

In [34]:
# Display the joined description text for that species.
text

'In contrast, trees in Texas and Louisiana were often less than one meter tall. The heartwood is dark-brown to black, while the sapwood is yellow-brown. The leaves often appear whitish from the salt excreted at night and on cloudy days. Leaves are simple and opposite and grow from 2 to 3 inches long. The leaf is oval and pointed, and the margins are entire. The leaves appear smooth, thick, and leathery with a dark green topside and grey to white underside. When the tree is young the bark is smooth and as it matures the bark takes on a thick and fissured texture. The flowers appear at the ends of the branches and are small, white, and fragrant with yellow centers. 9 Altmetric Avicennia germinans (Black Mangrove) is a medium-sized, evergreen shrub or tree with a rounded, densely foliated crown of spreading branches. It bears thick, elliptic, simple and opposite leaves, 2-3 in. long (5-7 cm). Smooth and leathery, they are dark green above and silver-white underneath. Blooming year-round, 

In [169]:
# Pick a single trait by its position in the trait dictionary for a manual
# prompt test. The index is named so it is easy to change.
TEST_TRAIT_INDEX = 22

trait = list(caribbean_traits_dict.keys())[TEST_TRAIT_INDEX]
trait_options = caribbean_traits_dict[trait]

In [170]:
# Display the selected trait name.
trait

'Aril colour'

In [171]:
# Display the list of possible values for the selected trait.
trait_options

['NA?', 'orange', 'red', 'white', 'yellow-geen']

In [172]:
# Placeholder scores, used only to fill in the example answer shown in the prompt
choices = [
    1, 0,
]

question = F"Which of the following values correctly describe(s) the '{trait}' trait mentioned in the text? Fill in a '1', if the trait value is likely to occur based on the text, '0' if it is unlikely to occur and 'NA' if there is no information that can be used to infer the occurrence of the trait."

# Build both example answers from at most the first two options, so that a
# trait with a single option does not raise an IndexError. For traits with two
# or more options the resulting text is identical to the previous prompt.
example_options = trait_options[:2]
example_scored = ", ".join(f"('{option}', '{random.choice(choices)}')" for option in example_options)
example_unknown = ", ".join(f"('{option}', 'NA')" for option in example_options)

# Named format_hint so the builtin format() is not shadowed for the rest of
# the kernel session.
format_hint = F"\nPlease ONLY return a Python list of tuples with ALL the options/value combinations from the list with options, like this: [{example_scored}] or in case there is not information: [{example_unknown}]"
options = F"\nThe possible values for this trait are: {trait_options}."

user_content = F"{question} {options} {format_hint}"

In [173]:
# Print the description text as plain text (no quoting or escaping).
print(text)

In contrast, trees in Texas and Louisiana were often less than one meter tall. The heartwood is dark-brown to black, while the sapwood is yellow-brown. The leaves often appear whitish from the salt excreted at night and on cloudy days. Leaves are simple and opposite and grow from 2 to 3 inches long. The leaf is oval and pointed, and the margins are entire. The leaves appear smooth, thick, and leathery with a dark green topside and grey to white underside. When the tree is young the bark is smooth and as it matures the bark takes on a thick and fissured texture. The flowers appear at the ends of the branches and are small, white, and fragrant with yellow centers. 9 Altmetric Avicennia germinans (Black Mangrove) is a medium-sized, evergreen shrub or tree with a rounded, densely foliated crown of spreading branches. It bears thick, elliptic, simple and opposite leaves, 2-3 in. long (5-7 cm). Smooth and leathery, they are dark green above and silver-white underneath. Blooming year-round, s

In [174]:
# Preview what will be sent: the description text under a "text:" label,
# followed by the user prompt on the next line.
print(
    F"""text:
{text}
""",
    user_content,
    sep="\n",
)

text:
In contrast, trees in Texas and Louisiana were often less than one meter tall. The heartwood is dark-brown to black, while the sapwood is yellow-brown. The leaves often appear whitish from the salt excreted at night and on cloudy days. Leaves are simple and opposite and grow from 2 to 3 inches long. The leaf is oval and pointed, and the margins are entire. The leaves appear smooth, thick, and leathery with a dark green topside and grey to white underside. When the tree is young the bark is smooth and as it matures the bark takes on a thick and fissured texture. The flowers appear at the ends of the branches and are small, white, and fragrant with yellow centers. 9 Altmetric Avicennia germinans (Black Mangrove) is a medium-sized, evergreen shrub or tree with a rounded, densely foliated crown of spreading branches. It bears thick, elliptic, simple and opposite leaves, 2-3 in. long (5-7 cm). Smooth and leathery, they are dark green above and silver-white underneath. Blooming year-ro

In [175]:
# Assemble the conversation: the description text is supplied as an
# assistant message, followed by the user's trait question.
messages = [
    dict(role="assistant", content=text),
    dict(role="user", content=user_content),
]
# Request a chat completion for this conversation from gpt-3.5-turbo
completion = openai.ChatCompletion.create(messages=messages, model="gpt-3.5-turbo")

In [176]:
# Display the raw API response object returned by the call above.
completion

<OpenAIObject chat.completion id=chatcmpl-6wYbIrdUPWFLMMxVP3o0iu3ck6MYH at 0x7fbe20d3cd10> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "\n\n[('NA?', 'NA'), ('orange', 'NA'), ('red', '0'), ('white', 'NA'), ('yellow-green', 'NA')]",
        "role": "assistant"
      }
    }
  ],
  "created": 1679414048,
  "id": "chatcmpl-6wYbIrdUPWFLMMxVP3o0iu3ck6MYH",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 33,
    "prompt_tokens": 1185,
    "total_tokens": 1218
  }
}