In [1]:
import torch
import re
import os
import json
import pickle
import openai
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
import ast
from dotenv import load_dotenv
import time

In [2]:
# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()
# Take the OpenAI key from the environment rather than hardcoding it in the notebook;
# this raises KeyError if 'OPENAI_API' is not set.
openai.api_key = os.environ['OPENAI_API']

In [3]:
def extract_present_traits(species, df):
    """Collect the traits marked as present (cell value 1) for one species.

    Parameters
    ----------
    species
        Row label looked up in ``df`` via ``df.loc``.
    df
        DataFrame whose column labels are unpacked as ``(entity, value)`` pairs.

    Returns
    -------
    tuple
        ``(GT_traits, empty_traits)`` where ``GT_traits`` is the list of column
        labels whose cell equals 1, and ``empty_traits`` holds one
        ``"<entity>:"`` string per such label (repeated entities are kept).
    """
    row = df.loc[species]

    # Blank out every cell that is not exactly 1, then discard the blanks.
    is_present = row == 1
    present_only = row.where(is_present).dropna()
    GT_traits = list(present_only.index)

    # One "<entity>:" stub per present trait; the value half of the label is unused.
    empty_traits = []
    for entity, _value in GT_traits:
        empty_traits.append(F"{entity}:")

    return GT_traits, empty_traits

def create_trait_dict(multi_index):
    """Group second-level labels under their first-level label.

    Parameters
    ----------
    multi_index
        Iterable of indexable entries; element 0 of each entry is used as the
        key and element 1 as the value.

    Returns
    -------
    dict
        Maps each key to the list of its values, in the order encountered.
        Keys appear in first-seen order and repeated values are kept.
    """
    grouped = {}
    for entry in multi_index:
        # setdefault creates the list the first time a key is seen.
        grouped.setdefault(entry[0], []).append(entry[1])
    return grouped

def combine_words_with_capital(string):
    """Turn a free-text label into a single capitalized-words token.

    Characters other than word characters, whitespace and "/" are removed.
    The remainder is split into words on whitespace and on "/", each word is
    passed through ``str.capitalize`` (first letter upper, rest lower), and
    the words are concatenated without a separator.
    """
    cleaned = re.sub(r'[^\w\s/]', '', string)
    # A slash separates words exactly like whitespace does, so fold it into
    # whitespace and split once.
    words = cleaned.replace('/', ' ').split()
    return ''.join(map(str.capitalize, words))

### Trait Dicts

In [4]:
folder_traits = "../../../data/OpenAI/Traits/"

# Load the trait dictionary from Andrei.json once.
# NOTE(review): this load used to be repeated three times with the same file and
# the same target variable, which looks like an unfinished copy-paste. If other
# trait files were meant to be loaded too, add them here under their own names.
with open(F"{folder_traits}Andrei.json", 'r') as f:
    caribbean_traits_dict = json.load(f)

### DataFrames

In [5]:
root = "../../../data/OpenAI/DataFrames/"

# Each CSV is read with its first two rows as a two-level column header and its
# first column as the row index, which is then named 'Species'.

# --- Andrei ---
file = "DF_Andrei.csv"
df_Andrei = (
    pd.read_csv(root + file, header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
)
df_Andrei_species = list(df_Andrei.index)


# --- Daniel (rows containing any missing value are dropped) ---
file = "DF_Daniel.csv"
df_Daniel = (
    pd.read_csv(root + file, header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
    .dropna()
)
df_Daniel_species = list(df_Daniel.index)

# --- Pierre ---
file = "DF_Pierre.csv"
df_Pierre = (
    pd.read_csv(root + file, header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
)
df_Pierre_species = list(df_Pierre.index)

In [10]:
# Zero-shot querying: for every (species, trait) pair, ask the chat model which of
# the trait's options apply and store the raw API response as one JSON file per
# pair. Existing result files are skipped, so the cell can be re-run to resume
# after an interruption.

# Define the path to the directory where the prompts and results will be saved
folder_prompts = "../../../data/OpenAI/PromptsResults/ZeroShot/"

species_lst = df_Andrei.index
# Iterating a DataFrame yields its column labels, so passing the columns
# explicitly is equivalent and matches the helper's parameter name.
trait_dict = create_trait_dict(df_Andrei.columns)
traits = list(trait_dict.keys())

# Loop over species (raise the slice start to resume from a later species)
for idx, species in enumerate(species_lst[0:]):

    # Replace spaces in the species name with underscores
    folder_species = species.replace(' ', '_')
    species_dir = F"{folder_prompts}{folder_species}"

    # Create the directory for this species; a directory left by an earlier run is fine
    os.makedirs(species_dir, exist_ok=True)

    for trait in (pbar := tqdm(traits[0:], leave=False, position=0)):
        pbar.set_description(f"{idx}: {species}")

        # Get the options for the trait
        options = trait_dict[trait]

        # Create the question and options for the ChatGPT prompt.
        # This is built before the cache check on purpose: `user_content` is
        # inspected after the loop and must be defined even when every result
        # file already exists.
        question = F"For the species '{species}' which of the following options is applicable for the trait '{trait}'"
        text_helper = F"Return 'NaN' if none of the options are applicable. Please return your answer as a Python list."
        user_content = F"{question}\n{options}\n{text_helper}"

        # Combine the words in the trait name with capital letters and use this as the file name
        file_name = combine_words_with_capital(trait)
        result_path = F"{species_dir}/{file_name}.json"

        # Check if file is already there (OpenAI Outage)
        if os.path.exists(result_path):
            continue

        # Create the messages to send to the ChatGPT API
        messages = [
            {"role": "user", "content": user_content}
        ]
        # Call the ChatGPT API to generate a completion for the prompt
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )

        # Serialize before touching the disk, write to a temporary file, and then
        # swap it into place. A failure or interrupt can therefore never leave a
        # truncated .json behind that the existence check would later treat as done.
        payload = json.dumps(completion)
        tmp_path = F"{result_path}.tmp"
        with open(tmp_path, 'w') as fp:
            fp.write(payload)
        os.replace(tmp_path, result_path)

        # Throttle requests to lower pressure on the API. The response is already
        # saved at this point, so interrupting during the pause loses nothing.
        time.sleep(2)
            

                                                                        

In [None]:
# Show the prompt text most recently assigned to `user_content` by the generation
# loop, as a manual check of the prompt wording.
print(user_content)