In [2]:
import json
import numpy as np
import pandas as pd
import glob
import ast
import re
from tqdm import tqdm
import os.path
import collections
from collections import defaultdict


### Open Data

In [19]:
folder_traits = "../../../data/OpenAI/Traits/"

with open(F"{folder_traits}Andrei.json", 'r') as f:
  caribbean_traits_dict = json.load(f)

traits_caribbean = list(caribbean_traits_dict.keys())

In [35]:
def extract_list_from_string(input_string):
    if 'NA' in input_string:
        return 'NA'

    # Clean up the input string by removing double parentheses
    cleaned_string = re.sub(r'\(\(', r'(', input_string)
    cleaned_string = re.sub(r'\)\)', r')', cleaned_string)

    # Find the last occurrence of '[' and ']' to extract the last list
    start_index = cleaned_string.rfind('[')
    end_index = cleaned_string.rfind(']')

    # Extract the substring between the '[' and ']'
    list_string = cleaned_string[start_index + 1:end_index]

    # Remove any newline characters and leading/trailing whitespace
    list_string = list_string.replace('\n', '').strip()

    # Convert the string to a list of tuples
    list_of_tuples = eval(list_string)

    # Return the list of tuples
    return list(list_of_tuples)

def match_trait_list(traits, tuple_lst):

    if len(traits) != len(tuple_lst):
        raise ValueError

    
    result = {trait: pred[2] for trait, pred in zip(traits, tuple_lst)}
    return result 
    

In [40]:
# Define the directory path containing the JSON files
folder_prompts = "../../../data/OpenAI/PromptsResults/ZeroShot/"

# Create a list of all JSON files in the directory
species_json_lst = glob.glob(F"{folder_prompts}*")

# Create a dictionary to store the results
prompt_results_dict = collections.defaultdict(dict)

# Iterate over each JSON file in the list
for idx, species_json in enumerate(species_json_lst):
    
    # Extract the species name from the file name and format it
    species_name = os.path.split(species_json)[1].replace('_', ' ')[0:-5]
    
    # Load the contents of the JSON file
    with open(species_json, 'r') as f:
        prompt_result = json.load(f)

    # Extract the result from the JSON object
    result = res = prompt_result['choices'][0]['message']['content']
    
    # Extract a list of traits from the result
    result_list = extract_list_from_string(result)
    
    # Match the list of traits against a list of Caribbean traits
    result_dict = match_trait_list(traits_caribbean, result_list)

    # Add the species name and matching traits to the dictionary
    prompt_results_dict[species_name] = result_dict


In [41]:
prompt_results_dict

defaultdict(dict,
            {'Avicennia germinans': {'Life form': 'plant',
              'Leaf position': 'opposite',
              'Leaf composition': 'simple',
              'Leaf shape': 'lanceolate',
              'Leaf margin': 'entire',
              'Leaf upper side': 'dark green',
              'Leaf lower side': 'light green',
              'Leaf glands': 'present',
              'Leaf rachis': 'present',
              'Thorns/spines': 'absent',
              'Stipules': 'absent',
              'Inflorescence type': 'spike',
              'Sepals / calyx shape': 'tubular',
              'Sepals / calyx numer': '5',
              'Petals / corolla shape': 'tubular',
              'Petals / corolla number': '5',
              'Petals / corolla colour': 'white',
              'Stamen shape': 'long, slender',
              'Stamen number': '8-10',
              'Fruit type': 'drupe',
              'Fruit shape': 'oval',
              'Fruit colour': 'green',
              'Aril 

### Dataframe with GT

In [5]:
root = "../../../data/OpenAI/DataFrames/"

file = "DF_Andrei.csv"
df_Andrei = pd.read_csv(F"{root}{file}", header=[0, 1], index_col=0)\
    .rename_axis('Species', axis='index')
df_Andrei = df_Andrei.rename(columns={'Thorns/spines': 'Thorns spines', 
                                      'Sepals / calyx shape': 'Sepals calyx shape', 
                                      'Petals / corolla shape': 'Petals corolla shape',
                                      'Petals / corolla number': 'Petals corolla number',
                                      'Petals / corolla colour': 'Petals corolla colour',
                                      'Sepals / calyx numer': 'Sepals calyx numer',})
df_Andrei_species = list(df_Andrei.index)


file = "DF_Daniel.csv"
df_Daniel = pd.read_csv(F"{root}{file}", header=[0, 1], index_col=0)\
    .rename_axis('Species', axis='index')\
    .dropna()
df_Daniel_species = list(df_Daniel.index)

file = "DF_Pierre.csv"
df_Pierre = pd.read_csv(F"{root}{file}", header=[0, 1], index_col=0)\
    .rename_axis('Species', axis='index')
df_Pierre_species = list(df_Pierre.index)

In [6]:
def extract_GT_traits(species, df):

    s = df.loc[species]
    GT_traits = list(s.where(s == 1).dropna().index)

    return GT_traits

def extract_ALL_traits(species, df):

    s = df.loc[species]
    traits = list(s.index)

    # Create a dict
    result = {}
    for key, value in traits:
        if key in result:
            result[key].append(value)
        else:
            result[key] = [value]

    return result


### Caribbean 

In [7]:
# Create an empty DataFrame with a multi-level index based on the columns of another DataFrame
df_Andrei_ChatGPT = pd.DataFrame(index=pd.MultiIndex.from_tuples(df_Andrei.columns)).T

# Loop through each species in a dictionary of ground truth traits
for species in prompt_results_dict.keys():

    # Extract the ground truth traits for the current species from a DataFrame
    ALL_traits = extract_ALL_traits(species, df_Andrei)

    # Loop through each trait and its corresponding value in the ground truth traits
    for trait, value in ALL_traits.items():

        # print(trait, 'Correct')
        ChatGPT_result = prompt_results_dict[species][trait]

        # If NaN
        if ChatGPT_result == 'NA':
            df_Andrei_ChatGPT.loc[species, (trait)] = np.NaN
            continue

        # Find the correct values by finding the intersection between the ground truth values and the ChatGPT result
        correct_values = list(set(value) & set(ChatGPT_result))
        incorrect_values = list(set(value) - set(correct_values))
        # Set a value of 1 in the DataFrame for each correct value for the current species and trait
        for correct_value in correct_values:
            df_Andrei_ChatGPT.loc[species, (trait, correct_value)] = 1
        for incorrect_value in incorrect_values:
            df_Andrei_ChatGPT.loc[species, (trait, incorrect_value)] = 0

folder_prompt_results = "../../../data/OpenAI/PromptsAnalysesData/"

df_Andrei_ChatGPT.to_csv(F"{folder_prompt_results}caribbean_df_orig_ChatGPT.csv")
df_Andrei.loc[df_Andrei_ChatGPT.index].to_csv(F"{folder_prompt_results}caribbean_df_orig_GT.csv")

In [8]:
df_Andrei.loc[df_Andrei_ChatGPT.index]

Unnamed: 0_level_0,Life form,Life form,Leaf position,Leaf position,Leaf position,Leaf position,Leaf position,Leaf composition,Leaf composition,Leaf composition,...,Aril colour,Aril colour,Aril colour,Aril colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour
Unnamed: 0_level_1,liana,tree,alternate,"alternate, opposite",opposite,"opposite, whorls of 3","opposite, whorls of 3, alternate",3 palmate,3-5 palmate,3-5 pinnate,...,orange,red,white,yellow-geen,black,brown,green,grey,white,whitish
Laguncularia racemosa,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Conocarpus erectus,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bursera karsteniana,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Metopium brownei,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Clusia rosea,0,1,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
Quadrella odoratissima,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
Handroanthus billbergii,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
Avicennia germinans,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Maytenus versluysii,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
Cynophalla flexuosa,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [9]:
df_Andrei_ChatGPT

Unnamed: 0_level_0,Life form,Life form,Leaf position,Leaf position,Leaf position,Leaf position,Leaf position,Leaf composition,Leaf composition,Leaf composition,...,Aril colour,Aril colour,Aril colour,Aril colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour
Unnamed: 0_level_1,liana,tree,alternate,"alternate, opposite",opposite,"opposite, whorls of 3","opposite, whorls of 3, alternate",3 palmate,3-5 palmate,3-5 pinnate,...,orange,red,white,yellow-geen,black,brown,green,grey,white,whitish
Laguncularia racemosa,0.0,1.0,0.0,0.0,1.0,0.0,0.0,,,,...,,,,,0.0,1.0,0.0,0.0,0.0,1.0
Conocarpus erectus,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,0.0,1.0,0.0,1.0,0.0,0.0
Bursera karsteniana,0.0,1.0,,,,,,,,,...,,,,,,,,,,
Metopium brownei,0.0,0.0,,,,,,,,,...,,,,,0.0,1.0,0.0,0.0,0.0,0.0
Clusia rosea,0.0,0.0,,,,,,0.0,0.0,0.0,...,,,,,1.0,0.0,1.0,0.0,1.0,0.0
Quadrella odoratissima,0.0,1.0,1.0,0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,
Handroanthus billbergii,0.0,1.0,,,,,,0.0,0.0,0.0,...,,,,,,,,,,
Avicennia germinans,0.0,1.0,,,,,,0.0,0.0,0.0,...,,,,,1.0,1.0,0.0,0.0,1.0,1.0
Maytenus versluysii,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,,,,,0.0,0.0,1.0,0.0,0.0,0.0
Cynophalla flexuosa,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,
