In [66]:
import json
import numpy as np
import pandas as pd
import glob
import re


In [76]:
folder_traits = "../../../data/OpenAI/Traits/"

with open(F"{folder_traits}Andrei.json", 'r') as f:
  caribbean_traits_dict = json.load(f)

In [67]:
folder_prompts = "../../../data/OpenAI/PromptsResults/"

species_folders = glob.glob(F"{folder_prompts}*")
species_folders

['../../../data/OpenAI/PromptsResults/Amyris_ignea',
 '../../../data/OpenAI/PromptsResults/Avicennia_germinans',
 '../../../data/OpenAI/PromptsResults/Bursera_simaruba',
 '../../../data/OpenAI/PromptsResults/Bourreria_succulenta']

In [71]:
avicennia_json_list = glob.glob(F"{species_folders[1]}/*")
avicennia_json_list[0:4]

['../../../data/OpenAI/PromptsResults/Avicennia_germinans/PetalsCorollaShape.json',
 '../../../data/OpenAI/PromptsResults/Avicennia_germinans/PetalsCorollaColour.json',
 '../../../data/OpenAI/PromptsResults/Avicennia_germinans/SepalsCalyxShape.json',
 '../../../data/OpenAI/PromptsResults/Avicennia_germinans/LeafPosition.json']

In [72]:
prompt_results_dict = {}

for json_file in avicennia_json_list:

    # Trait name
    trait = json_file[56:-5]

    # Try to read in the JSON file
    with open(json_file, 'r') as f:
        prompt_result = json.load(f)

    prompt_results_dict[trait] = prompt_result['choices'][0]['message']['content']

In [74]:
def string_to_list(input_dict):
    """
    Converts values in a dictionary from string or list of string format to list format.
    If the value is a string, it will check if it matches the pattern "Value: <value>" or "Values: [<value1>, <value2>, ...]".
    If it matches, it will convert the value to a list with one or more items.
    If it does not match, it will leave the value as a list with one item.

    Args:
        input_dict (dict): The input dictionary.

    Returns:
        dict: A new dictionary with values in list format.
    """
    # Compile regular expression pattern to match "Value: <value>" or "Values: [<value1>, <value2>, ...]" patterns
    pattern = re.compile(r"^Value:\s*(.*)|^Values:\s*\[(.*)\]")

    # Create an empty dictionary to store the converted values
    output_dict = {}

    # Loop through the input dictionary
    for key, value in input_dict.items():

        # Use the regular expression pattern to search for a match in the value
        match = pattern.search(value)

        # If the value matches the pattern
        if match:
            # Get the matched groups (i.e. the values within the parentheses in the regular expression)
            groups = match.groups()

            # If the first group (i.e. the value in "Value: <value>") is not None
            if groups[0]:
                # Convert the value to a list with one item
                output_dict[key] = [groups[0]]

            # If the second group (i.e. the values in "Values: [<value1>, <value2>, ...]") is not None
            elif groups[1]:
                # Split the values by commas and remove any leading or trailing whitespace
                values = [x.strip() for x in groups[1].split(",")]
                # Convert the values to a list
                output_dict[key] = values

        # If the value does not match the pattern
        else:
            # Convert the value to a list with one item
            output_dict[key] = [value]

    # Return the converted dictionary
    return output_dict


def clean_dict_values(input_dict):
    """
    Converts dictionary values to lists where applicable and strips unnecessary information.
    
    If the value of a key in the dictionary is "None of the above.", it will be converted to ['None of the above'].
    If the value of a key in the dictionary is a string containing one or more values separated by commas, 
    it will be converted to a list of those values.
    
    Args:
    input_dict: A dictionary whose values need to be cleaned.
    
    Returns:
    A dictionary with cleaned values.
    """
    
    # Create a regular expression pattern to match "None of the above"
    none_pattern = re.compile(r"^None of the above\.")
    
    # Create an empty dictionary to store cleaned values
    output_dict = {}
    
    # Loop through the keys and values of the input dictionary
    for key, value in input_dict.items():
        
        # Check if the value is "None of the above"
        if none_pattern.match(value):
            output_dict[key] = ['None of the above']
        else:
            # Split the value string by commas and strip whitespace from each value
            value_list = [x.strip() for x in value.split(',')]
            output_dict[key] = value_list
            
    return output_dict


In [77]:
caribbean_traits_dict

{'Life form': ['liana', 'tree'],
 'Leaf position': ['alternate',
  'alternate, opposite',
  'opposite',
  'opposite, whorls of 3',
  'opposite, whorls of 3, alternate'],
 'Leaf composition': ['3 palmate',
  '3-5 palmate',
  '3-5 pinnate',
  '3-5 pinnate, entire',
  '5-11 pinnate',
  '5-9 pinnate',
  'bi-pinnate, 2 leaflets per jug',
  'bi-pinnate, 20-40 leaflets per jug',
  'bi-pinnate, 30-32 leaflets per jug',
  'entire',
  'pinnate, 4-6 leaflets',
  'pinnate, 6-8 leaflets'],
 'Leaf shape': ['elliptic',
  'elliptic, elongate',
  'elliptic, lanceolate',
  'elliptic, obovate',
  'elliptic, ovate',
  'elliptic, ovate, round',
  'elongate',
  'elongate, elliptic, obovate',
  'elongate, obovate',
  'kidney-shaped, circular',
  'lanceolate, elliptic',
  'linear',
  'linear, obovate',
  'obovate',
  'obovate, elliptic',
  'obovate, spathulate',
  'ovate',
  'ovate, circular',
  'ovate, elliptic',
  'ovate, elliptic, elongate',
  'ovate, heart-shaped',
  'spathulate, obovate'],
 'Leaf margin'

In [73]:
prompt_results_dict

{'PetalsCorollaShape': 'tubular',
 'PetalsCorollaColour': "\n\nThe trait 'Petals / corolla colour' is not explicitly mentioned in the text. Therefore, none of the values mentioned apply.",
 'SepalsCalyxShape': 'cup-shaped, deeply 5-lobed.',
 'LeafPosition': 'alternate, opposite',
 'LeafMargin': 'entire',
 'InflorescenceType': 'panicle',
 'SeedColour': 'None of the above. The text does not mention the seed color.',
 'LeafGlands': 'None of the above.',
 'Stipules': 'None of the above. The trait of stipules is not mentioned in the text.',
 'SepalsCalyxNumer': "'4-5' describes the 'Sepals / calyx numer' trait mentioned in the text.",
 'FruitColour': 'green',
 'LeafUpperSide': 'Dark green, shiny, glabrous. The text mentions that the upper side of the leaves are "smooth, thick, and leathery with a dark green topside".',
 'Thornsspines': 'None of the above. Thorns/spines trait is not mentioned in the text.',
 'LifeForm': 'tree',
 'LeafRachis': "None of the above. The trait 'Leaf rachis' is no

In [75]:
newer_dict = clean_dict_values(prompt_results_dict)
newer_dict

{'PetalsCorollaShape': ['tubular'],
 'PetalsCorollaColour': ["The trait 'Petals / corolla colour' is not explicitly mentioned in the text. Therefore",
  'none of the values mentioned apply.'],
 'SepalsCalyxShape': ['cup-shaped', 'deeply 5-lobed.'],
 'LeafPosition': ['alternate', 'opposite'],
 'LeafMargin': ['entire'],
 'InflorescenceType': ['panicle'],
 'SeedColour': ['None of the above'],
 'LeafGlands': ['None of the above'],
 'Stipules': ['None of the above'],
 'SepalsCalyxNumer': ["'4-5' describes the 'Sepals / calyx numer' trait mentioned in the text."],
 'FruitColour': ['green'],
 'LeafUpperSide': ['Dark green',
  'shiny',
  'glabrous. The text mentions that the upper side of the leaves are "smooth',
  'thick',
  'and leathery with a dark green topside".'],
 'Thornsspines': ['None of the above'],
 'LifeForm': ['tree'],
 'LeafRachis': ['None of the above'],
 'LeafShape': ["['elliptic", "obovate']"],
 'FruitShape': ["'Capsule elliptical",
  'flattened',
  '2.5–3 cm long',
  "often s

In [59]:
newer_dict = string_to_list(prompt_results_dict)
newer_dict

{'PetalsCorollaShape': ['None of the above.'],
 'PetalsCorollaColour': ["'dark green above",
  'silvery hairy below',
  "may have salt deposits on their upper surfaces due to salt excretion'"],
 'SepalsCalyxShape': ['None of the above.'],
 'LeafPosition': ["The correct value for the 'Leaf composition' trait mentioned in the text is 'opposite'."],
 'LeafMargin': ["The values that correctly describe the 'Leaf composition' trait mentioned in the text are: 'entire'."],
 'InflorescenceType': ["None of the above. The text does not provide any of the mentioned values as descriptions of the 'Leaf composition' trait."],
 'SeedColour': ["'dark green above'",
  "'silver-white underneath'",
  "'whitish from salt excreted at night and on cloudy days'"],
 'LeafGlands': ["'glands near basis and leaf margin'"],
 'Stipules': ['present'],
 'SepalsCalyxNumer': ["'2-3'"],
 'FruitColour': ["'dark green above",
  'silvery hairy below',
  "may have salt deposits on their upper surfaces due to salt excretion.