In [1]:
import json
import numpy as np
import pandas as pd
import glob
import re
from tqdm import tqdm
import os.path
import collections
from collections import defaultdict


In [18]:
def remove_newlines(d: dict) -> dict:
    """
    Turn every raw string value of a dictionary into a list of strings, modifying the dictionary in place.

    Each value first has its newline characters removed and its surrounding whitespace stripped. If the result is
    exactly 'None of the above.' (with the trailing period) it becomes ['None of the above']. Otherwise leading and
    trailing square brackets are stripped and the text is split on commas, each piece being whitespace-stripped.
    Quote characters are left untouched.
    Args:
        d: Dictionary whose values are strings. It is mutated in place.

    Returns:
        The same dictionary object, with each value replaced by a list of strings.

    """
    for key, raw in list(d.items()):
        flattened = raw.replace('\n', '').strip()
        d[key] = (
            ['None of the above']
            if flattened == 'None of the above.'
            else [piece.strip() for piece in flattened.strip('[]').split(',')]
        )
    return d


def strip_quotes(d: dict) -> dict:
    """
    Build a copy of a dictionary in which surrounding whitespace and single quotes are removed from its strings.

    List values are processed element by element: string elements are cleaned, any other element is kept as is.
    Non-list values are cleaned directly and are therefore expected to be strings.
    Args:
        d: The input dictionary to process. It is not modified.

    Returns:
        A new dictionary with the same keys and the cleaned values.
    """
    def _unquote(text):
        # Whitespace goes first so that quotes hidden behind padding are reached.
        return text.strip().strip("'")

    cleaned = {}
    for key, value in d.items():
        if not isinstance(value, list):
            cleaned[key] = _unquote(value)
            continue
        cleaned[key] = [_unquote(entry) if isinstance(entry, str) else entry for entry in value]
    return cleaned

def extract_name_and_file(path):
    """
    Split a file path into the name of its parent folder and its extension-less file name.

    The whole path is handled with ``os.path`` instead of splitting on a hard-coded '/', so a path without a
    directory part no longer raises ``IndexError``.
    Args:
        path: Path to a file, e.g. "some/dir/Amyris_ignea/LeafShape.json".

    Returns:
        A ``(species_name, name)`` tuple: the last component of the containing directory and the file name without
        its extension. ``species_name`` is an empty string when ``path`` has no directory part.
    """
    directory, filename = os.path.split(path)
    name, _ = os.path.splitext(filename)
    species_name = os.path.basename(directory)
    return species_name, name

def split_on_capitals(string):
    """
    Insert a space before every capital letter of a CamelCase string.

    Args:
        string: The text to split, e.g. "LeafShape".

    Returns:
        The pieces joined with single spaces, e.g. "Leaf Shape". Text in front of the first capital letter is kept
        as its own piece, so "leafShape" gives "leaf Shape"; an empty string gives an empty string.
    """
    # The second alternative can only match at the very start of the string: it keeps a leading run without
    # capitals, which a pattern requiring a capital first would silently drop.
    split_string = re.findall(r'[A-Z][^A-Z]*|[^A-Z]+', string)
    # Join the pieces with spaces
    return ' '.join(split_string)


def clean_text(text):
    """
    Return ``text`` with every slash (plus the whitespace around it) and all remaining whitespace removed.
    """
    slash_with_padding = re.compile(r'\s*/\s*')
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub('', slash_with_padding.sub('', text))



### Open Data

In [3]:
folder_traits = "../../../data/OpenAI/Traits/"

# Load the trait dictionary. The encoding is given explicitly so the file is decoded as UTF-8 regardless of the
# platform's default text encoding.
with open(F"{folder_traits}Andrei.json", 'r', encoding='utf-8') as f:
    caribbean_traits_dict = json.load(f)

In [4]:
# Define the path where the prompt results are stored
folder_prompts = "../../../data/OpenAI/PromptsResults/"

# Get a list of all entries (one per species) in the path
species_folders = glob.glob(F"{folder_prompts}*")

# Nested mapping: species name -> trait name -> list of values parsed from the prompt result
prompt_results_dict = collections.defaultdict(dict)

# Loop through each species folder
for idx, species_folder in enumerate(species_folders):
    # Derive the species name from the folder name itself. Slicing the path at fixed character offsets only works
    # for one exact `folder_prompts` prefix and cuts off names longer than the slice.
    species_name = os.path.basename(species_folder).replace('_', ' ')

    # Get a list of all files in the folder (expected to be the JSON prompt results)
    json_list = glob.glob(F"{species_folder}/*")

    # Loop through each JSON file
    for json_file in (pbar := tqdm(json_list, leave=False, position=0)):
        # Update the progress bar description
        pbar.set_description(f"{idx}: {species_name}")

        # The trait name is the file name without extension; the folder name is not needed here
        _, trait = extract_name_and_file(json_file)
        # Clean up the trait name by splitting on capital letters
        trait = split_on_capitals(trait)

        # Read in the JSON file; decode explicitly as UTF-8 instead of relying on the platform default
        with open(json_file, 'r', encoding='utf-8') as f:
            prompt_result = json.load(f)

        # Wrap the answer text in a one-entry dictionary, the shape the cleaning helpers operate on
        trait_dict = {trait: prompt_result['choices'][0]['message']['content']}
        # Turn the raw answer into a list of values, then drop the quotes around each value
        trait_dict = remove_newlines(trait_dict)
        trait_dict = strip_quotes(trait_dict)

        # Store the cleaned list under the capitalised trait name (first letter upper case, rest lower case)
        trait_name = next(iter(trait_dict))
        trait_name_cap = trait_name.capitalize()
        prompt_results_dict[species_name][trait_name_cap] = trait_dict[trait_name]  # Contains a list


                                                               

In [None]:
# Display the nested mapping built by the loading loop: species name -> trait name -> list of values.
prompt_results_dict

### DataFrames with ground truth (GT)

In [5]:
root = "../../../data/OpenAI/DataFrames/"

# Each CSV has a two-row column header and the species in its first column.
file = "DF_Andrei.csv"
df_Andrei = (
    pd.read_csv(F"{root}{file}", header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
)
df_Andrei_species = list(df_Andrei.index)


# Unlike the other two frames, rows containing missing values are dropped here.
file = "DF_Daniel.csv"
df_Daniel = (
    pd.read_csv(F"{root}{file}", header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
    .dropna()
)
df_Daniel_species = list(df_Daniel.index)

file = "DF_Pierre.csv"
df_Pierre = (
    pd.read_csv(F"{root}{file}", header=[0, 1], index_col=0)
    .rename_axis('Species', axis='index')
)
df_Pierre_species = list(df_Pierre.index)

In [32]:
def extract_GT_traits(species, df):
    """
    List the labels whose entry equals 1 in the row of ``df`` selected by ``species``.

    Args:
        species: Row label passed to ``df.loc``.
        df: DataFrame whose row for ``species`` holds the trait flags.

    Returns:
        A list with the index labels of the selected row whose value is 1, in their original order.
    """
    row = df.loc[species]
    # Mask everything that is not 1, then drop the masked entries so only the flagged labels remain.
    flagged = row.where(row == 1).dropna()
    return list(flagged.index)

def extract_ALL_traits(species, df):
    """
    Group the two-part column labels of ``df`` by their first part, using the row selected by ``species``.

    Args:
        species: Row label passed to ``df.loc``.
        df: DataFrame whose column labels are pairs (each label is unpacked into two parts).

    Returns:
        A dictionary mapping each first-level label to the list of second-level labels found under it, in column
        order.
    """
    labels = df.loc[species].index

    grouped = {}
    for trait, value in labels:
        # setdefault creates the list the first time a trait is seen
        grouped.setdefault(trait, []).append(value)
    return grouped


In [58]:
species = "Amyris ignea"
# Empty frame with the same two-level columns as the ground truth; a row is added for the species below.
df_Andrei_ChatGPT = pd.DataFrame(index=pd.MultiIndex.from_tuples(df_Andrei.columns)).T

# Parsed prompt results for this species. `.get` is used so that looking up an unknown species does not insert an
# empty entry into the defaultdict.
species_results = prompt_results_dict.get(species, {})

ground_truth_traits = extract_ALL_traits(species, df_Andrei)
for trait, value in ground_truth_traits.items():
    # Skip traits without a prompt result. This explicit check replaces a bare `except:` that also hid every
    # unrelated error raised inside the loop.
    if trait not in species_results:
        continue
    ChatGPT_result = species_results[trait]
    # Values returned for this trait that also exist as ground-truth columns
    correct_values = list(set(value) & set(ChatGPT_result))
    for correct_value in correct_values:
        df_Andrei_ChatGPT.loc[species, (trait, correct_value)] = 1


In [57]:
# Display the frame filled in by the comparison cell: cells set to 1 mark (trait, value) columns that were matched.
df_Andrei_ChatGPT

Unnamed: 0_level_0,Life form,Life form,Leaf position,Leaf position,Leaf position,Leaf position,Leaf position,Leaf composition,Leaf composition,Leaf composition,...,Aril colour,Aril colour,Aril colour,Aril colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour,Seed colour
Unnamed: 0_level_1,liana,tree,alternate,"alternate, opposite",opposite,"opposite, whorls of 3","opposite, whorls of 3, alternate",3 palmate,3-5 palmate,3-5 pinnate,...,orange,red,white,yellow-geen,black,brown,green,grey,white,whitish
Amyris ignea,,1.0,,,,,,,,,...,,,,,,,,,,


In [7]:
# Inspect the parsed prompt results (species name -> trait name -> list of values).
prompt_results_dict

defaultdict(dict,
            {'Amyris ignea': {'Petals corolla shape': ['tubular'],
              'Petals corolla colour': ['yellow',
               'white',
               'pink',
               'white',
               'pink',
               'purple'],
              'Sepals calyx shape': ['None of the above'],
              'Leaf position': ['None of the above'],
              'Leaf margin': ['crenate', 'serrate'],
              'Inflorescence type': ['panicle'],
              'Seed colour': ['None of the above'],
              'Leaf glands': ['translucent oil cells'],
              'Stipules': ['None of the above (Information about the Stipules trait was not mentioned in the text.)'],
              'Sepals calyx numer': ['4-5'],
              'Fruit colour': ['green'],
              'Leaf upper side': ['glabrous', 'pubescent'],
              'Thornsspines': ['absent'],
              'Life form': ['tree'],
              'Leaf rachis': ['None of the above'],
              'Leaf shape'

In [None]:
# Preview the first two rows of two top-level trait groups of the ground-truth frame.
df_Andrei[["Life form", "Leaf position"]].head(2)

In [None]:
# Build a frame from the nested results: with orient='index' the outer keys (species) become the rows and the inner
# keys (trait names) become the columns; each cell holds the list stored for that species/trait.
df = pd.DataFrame.from_dict(prompt_results_dict, orient='index')
# df = df.apply(pd.Series.explode)
# Preview two of the trait columns.
df[["Petals corolla shape", "Petals corolla colour"]].head(5)

In [None]:
# Explode the list-valued columns one after another, rebinding `df` each time. Because every explode repeats the
# row for each list element, a species with several multi-valued traits ends up with one row per combination.
for col in df.columns:
    df = df.explode(col)

In [None]:
# Show the rows with exact duplicates removed. The result is only displayed, not assigned, so `df` is unchanged.
df.drop_duplicates()

In [None]:
# List the column labels of `df` (the trait names collected from the prompt results).
df.columns

In [None]:
# One-hot encode `df`, prefixing each indicator column with the name of the column it came from.
# NOTE(review): with prefix_sep='' the column name and the value are joined without any separator — confirm that
# the fused names are intended.
dummies = pd.get_dummies(df, prefix=df.columns, prefix_sep='')
dummies