In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import glob
import json
import csv
import collections

In [2]:
def create_clean_paragraphs(input_dict):
    """Collapse each key's sentences into one de-duplicated paragraph string.

    Parameters
    ----------
    input_dict : dict
        Maps each key to an iterable of sentence strings.

    Returns
    -------
    dict
        A new dictionary with the same keys; each value is the distinct
        sentences joined by single spaces, in order of first occurrence.
    """
    # dict.fromkeys removes repeats while keeping first-seen order. A set
    # would also remove repeats, but its iteration order is arbitrary and
    # changes between interpreter runs (str hashes are randomized), which
    # would scramble the paragraph and make the output non-reproducible.
    return {
        key: ' '.join(dict.fromkeys(sentences))
        for key, sentences in input_dict.items()
    }

In [3]:
# Load the survey results table and key its rows by the "Species" column.
folder_survey = "../../../data/surveys/"
file = "df_survey_result.csv"
df = pd.read_csv(F"{folder_survey}{file}").set_index("Species")

### Caribbean

In [4]:
paragraph_folder = "../../../data/OpenAI/DescriptionSnippets/Paragraphs/"
sentence_folder = "../../../data/OpenAI/DescriptionSnippets/Sentences/"

# Every path in each folder whose file name starts with "c", in sorted order
# (glob itself makes no ordering promise).
caribbean_jsons_paras = sorted(glob.glob(F"{paragraph_folder}c*"))
caribbean_jsons_sents = sorted(glob.glob(F"{sentence_folder}c*"))

In [5]:
# Map each species name to the description text assembled from its paragraph
# JSON file.
# NOTE(review): the stored values are strings, yet the container is a
# defaultdict(list), so looking up an unknown species yields [] rather than
# "". Kept unchanged because code outside this cell may rely on it.
caribbean_species_text_dict = collections.defaultdict(list)

for json_file in caribbean_jsons_paras:
    with open(json_file, 'r') as f:
        caribbean_species_paragraph = json.load(f)

    try:
        # Only the first top-level key is read: it is taken as the species
        # name and its value as the sequence of strings to join.
        species = list(caribbean_species_paragraph.keys())[0]
        text = " ".join(caribbean_species_paragraph[species])
    except (AttributeError, IndexError, TypeError) as exc:
        # Best-effort load: skip files whose payload is not a non-empty
        # mapping to a sequence of strings, but say which file was skipped
        # instead of hiding it. Unrelated errors (e.g. KeyboardInterrupt)
        # are no longer swallowed.
        print(F"Skipping {json_file}: {exc!r}")
        continue

    caribbean_species_text_dict[species] = text

In [6]:
folder_prompt_results = "../../../data/OpenAI/PromptsAnalysesData/OriginalData/"

# Both tables are read the same way: the first two rows form a two-level
# column header and the first column supplies the row labels.
csv_layout = {"header": [0, 1], "index_col": 0}

df_Andrei_ChatGPT = pd.read_csv(F"{folder_prompt_results}caribbean_df_orig_ChatGPT.csv", **csv_layout)
df_Andrei_GT = pd.read_csv(F"{folder_prompt_results}caribbean_df_orig_GT.csv", **csv_layout)

In [7]:
# For every row label of the ChatGPT table, list the distinct top-level
# column names (level 0 of the two-level header) under which that row has at
# least one NaN cell.
missing_traits_dict = {
    row_label: row_values.index[row_values.isna()].get_level_values(0).unique().tolist()
    for row_label, row_values in df_Andrei_ChatGPT.iterrows()
}

In [8]:
# For every (species, trait) pair listed in missing_traits_dict, collect the
# survey rows recorded for that pair as (species, trait, sentence, result).
survey_result = []

for species, traits in missing_traits_dict.items():
    # The species mask does not depend on the trait, so compute it once.
    is_species = df.index == species
    for trait in traits:
        res = df.loc[is_species & (df["Main Trait"] == trait),
                     ["Sentence", "Result"]].values
        # Looping over an empty result is already a no-op, so no emptiness
        # guard is needed. Calling .any() on this object array would test the
        # truthiness of the cell values rather than whether rows exist, and
        # would silently drop matches whose cells are all falsy.
        for r in res:
            survey_result.append((species, trait, *r))

df_missing = pd.DataFrame(survey_result, columns=['Species', 'Trait', 'Sentence', 'Result'])

In [12]:
# Distinct values of the survey "Result" column, in order of first appearance.
survey_verdicts = df['Result']
survey_verdicts.unique()

array(['Can infer correct Entity', 'None of the above',
       'Can infer correct Value', 'Can infer correct Quality'],
      dtype=object)

In [9]:
# Display the collected survey rows (columns: Species, Trait, Sentence, Result).
df_missing

Unnamed: 0,Species,Trait,Sentence,Result
0,Bourreria succulenta,Leaf glands,"Leaf blades 5-15 x 4-8 cm, elliptic, oblong, o...",Can infer correct Value
1,Bourreria succulenta,Leaf glands,Leaf stems are yellow-green.,None of the above


In [10]:
# Count the non-null "Sentence" entries for each
# (Species, Trait, Result) combination.
group_keys = ['Species', 'Trait', 'Result']
df_missing.groupby(group_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sentence
Species,Trait,Result,Unnamed: 3_level_1
Bourreria succulenta,Leaf glands,Can infer correct Value,1
Bourreria succulenta,Leaf glands,None of the above,1
