In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import glob
import json
import csv
import collections

In [2]:
def create_clean_paragraphs(input_dict):
    """Collapse each key's sentences into one de-duplicated paragraph string.

    Parameters
    ----------
    input_dict : dict
        Mapping whose values are iterables of strings (sentences).

    Returns
    -------
    dict
        A new dictionary with the same keys. Each value is the unique
        sentences of the corresponding input value joined with single
        spaces, kept in order of first appearance. The input is not modified.
    """
    # dict.fromkeys drops repeated sentences while keeping first-seen order.
    # A plain set() would iterate the strings in hash-randomised order, so the
    # resulting paragraph would be shuffled and differ between kernel runs.
    return {
        key: ' '.join(dict.fromkeys(sentences))
        for key, sentences in input_dict.items()
    }

In [3]:
# Collect the response files in sorted order. glob.glob returns paths in an
# arbitrary, filesystem-dependent order, which would make response_list[0],
# survey_list[0] and the row order of the concatenated frames vary by machine.
folder = "../../../data/surveys/responses/"
response_list = sorted(glob.glob(F"{folder}*"))
if not response_list:
    # Fail with a clear message instead of an IndexError further down
    raise FileNotFoundError(F"No response files found in {folder}")

# Build each survey path from the tail of the matching response path.
# NOTE(review): the hard-coded offset 59 is larger than the 32-character
# responses folder prefix above, so it presumably also strips a leading part
# of every response file name (or was tuned for a different path) - confirm
# against the actual file names before changing it.
folder = "../../../data/surveys/surveys/"
survey_list = [F"{folder}{lst[59:]}" for lst in response_list]

# First survey / response pair on its own (the full sets are built below)
df_survey = pd.read_csv(survey_list[0], sep='\t')
df_survey = df_survey.rename(columns={df_survey.columns[0]: "Index"})
# Response files are read without a header; the first column is dropped and
# the table is transposed, after which the first two columns are named.
df_response = pd.read_csv(response_list[0], header=None).iloc[:, 1:].T
df_response = df_response.rename(columns={df_response.columns[0]: "Sentence",
                                          df_response.columns[1]: "Result"})


# Surveys: all tab-separated survey files stacked with a fresh index
df_surveys = pd.concat((pd.read_csv(f, sep='\t') for f in survey_list), ignore_index=True)
df_surveys = df_surveys.rename(columns={df_surveys.columns[0]: "Index"})

# Responses: every file reshaped like df_response above, then stacked and
# de-duplicated
df_responses = pd.concat((pd.read_csv(f, header=None).iloc[:, 1:].T for f in response_list))
df_responses = df_responses.rename(columns={df_responses.columns[0]: "Sentence",
                                            df_responses.columns[1]: "Result"})
df_responses = df_responses.drop_duplicates()

# Melt: turn the sentence columns "1".."5" into one "Sentence" column, then
# drop rows with missing values and exact duplicate rows
df_melt = pd.melt(df_surveys, id_vars=["Species",
                                       "Main Trait",
                                       "SIM",
                                       "Dataset"],
                             value_vars=["1", "2", "3", "4", "5"],
                             value_name="Sentence")
df_melt = df_melt.dropna()
df_melt = df_melt.drop_duplicates()

# Merge the dataframes based on the 'Sentence' column; the "variable" column
# created by melt (the original column label) is dropped afterwards
df = pd.merge(df_melt, df_responses, on='Sentence')
df = df.drop(columns=["variable"])

In [4]:
# Snippet directories (paragraph- and sentence-level, going by the folder names)
paragraph_folder = "../../../data/OpenAI/DescriptionSnippets/Paragraphs/"
sentence_folder = "../../../data/OpenAI/DescriptionSnippets/Sentences/"

# Every file whose name starts with "c" in each directory, as sorted lists
caribbean_jsons_paras = sorted(glob.glob(f"{paragraph_folder}c*"))
caribbean_jsons_sents = sorted(glob.glob(f"{sentence_folder}c*"))

In [38]:
# Mapping filled from every Caribbean paragraph JSON file (the displayed
# output shows species names as keys and lists of sentences as values).
# NOTE(review): dict.update replaces the value of any key that occurs in more
# than one file, and the list default factory is never triggered by update -
# confirm that overwriting (rather than extending the lists) is intended.
caribbean_species_text_dict = collections.defaultdict(list)

for json_file in caribbean_jsons_paras:
    with open(json_file, 'r') as json_handle:
        caribbean_species_paragraph = json.load(json_handle)
    # The file is closed at this point; merge its entries into the mapping
    caribbean_species_text_dict.update(caribbean_species_paragraph)

In [37]:
# Distinct species among the rows whose Dataset is "Caribbean"
df.loc[df["Dataset"] == "Caribbean", "Species"].unique()

array(['Guaiacum sanctum', 'Bursera simaruba', 'Crossopetalum rhacoma',
       'Randia aculeata', 'Vachellia tortuosa',
       'Pithecellobium unguis-cati', 'Libidibia coriaria',
       'Guaiacum officinale', 'Avicennia germinans',
       'Schoepfia schreberi', 'Trichilia trifolia',
       'Hippomane mancinella', 'Bourreria succulenta',
       'Handroanthus billbergii', 'Coccoloba uvifera',
       'Rhizophora mangle', 'Clusia rosea', 'Conocarpus erectus',
       'Coccoloba swartzii', 'Laguncularia racemosa', 'Jacquinia arborea',
       'Casearia tremula', 'Bursera tomentosa', 'Vitex cymosa',
       'Cynophalla flexuosa', 'Vitex compressa',
       'Zanthoxylum monophyllum'], dtype=object)

In [40]:
# All merged survey/response rows for one species
df.loc[df["Species"] == "Bourreria succulenta"]

Unnamed: 0,Species,Main Trait,SIM,Dataset,Sentence,Result
194,Bourreria succulenta,Fruit type,Jacc,Caribbean,The fruit is a.berry that turns orange-red at ...,Can infer correct Value
195,Bourreria succulenta,Fruit type,Jacc,Caribbean,The fruit is a.berry that turns orange-red at ...,Can infer correct Entity
196,Bourreria succulenta,Fruit shape,Jacc,Caribbean,The fruit is a.berry that turns orange-red at ...,Can infer correct Value
197,Bourreria succulenta,Fruit shape,Jacc,Caribbean,The fruit is a.berry that turns orange-red at ...,Can infer correct Entity
198,Bourreria succulenta,Fruit colour,Bert,Caribbean,The fruit is a.berry that turns orange-red at ...,Can infer correct Value
...,...,...,...,...,...,...
918,Bourreria succulenta,Petals / corolla number,Jacc,Caribbean,The 5 stamens are fused to the corolla and are...,Can infer correct Entity
919,Bourreria succulenta,Petals / corolla colour,Jacc,Caribbean,The 5 stamens are fused to the corolla and are...,Can infer correct Value
920,Bourreria succulenta,Petals / corolla colour,Jacc,Caribbean,The 5 stamens are fused to the corolla and are...,Can infer correct Entity
1120,Bourreria succulenta,Stamen shape,Bert,Caribbean,"Calyx green, bell-shaped, 4-5 mm long, with 5 ...",Can infer correct Quality


In [30]:
# Display the whole loaded mapping. NOTE(review): this prints every entry in
# full; consider showing only a few keys to keep the notebook output small.
caribbean_species_text_dict

defaultdict(list,
            {'Amyris ignea': ['Front.',
              'Microb.',
              'In small scales, some families can show distinct distribution patterns, as that found for Lecythidaceae in French Guyana [32].',
              'Microb.',
              'Microb.',
              'Opin.',
              'Front.',
              'Hop flowers are densely covered by glandular trichomes, specialized structures that secrete secondary metabolites into epidermal outgrowths3.',
              'Mol. Cell.',
              'Microb.',
              'Front.',
              '4 Altmetric'],
             'Avicennia germinans': ['In contrast, trees in Texas and Louisiana were often less than one meter tall.',
              'The heartwood is dark-brown to black, while the sapwood is yellow-brown.',
              'The leaves often appear whitish from the salt excreted at night and on cloudy days.',
              'Leaves are simple and opposite and grow from 2 to 3 inches long. The leaf is oval and