In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import glob
import json
import csv

In [13]:
def create_clean_paragraphs(input_dict):
    """Collapse each key's sentences into one de-duplicated paragraph string.

    Parameters
    ----------
    input_dict : dict
        Mapping from a key to an iterable of sentence strings.

    Returns
    -------
    dict
        A new dictionary with the same keys. Each value is the key's distinct
        sentences joined by a single space, in the order each sentence first
        appeared in the input.
    """
    # dict.fromkeys() removes duplicates like set() does, but keeps
    # first-occurrence order. A set of strings iterates in an order that
    # changes between interpreter runs (hash randomization), which made the
    # resulting paragraphs non-reproducible.
    return {
        key: ' '.join(dict.fromkeys(sentences))
        for key, sentences in input_dict.items()
    }

In [4]:
# --- Locate the files -------------------------------------------------------
# glob.glob() returns matches in arbitrary order; sort so that the [0]
# lookups and the concatenation order below are the same on every run.
folder = "../../../data/surveys/responses/"
response_list = sorted(glob.glob(F"{folder}*"))

if not response_list:
    # Fail with a clear message instead of an IndexError further down;
    # the relative path depends on the notebook's working directory.
    raise FileNotFoundError(f"No response files found in {folder!r}")

# Build the survey path that belongs to each response file.
# lst[59:] drops the first 59 characters of the response path: the
# 32-character folder prefix plus 27 leading characters of the file name.
# NOTE(review): presumably response file names carry a fixed-length prefix
# that the survey file names lack -- confirm against the data folder.
folder = "../../../data/surveys/surveys/"
survey_list = [F"{folder}{lst[59:]}" for lst in response_list]

# --- First survey / response pair --------------------------------------------
df_survey = pd.read_csv(survey_list[0], sep='\t')
df_survey = df_survey.rename(columns={df_survey.columns[0]: "Index"})

# Drop the first column, then transpose so the first two resulting columns
# can be labelled "Sentence" and "Result".
df_response = pd.read_csv(response_list[0], header=None).iloc[:, 1:].T
df_response = df_response.rename(columns={df_response.columns[0]: "Sentence",
                                          df_response.columns[1]: "Result"})

# --- All surveys ---------------------------------------------------------------
df_surveys = pd.concat((pd.read_csv(f, sep='\t') for f in survey_list), ignore_index=True)
df_surveys = df_surveys.rename(columns={df_surveys.columns[0]: "Index"})

# --- All responses -------------------------------------------------------------
df_responses = pd.concat((pd.read_csv(f, header=None).iloc[:, 1:].T for f in response_list))
df_responses = df_responses.rename(columns={df_responses.columns[0]: "Sentence",
                                            df_responses.columns[1]: "Result"})

# --- Long format: one row per (survey row, sentence slot) ----------------------
df_melt = pd.melt(df_surveys,
                  id_vars=["Species", "Main Trait", "SIM", "Dataset"],
                  value_vars=["1", "2", "3", "4", "5"],
                  value_name="Sentence")

# Remove rows with missing values, then exact duplicate rows in both frames.
df_melt = df_melt.dropna()
df_responses = df_responses.drop_duplicates()
df_melt = df_melt.drop_duplicates()

# Merge on the 'Sentence' column (pandas default inner join) and drop the
# "variable" column that pd.melt added for the original column label.
df = pd.merge(df_melt, df_responses, on='Sentence')
df = df.drop(columns=["variable"])

In [9]:
# Folders holding the paragraph-level and sentence-level snippet files
sentence_folder = "../../../data/OpenAI/DescriptionSnippets/Sentences/"
paragraph_folder = "../../../data/OpenAI/DescriptionSnippets/Paragraphs/"

# Every entry whose name starts with "c", as sorted lists of paths
caribbean_jsons_paras = sorted(glob.glob(paragraph_folder + "c*"))
caribbean_jsons_sents = sorted(glob.glob(sentence_folder + "c*"))

In [11]:
# Load the first paragraph file (first in sorted order) into a Python object.
json_file = caribbean_jsons_paras[0]
# JSON text is UTF-8; pass the encoding explicitly so the result does not
# depend on the platform's default locale encoding.
with open(json_file, 'r', encoding='utf-8') as f:
    caribbean_species_paragraph = json.load(f)

In [14]:
# Show the cleaned result: per key, duplicate sentences removed and the rest joined into one string.
create_clean_paragraphs(caribbean_species_paragraph)

{'Amyris ignea': 'In small scales, some families can show distinct distribution patterns, as that found for Lecythidaceae in French Guyana [32]. Mol. Cell. Opin. Microb. Front. 4 Altmetric Hop flowers are densely covered by glandular trichomes, specialized structures that secrete secondary metabolites into epidermal outgrowths3.'}