In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import glob
import csv

# Read Data

In [2]:
# Collect survey CSV files.
# Each response file found in RESPONSE_DIR is mapped to a survey file in
# SURVEY_DIR whose name is the response's file name minus its first
# RESPONSE_PREFIX_LEN characters (presumably an export prefix - TODO confirm).
RESPONSE_DIR = "../../../data/surveys/responses/"
SURVEY_DIR = "../../../data/surveys/surveys/"
# The original code sliced each path with a hard-coded [59:], i.e. the
# 32-character directory string plus 27 characters of file-name prefix.
RESPONSE_PREFIX_LEN = 27

# glob yields entries in arbitrary, OS-dependent order; sorting makes
# response_list[0] / survey_list[0] and the final row order reproducible.
response_list = sorted(glob.glob(F"{RESPONSE_DIR}*"))

# Derive the offset from the directory string so that moving the data
# folder does not silently shift the slice.
survey_list = [
    F"{SURVEY_DIR}{path[len(RESPONSE_DIR) + RESPONSE_PREFIX_LEN:]}"
    for path in response_list
]

In [3]:
# Surveys: read every tab-separated survey file, stack the frames with a
# fresh 0..n-1 index, and label the leading column "Index".
survey_frames = [pd.read_csv(path, sep='\t') for path in survey_list]
df_surveys = pd.concat(survey_frames, ignore_index=True)
df_surveys = df_surveys.rename(columns={df_surveys.columns[0]: "Index"})

# Responses: read every file without a header row, discard its first
# column, and transpose so the file's rows become columns. The first two
# resulting columns are then named "Sentence" and "Result".
response_frames = [
    pd.read_csv(path, header=None).iloc[:, 1:].T
    for path in response_list
]
df_responses = pd.concat(response_frames)
response_column_names = {
    df_responses.columns[0]: "Sentence",
    df_responses.columns[1]: "Result",
}
df_responses = df_responses.rename(columns=response_column_names)

## Testing

In [41]:
# Build the combined survey/response table, one file pair at a time.
#
# The per-file frames are collected in a list and concatenated once.
# Accumulating into `df` inside a bare try/except would hide genuine errors
# and, whenever `df` already exists in the kernel (e.g. on a re-run), append
# the new rows onto the stale frame instead of starting fresh.
merged_frames = []

for survey, response in zip(survey_list, response_list):

    # Survey (tab-separated, wide format with sentence columns "1".."5")
    df_survey = pd.read_csv(survey, sep='\t')

    # Response: no header row; drop the first column and transpose so the
    # two remaining rows become the Sentence / Result columns.
    df_response = pd.read_csv(response, header=None).iloc[:, 1:].T
    df_response.columns = ['Sentence', 'Result']

    # Wide -> long: one row per sentence; rows with missing values are
    # dropped and the helper "variable" column is discarded.
    df_survey = (
        pd.melt(
            df_survey,
            id_vars=["Species", "Main Trait", "SIM", "Dataset"],
            value_vars=["1", "2", "3", "4", "5"],
            value_name="Sentence",
        )
        .dropna()
        .drop(columns=["variable"])
    )

    df_merged = df_survey.merge(df_response, on='Sentence')
    merged_frames.append(df_merged)

if merged_frames:
    df = pd.concat(merged_frames, ignore_index=True, sort=False)
else:
    # No file pairs found: expose an empty table with the expected columns
    # rather than leaving `df` undefined or pointing at stale data.
    df = pd.DataFrame(
        columns=["Species", "Main Trait", "SIM", "Dataset", "Sentence", "Result"]
    )

Unnamed: 0,Species,Main Trait,SIM,Dataset,Sentence,Result
0,Gardenia sokotensis,flower color,Jacc,PlantNet,Flowers white Provided by:.,Can infer correct Value
1,Lannea fruticosa,leaf apex,Bert,PlantNet,"Calyx-lobes ovate, 1 to 1.5 mm.",None of the above
2,Xeroderris stuhlmannii,fruit,Jacc,PlantNet,"Pod 6-25 cm, thickened over the seeds, hairles...",Can infer correct Value
3,Strychnos spinosa,petiole,Bert,PlantNet,"Leaves elliptic, ovate to almost circular, 1.5...",Can infer correct Value
4,Keetia venosa,vein,Bert,PlantNet,"Stipules triangular,abruptly tapering into a l...",None of the above
...,...,...,...,...,...,...
1387,Eucalyptus tereticornis,leaf blade,Bert,PlantNet,"Leaves simple and entire, stipules absent, pet...",Can infer correct Value
1388,Maerua angolensis,plant type,Jacc,PlantNet,The tree has a rounded crown and smooth grey b...,Can infer correct Value
1389,Cordia africana,vein,Bert,PlantNet,"Leaflets are thin and smooth, up to 45 x 20 mm.",Can infer correct Entity
1390,Aphania senegalensis,leaf apex,Bert,PlantNet,Flowers with slender pedicels 1 to 2 mm.,None of the above


In [29]:
# Load the first survey file (tab-separated) for step-by-step inspection.
first_survey_path = survey_list[0]
df_sur = pd.read_csv(first_survey_path, sep='\t')

In [36]:
# Load the first response file without a header row, discard its first
# column, and transpose; the two resulting columns are named explicitly.
first_response_raw = pd.read_csv(response_list[0], header=None)
df_rep = first_response_raw.iloc[:, 1:].transpose()
df_rep.columns = ['Sentence', 'Result']
df_rep

Unnamed: 0,Sentence,Result
1,Flowers white Provided by:.,Can infer correct Value
2,"Calyx-lobes ovate, 1 to 1.5 mm.",None of the above
3,Petals oblong-elliptic 3 to 4 mm.,None of the above
4,Leaves clustered at the ends of the branchlets...,Can infer correct Entity
5,Young parts with reddish floccose indumentum.,None of the above
...,...,...
65,"Petals yellow, 2 to 3.5 cm.",None of the above
66,"Stamens 10, filaments of 3 lower each with an ...",None of the above
67,Young branchlets rather densely pubescent.,None of the above
68,"Pods cylindrical, 30 to 90 cm.",None of the above


In [33]:
# Melt: reshape the wide survey table into long form so that each row
# carries a single sentence next to its identifying columns.
id_columns = ["Species", "Main Trait", "SIM", "Dataset"]
sentence_columns = ["1", "2", "3", "4", "5"]

df_sur_long = df_sur.melt(
    id_vars=id_columns,
    value_vars=sentence_columns,
    value_name="Sentence",
)
# Remove rows containing missing values, then the melt helper column.
df_sur = df_sur_long.dropna().drop(columns=["variable"])

df_sur

Unnamed: 0,Species,Main Trait,SIM,Dataset,Sentence
0,Gardenia sokotensis,flower color,Jacc,PlantNet,Flowers white Provided by:.
1,Lannea fruticosa,leaf apex,Bert,PlantNet,"Calyx-lobes ovate, 1 to 1.5 mm."
2,Xeroderris stuhlmannii,fruit,Jacc,PlantNet,"Pod 6-25 cm, thickened over the seeds, hairles..."
3,Strychnos spinosa,petiole,Bert,PlantNet,"Leaves elliptic, ovate to almost circular, 1.5..."
4,Keetia venosa,vein,Bert,PlantNet,"Stipules triangular,abruptly tapering into a l..."
...,...,...,...,...,...
93,Flacourtia indica,leaf base,Jacc,PlantNet,"Disk lobulate, clasping the base of the ovoid ..."
95,Swartzia madagascariensis,leaf apex,Bert,PlantNet,"Stamens probably 50 to 60, exceeded by the pet..."
96,Grewia flavescens,leaf margin,Bert,PlantNet,"Leaves broadlyelliptic, ovate, oblong, obovate..."
97,Hymenocardia acida,leaf margin,Bert,PlantNet,Petioles 0.2 to 1.6 cm long.


In [39]:
# Join the long-form survey rows with the responses on the sentence text.
df_merg = pd.merge(df_sur, df_rep, on='Sentence')
df_merg

Unnamed: 0,Species,Main Trait,SIM,Dataset,Sentence,Result
0,Gardenia sokotensis,flower color,Jacc,PlantNet,Flowers white Provided by:.,Can infer correct Value
1,Lannea fruticosa,leaf apex,Bert,PlantNet,"Calyx-lobes ovate, 1 to 1.5 mm.",None of the above
2,Xeroderris stuhlmannii,fruit,Jacc,PlantNet,"Pod 6-25 cm, thickened over the seeds, hairles...",Can infer correct Value
3,Strychnos spinosa,petiole,Bert,PlantNet,"Leaves elliptic, ovate to almost circular, 1.5...",Can infer correct Value
4,Keetia venosa,vein,Bert,PlantNet,"Stipules triangular,abruptly tapering into a l...",None of the above
...,...,...,...,...,...,...
64,Flacourtia indica,leaf base,Jacc,PlantNet,"Disk lobulate, clasping the base of the ovoid ...",None of the above
65,Swartzia madagascariensis,leaf apex,Bert,PlantNet,"Stamens probably 50 to 60, exceeded by the pet...",None of the above
66,Grewia flavescens,leaf margin,Bert,PlantNet,"Leaves broadlyelliptic, ovate, oblong, obovate...",Can infer correct Quality
67,Hymenocardia acida,leaf margin,Bert,PlantNet,Petioles 0.2 to 1.6 cm long.,None of the above


In [40]:
# Print the recorded answer of every merged row, one per line.
for result in df_merg.loc[:, 'Result']:
    print(result)

Can infer correct Value
None of the above
Can infer correct Value
Can infer correct Value
None of the above
Can infer correct Value
Can infer correct Entity
None of the above
Can infer correct Value
Can infer correct Entity
Can infer correct Value
Can infer correct Entity
Can infer correct Entity
None of the above
None of the above
Can infer correct Entity
Can infer correct Entity
None of the above
Can infer correct Entity
Can infer correct Value
None of the above
Can infer correct Entity
Can infer correct Entity
Can infer correct Value
Can infer correct Entity
Can infer correct Entity
Can infer correct Value
Can infer correct Entity
Can infer correct Entity
None of the above
None of the above
None of the above
None of the above
None of the above
Can infer correct Entity
None of the above
None of the above
Can infer correct Value
Can infer correct Value
Can infer correct Entity
None of the above
None of the above
None of the above
None of the above
Can infer correct Value
Can infer cor

## Before (previous approach: align surveys and responses by sorted sentence)

In [4]:
# Melt: turn the wide survey table (sentence columns "1".."5") into long
# form with one sentence per row.
survey_id_columns = ["Species", "Main Trait", "SIM", "Dataset"]
survey_sentence_columns = ["1", "2", "3", "4", "5"]

df_surveys = (
    df_surveys
    .melt(
        id_vars=survey_id_columns,
        value_vars=survey_sentence_columns,
        value_name="Sentence",
    )
    .dropna()                      # discard rows with missing values
    .drop(columns=["variable"])    # helper column created by melt
)

In [5]:
# Attach each response to its survey row by sorting both tables on
# 'Sentence' and pairing them position by position. (An inner pd.merge on
# 'Sentence' was tried here previously and left disabled.)
#
# Positional pairing is only valid when both tables hold exactly the same
# sentences, so this is verified explicitly: otherwise a single missing or
# differing sentence would silently shift every following Result onto the
# wrong row.
# mergesort is stable, so ties between equal sentences keep a
# deterministic order from run to run.
surveys_sorted = df_surveys.sort_values(by=['Sentence'], kind='mergesort')
responses_sorted = df_responses.sort_values(by=['Sentence'], kind='mergesort')

survey_sentences = surveys_sorted['Sentence'].values
response_sentences = responses_sorted['Sentence'].values

if len(survey_sentences) != len(response_sentences):
    raise ValueError(
        F"Cannot align surveys and responses: {len(survey_sentences)} survey "
        F"sentences vs {len(response_sentences)} response sentences."
    )

mismatched = survey_sentences != response_sentences
if mismatched.any():
    raise ValueError(
        F"Cannot align surveys and responses: {int(mismatched.sum())} sentences "
        "differ between the two sorted tables."
    )

df = surveys_sorted
df["Result"] = responses_sorted["Result"].values


In [12]:
# Keep only the first occurrence of every fully identical row.
is_repeated_row = df.duplicated(keep="first")
df = df[~is_repeated_row]

In [13]:
# Write the combined survey/result table to the surveys data folder.
folder = "../../../data/surveys/"
output_path = F"{folder}df_survey_result.csv"
df.to_csv(output_path)